# 2.0 Data Cleaning

In [1]:
# libraries
import os
import sys
import inspect
sys.path.append("../src")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from shared import directories
from shared import filenames
from shared import variables

sys.path.append(directories.ANALYSIS_DIR)
import description
pd.set_option('display.max_rows', 500)


## 2.1 Obtain raw data

In [2]:
df = pd.read_csv(os.path.join(directories.RAW_DATA_DIR,
                              filenames.RAW_FILENAME),
                 encoding="Latin-1", low_memory=False)

## 2.2 Categorical Data

### 2.2.1 Audit

In [3]:
df_cat = df.select_dtypes(include=['object'])
description.describe_qual_df(df_cat)

Unnamed: 0,count,unique,top,freq,missing
field,8315,259,Business,521,63
undergra,4914,241,UC Berkeley,107,3464
mn_sat,3133,68,1400.00,403,5245
tuition,3583,115,26908.00,241,4795
from,8299,269,New York,522,79
zipcode,7314,409,0,355,1064
income,4279,261,55080.00,124,4099
career,8289,367,Finance,202,89


### 2.2.2 Categorical Data Processing Plan
The following variables must be converted to numeric:
1. mn_sat 
2. tuition
3. income 

In terms of missing data:
1. Field and career are 'form filled' and are subject to misspellings and omission. The field_cd and career_c fields will
be used instead.
2. There are no other opportunities for data imputation for the categorical variables.

#### 2.2.2.1 Convert categorical numbers to numeric

In [4]:
df['mn_sat'] = df['mn_sat'].str.replace(",","").astype(float)
df['income'] = df['income'].str.replace(",","").astype(float)
df['tuition'] = df['tuition'].str.replace(",","").astype(float)

## 2.3 Quantitative Data

In [5]:
df_num = df.select_dtypes(include=['int', 'int64', 'float64'])
df_num_desc = description.describe_quant_df(df_num)
df_num_missing = df_num_desc.loc[df_num_desc.missing > 0]
df_num_missing











Unnamed: 0,count,missing,min,25%,mean,50%,75%,max,sd,skew,kurtosis,normality_p,normality
id,8377.0,1,1.0,4.0,8.960248,8.0,13.0,22.0,0.010925,-91.515027,8373.000119,0.0,Reject H0
positin1,6532.0,1846,1.0,4.0,9.295775,9.0,14.0,22.0,0.414475,-1.34947,-0.17893,0.0,Reject H0
pid,8368.0,10,1.0,154.0,283.863767,281.0,408.0,552.0,0.034528,-28.892926,832.801195,0.0,Reject H0
int_corr,8220.0,158,-0.83,-0.02,0.19601,0.21,0.43,0.91,0.136027,-7.074216,48.044538,0.0,Reject H0
age_o,8274.0,104,18.0,24.0,26.364999,26.0,28.0,55.0,0.110722,-8.807398,75.570262,0.0,Reject H0
race_o,8305.0,73,1.0,2.0,2.756653,2.0,4.0,6.0,0.092937,-10.572413,109.775913,0.0,Reject H0
pf_o_att,8289.0,89,0.0,15.0,22.495347,20.0,25.0,100.0,0.102519,-9.547019,89.145569,0.0,Reject H0
pf_o_sin,8289.0,89,0.0,15.0,17.396867,18.37,20.0,60.0,0.102519,-9.547019,89.145569,0.0,Reject H0
pf_o_int,8289.0,89,0.0,17.39,20.270759,20.0,23.81,50.0,0.102519,-9.547019,89.145569,0.0,Reject H0
pf_o_fun,8280.0,98,0.0,15.0,17.459714,18.0,20.0,50.0,0.10752,-9.083041,80.501632,0.0,Reject H0


### 2.3.1 Quantitative Data Processing Plan
1. id: Impute missing id.
2. gender: Recode as Categorical Variables
3. race: Recode as Categorical Variables
4. pid: Impute 10 missing pid values.
5. Change partner to pid and pid to piid for clarity.
6. field_cd: Update field_cd for 'Operations Research' to 8 for Business/Econ/Finance
7. career_c: Address missing data

Encoding: Use descriptive encoding for target variables
1. decision
2. decision_o
3. match

Encoding cross-references
Create label cross reference tables so that they are available for plotting.

#### 2.3.1.1 Missing id
Since each subject has a unique iid, we can impute the missing id by finding the associated id for the same iid in another observation.

In [6]:
iid = df.loc[df['id'].isna()]['iid'].tolist()
id = df.loc[df['iid'] == iid]['id'].unique()
df.loc[(df.id.isna()), 'id'] = id[0]

#### 2.3.2.2  Recode Gender 
Recode gender to "Male" and "Female" for reporting

In [7]:
df['gender'] = np.where(df['gender'] == 0, "Female", "Male")

#### 2.3.2.3 Race Encode

In [8]:
df['race'] = np.where(df['race'] == 1, 'Black',
                     np.where(df['race'] == 2, 'Caucasian',
                             np.where(df['race'] == 3, "Latino",
                                     np.where(df['race'] == 4, "Asian",
                                             np.where(df['race'] == 5, "Native American", "Other")))))

In [9]:
df['race_o'] = np.where(df['race_o'] == 1, 'Black',
                     np.where(df['race_o'] == 2, 'Caucasian',
                             np.where(df['race_o'] == 3, "Latino",
                                     np.where(df['race_o'] == 4, "Asian",
                                             np.where(df['race_o'] == 5, "Native American", "Other")))))

#### 2.3.1.2 Missing piid and Column Rename
First, rename partner to pid and pid to piid for clarity. Now we have 10 missing piid's - the partner's iid number. As it turns out, all 10 are for pid number 7, from wave 5. We can obtain the missing piid from the iid for id number 7 of the same wave. 

In [10]:
df = df.rename(columns={'pid': 'piid'})
df = df.rename(columns={'partner':'pid'})
wave_pid = df.loc[df['piid'].isna()][['wave','pid']].drop_duplicates()
piid = df.loc[(df['wave'] == wave_pid.wave.tolist()) & (df['id'] == wave_pid.pid.tolist())]['iid'].drop_duplicates().tolist()
df.loc[(df.piid.isna()), 'piid'] = piid

#### 2.3.2.3 Update Field_Cd for Operations Research

In [11]:
df.loc[df['field'] == 'Operations Research', 'field_cd'] = 8

#### 2.3.2.4 Career_c Missing Data

In [12]:
df.loc[df['career'] == 'lawyer', 'career_c'] = 1
df.loc[df['career'] == 'law', 'career_c'] = 1
df.loc[df['career'] == 'Economist', 'career_c'] = 7
df.loc[df['career'] == 'tech professional', 'career_c'] = 15
df.loc[df['career'].isnull(), 'career_c'] = 10 # NaNs converted to undecided

#### 2.3.2.5 Convert select binary variables to categorical

In [13]:
df['dec'] = np.where(df['dec']==0, 'No', 'Yes')
df['dec_o'] = np.where(df['dec_o']==0, 'No', 'Yes')
df['match'] = np.where(df['match']==0, 'Not Matched', 'Matched')
df['condtn'] = np.where(df['condtn']==1, 'Limited Choice', 'Extensive Choice')
df['samerace'] = np.where(df['samerace']==1, 'Same Race', 'Not Same Race')
df['met'] = np.where(df['met']==1, 'Met', 'Not Met')
df['met_o'] = np.where(df['met_o']==1, 'Met', 'Not Met')

#### 2.3.2.6 Difference in Age [Male-Female]

In [1]:
df['age_diff'] = np.where(df['gender'] == 'Male', df['age']-df['age_o'], df['age_o']-df['age'])

NameError: name 'np' is not defined

#### 2.3.2.7 Encoding Cross-Reference Table

In [14]:
# Field_cd
df_labels = pd.DataFrame()

field = ['field_cd'] * 18
code = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,]
value = ['Law','Math','Social Science, Psychologist','Medical Science, Pharmaceuticals, and Bio Tech','Engineering',
         'English/Creative Writing/ Journalism','History/Religion/Philosophy','Business/Econ/Finance',
         'Education, Academia','Biological Sciences/Chemistry/Physics','Social Work','Undergrad/undecided',
         'Political Science/International Affairs','Film','Fine Arts/Arts Administration','Languages',
         'Architecture','Other']
dfl = pd.DataFrame({'field':field, 'code': code, 'value':value})
df_labels = pd.concat([df_labels, dfl], axis=0)


In [15]:
# Goal
field = ['goal'] * 6
code = [1,2,3,4,5,6]
value = ['Seemed like a fun night out', 'To meet new people', 'To get a date', 'Looking for a serious relationship',
         'To say I did it', 'Other']
dfl = pd.DataFrame({'field':field, 'code': code, 'value':value})
df_labels = pd.concat([df_labels, dfl], axis=0)

In [16]:
# Date
field = ['date'] * 7
code = [1,2,3,4,5,6,7]
value = ['Several times a week','Twice a week','Once a week','Twice a month','Once a month','Several times a year',
         'Almost never']
dfl = pd.DataFrame({'field':field, 'code': code, 'value':value})
df_labels = pd.concat([df_labels, dfl], axis=0)

In [17]:
# Go_out
field = ['go_out'] * 7
code = [1,2,3,4,5,6,7]
value = ['Several times a week','Twice a week','Once a week','Twice a month','Once a month','Several times a year',
         'Almost never']
dfl = pd.DataFrame({'field':field, 'code': code, 'value':value})
df_labels = pd.concat([df_labels, dfl], axis=0)

In [18]:
# Career_c
field = ['career_c'] * 17
code = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]
value = ['Lawyer','Academic/Research','Psychologist','Doctor/Medicine','Engineer','Creative Arts/Entertainment',
         'Banking/Consulting/Finance/Marketing/Business/CEO/Entrepreneur/Admin','Real Estate',
         'International/Humanitarian Affairs','Undecided','Social Work','Speech Pathology','Politics',
         'Pro sports/Athletics','Other','Journalism','Architecture']
dfl = pd.DataFrame({'field':field, 'code': code, 'value':value})
df_labels = pd.concat([df_labels, dfl], axis=0)

In [19]:
# length
field = ['length'] * 3
code = [1,2,3]
value = ['Too Little', 'Too Much', 'Just Right']
dfl = pd.DataFrame({'field':field, 'code': code, 'value':value})
df_labels = pd.concat([df_labels, dfl], axis=0)

In [20]:
# Numdat_2
field = ['numdat_2'] * 3
code = [1,2,3]
value = ['Too Few', 'Too Many', 'Just Right']
dfl = pd.DataFrame({'field':field, 'code': code, 'value':value})
df_labels = pd.concat([df_labels, dfl], axis=0)

### Save data 
Save training, validation and test sets in an interim directory for exploratory data analysis

In [21]:
def write(df, directory, filename):
    if isinstance(df, pd.DataFrame):
        if isinstance(filename, str):
            if not os.path.isdir(directory):
                os.mkdir(directory)
            df.to_csv(os.path.join(directory, filename),
                      index=False, index_label=False)
            return(True)
        else:
            return(False)
    else:
        return(False)

In [22]:
write(df, directories.INTERIM_DATA_DIR, filenames.INTERIM_FILENAME)
write(df_labels, directories.EXTERNAL_DATA_DIR, filenames.LABELS_FILENAME)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 195 entries, iid to amb5_3
dtypes: float64(175), int64(7), object(13)
memory usage: 12.5+ MB
