# Data Prep

In [1]:
import pandas as pd
import numpy as np
import math 
import pickle

import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML

from sklearn import preprocessing

In [2]:
# Load the training frame from a pickle in the working directory
# (presumably written by an earlier cleaning step — verify upstream).
train_df = pd.read_pickle('cleaned_train_df.pkl')

In [3]:
# Load the test frame from a pickle in the working directory
# (presumably written by an earlier cleaning step — verify upstream).
test_df = pd.read_pickle('cleaned_test_df.pkl')

In [4]:
# Restore the two pickled field collections from the working directory.
# con_fields is used further down to select columns from train_df / test_df.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open('con_fields.pkl', 'rb') as con_handle:
    con_fields = pickle.load(con_handle)

with open('nom_fields.pkl', 'rb') as nom_handle:
    nom_fields = pickle.load(nom_handle)

In [5]:
# Working checklist of fields and the treatment planned for each one
# (use / skip / reduce). The list is only evaluated for display; it is not
# assigned to a name, so later cells do not depend on it.
[
 'education', # reduce num cats
 'family members under 18', # use
 'country of birth self', # reduce to USA/other
 'tax filer stat', # use
 'class of worker', # use
 'year', # use
 'migration code-change in reg', # use
 'marital stat', # use
 'detailed household and family stat', # skip
 'migration code-change in msa', # skip
 'major industry code', # use
 'detailed occupation recode', # use
 'sex', # use
 'region of previous residence', # use
 'migration code-move within reg', # use
 'race', # use
 'hispanic origin', # use
 'enroll in edu inst last wk', # use
 'member of a labor union', # use
 'country of birth mother', # reduce to USA/other
 'target', # use
 'migration prev res in sunbelt', # skip
 'detailed industry recode', # use
 'live in this house 1 year ago', # use
 'veterans benefits', # use
 'own business or self employed', # use
 'full or part time employment stat', # use
 'country of birth father', # reduce to USA/other
 'citizenship', # use
 'state of previous residence', # use
 'major occupation code', # use
 "fill inc questionnaire for veteran's admin", # skip
 'reason for unemployment', # use
 'detailed household summary in household' # use
]

['education',
 'family members under 18',
 'country of birth self',
 'tax filer stat',
 'class of worker',
 'year',
 'migration code-change in reg',
 'marital stat',
 'detailed household and family stat',
 'migration code-change in msa',
 'major industry code',
 'detailed occupation recode',
 'sex',
 'region of previous residence',
 'migration code-move within reg',
 'race',
 'hispanic origin',
 'enroll in edu inst last wk',
 'member of a labor union',
 'country of birth mother',
 'target',
 'migration prev res in sunbelt',
 'detailed industry recode',
 'live in this house 1 year ago',
 'veterans benefits',
 'own business or self employed',
 'full or part time employment stat',
 'country of birth father',
 'citizenship',
 'state of previous residence',
 'major occupation code',
 "fill inc questionnaire for veteran's admin",
 'reason for unemployment',
 'detailed household summary in household']

### Education

In [6]:
# Lookup table from raw education labels to the reduced set of categories.
# Labels in this first group are kept exactly as they are.
education_map = {
    label: label
    for label in (
        'Children',
        'Less than 1st grade',
        'High school graduate',
        'Some college but no degree',
        'Associates degree-occup /vocational',
        'Associates degree-academic program',
        'Doctorate degree(PhD EdD)',
        'Prof school degree (MD DDS DVM LLB JD)',
    )
}

# Labels in this second group are merged into a coarser bucket or renamed.
education_map.update({
    '1st 2nd 3rd or 4th grade': 'Elementary school',
    '5th or 6th grade': 'Middle school',
    '7th and 8th grade': 'Middle school',
    '9th grade': 'Some high school',
    '10th grade': 'Some high school',
    '11th grade': 'Some high school',
    '12th grade no diploma': 'Some high school',
    'Bachelors degree(BA AB BS)': 'Bachelors degree',
    'Masters degree(MA MS MEng MEd MSW MBA)': 'Masters degree',
})

In [7]:
# Collapse the raw education labels into the buckets defined by education_map.
# .str.strip() removes surrounding whitespace before the lookup.
# A label that is not a key of the map is passed through unchanged instead of
# being turned into NaN. This keeps unexpected labels visible, and it makes
# the cell safe to re-run: already-collapsed labels such as 'Some high school'
# are not keys of the map and would otherwise be wiped out on a second run.
train_df['education'] = train_df['education'].str.strip().map(
    lambda label: education_map.get(label, label)
)
test_df['education'] = test_df['education'].str.strip().map(
    lambda label: education_map.get(label, label)
)

In [8]:
# Spot-check the first rows of the recoded education column.
train_df['education'].head()

0          High school graduate
1    Some college but no degree
2              Some high school
3                      Children
4                      Children
Name: education, dtype: object

### Countries

In [9]:
# Reduce the respondent's country of birth to 'United States' vs 'other'.
# Both the raw spelling ('United-States') and the already-recoded spelling
# ('United States') count as US, so re-running this cell no longer flips
# every row to 'other'. Any other value, including a missing one, becomes 'other'.
train_df['country of birth self'] = train_df['country of birth self'].str.strip().apply(
    lambda x: 'United States' if x in ('United-States', 'United States') else 'other'
)

test_df['country of birth self'] = test_df['country of birth self'].str.strip().apply(
    lambda x: 'United States' if x in ('United-States', 'United States') else 'other'
)

In [10]:
country_field = 'country of birth mother'
# Reduce the column named by country_field to 'United States' vs 'other'.
# Both the raw spelling ('United-States') and the already-recoded spelling
# ('United States') count as US, so re-running this cell no longer flips
# every row to 'other'. Any other value, including a missing one, becomes 'other'.
train_df[country_field] = train_df[country_field].str.strip().apply(
    lambda x: 'United States' if x in ('United-States', 'United States') else 'other'
)

test_df[country_field] = test_df[country_field].str.strip().apply(
    lambda x: 'United States' if x in ('United-States', 'United States') else 'other'
)

In [11]:
country_field = 'country of birth father'
# Reduce the column named by country_field to 'United States' vs 'other'.
# Both the raw spelling ('United-States') and the already-recoded spelling
# ('United States') count as US, so re-running this cell no longer flips
# every row to 'other'. Any other value, including a missing one, becomes 'other'.
train_df[country_field] = train_df[country_field].str.strip().apply(
    lambda x: 'United States' if x in ('United-States', 'United States') else 'other'
)

test_df[country_field] = test_df[country_field].str.strip().apply(
    lambda x: 'United States' if x in ('United-States', 'United States') else 'other'
)

## Encoding categorical vars

In [12]:
# Columns passed to the one-hot encoder. The order matters: the encoder's
# categories_ list is walked in this same order when column labels are built.
# Entries that are commented out are excluded from encoding ('target' is
# encoded separately as the label).
vars_to_encode = [
 'education', # reduce num cats, ordinal
 'family members under 18', # use
 'country of birth self', # reduce to USA/other
 'tax filer stat', # use
 'class of worker', # use
 'year', # use
 'migration code-change in reg', # use
 'marital stat', # use
 # 'detailed household and family stat', # skip
 # 'migration code-change in msa', # skip
 'major industry code', # use
 'detailed occupation recode', # use
 'sex', # use
 'region of previous residence', # use
 'migration code-move within reg', # use
 'race', # use
 'hispanic origin', # use
 'enroll in edu inst last wk', # use
 'member of a labor union', # use
 'country of birth mother', # reduce to USA/other or skip
 # 'target', # use as y/target
 # 'migration prev res in sunbelt', # skip
 'detailed industry recode', # use
 'live in this house 1 year ago', # use
 'veterans benefits', # use
 'own business or self employed', # use
 'full or part time employment stat', # use
 'country of birth father', # reduce to USA/other or skip
 'citizenship', # use
 'state of previous residence', # use
 'major occupation code', # use
 # "fill inc questionnaire for veteran's admin", # skip
 'reason for unemployment', # use
 'detailed household summary in household' # use
]

### One-Hot encoding

In [13]:
# education_categories = [
#     'Children', 
#     'Less than 1st grade', 
#     'Elementary school', 
#     'Middle school', 
#     'Some high school', 
#     'High school graduate',
#     'Some college but no degree', 
#     'Bachelors degree', 
#     'Masters degree', 
#     'Associates degree-occup /vocational', 
#     'Associates degree-academic program', 
#     'Doctorate degree(PhD EdD)', 
#     'Prof school degree (MD DDS DVM LLB JD)'
# ]

In [14]:
# One-hot encoder that returns a dense array. No level is dropped
# (drop='first' stays disabled), so every category gets its own indicator column.
ohe = preprocessing.OneHotEncoder(sparse_output=False)

#### Train

In [15]:
# Learn the category levels from the training data only.
ohe.fit(train_df[vars_to_encode])

In [16]:
# Encode the training columns with the fitted encoder.
X_nom_train_enc = ohe.transform(train_df[vars_to_encode])

In [17]:
# Inspect the learned levels: one array per encoded column.
ohe.categories_

[array(['Associates degree-academic program',
        'Associates degree-occup /vocational', 'Bachelors degree',
        'Children', 'Doctorate degree(PhD EdD)', 'Elementary school',
        'High school graduate', 'Less than 1st grade', 'Masters degree',
        'Middle school', 'Prof school degree (MD DDS DVM LLB JD)',
        'Some college but no degree', 'Some high school'], dtype=object),
 array([' Both parents present', ' Father only present',
        ' Mother only present', ' Neither parent present',
        ' Not in universe'], dtype=object),
 array(['United States', 'other'], dtype=object),
 array([' Head of household', ' Joint both 65+', ' Joint both under 65',
        ' Joint one under 65 & one 65+', ' Nonfiler', ' Single'],
       dtype=object),
 array([' Federal government', ' Local government', ' Never worked',
        ' Not in universe', ' Private', ' Self-employed-incorporated',
        ' Self-employed-not incorporated', ' State government',
        ' Without pay'], dty

In [18]:
# Build one '<field>: <level>' label per encoded category, walking the fields
# in vars_to_encode order and each field's levels in ohe.categories_ order.
encoded_categories = [
    f'{field}: {level}'
    for field, levels in zip(vars_to_encode, ohe.categories_)
    for level in levels
]

In [19]:
# Wrap the encoded training array in a DataFrame with readable column labels.
# The rows of X_nom_train_enc are in train_df order, so train_df's index is
# reused. The later join onto train_df matches on index rather than position,
# and a fresh 0..n-1 index would misalign rows whenever train_df's index is
# anything else. With a default RangeIndex the result is unchanged.
X_nom_train_enc_df = pd.DataFrame(
    X_nom_train_enc,
    columns=encoded_categories,
    index=train_df.index,
)

In [20]:
# Combine the columns listed in con_fields with the one-hot columns.
# NOTE(review): DataFrame.join aligns on index, so this assumes
# X_nom_train_enc_df carries the same index as train_df — confirm.
train_df_enc = train_df[con_fields].join(X_nom_train_enc_df)

In [21]:
# Row-count check for the assembled training frame.
len(train_df_enc)

199523

In [22]:
# Write the assembled training features to disk.
train_df_enc.to_pickle('preprocessed_X_train_df.pkl')

#### Test

In [23]:
# Encode the test columns with the encoder fitted on the training data (no refit).
# NOTE(review): the encoder was built without handle_unknown, so a test
# category that was not seen during fit is expected to raise here — confirm
# against the installed scikit-learn version.
X_nom_test_enc = ohe.transform(test_df[vars_to_encode])

In [24]:
# Wrap the encoded test array in a DataFrame with readable column labels.
# The rows of X_nom_test_enc are in test_df order, so test_df's index is
# reused. The later join onto test_df matches on index rather than position,
# and a fresh 0..n-1 index would misalign rows whenever test_df's index is
# anything else. With a default RangeIndex the result is unchanged.
X_nom_test_enc_df = pd.DataFrame(
    X_nom_test_enc,
    columns=encoded_categories,
    index=test_df.index,
)

In [25]:
# Combine the columns listed in con_fields with the one-hot columns.
# NOTE(review): DataFrame.join aligns on index, so this assumes
# X_nom_test_enc_df carries the same index as test_df — confirm.
test_df_enc = test_df[con_fields].join(X_nom_test_enc_df)

In [26]:
# Row-count check for the assembled test frame.
len(test_df_enc)

99762

In [27]:
# Write the assembled test features to disk.
test_df_enc.to_pickle('preprocessed_X_test_df.pkl')

## Encode target variable

In [28]:
# Encoder used below to turn the target labels into integer codes.
le = preprocessing.LabelEncoder()

In [29]:
# Look at the raw target labels before encoding.
train_df['target']

0          - 50000.
1          - 50000.
2          - 50000.
3          - 50000.
4          - 50000.
            ...    
199518     - 50000.
199519     - 50000.
199520     - 50000.
199521     - 50000.
199522     - 50000.
Name: target, Length: 199523, dtype: object

In [30]:
# Learn the label set from the training target only.
le.fit(train_df['target'])

In [31]:
# Inspect the learned classes.
le.classes_

array([' - 50000.', ' 50000+.'], dtype=object)

In [32]:
# Convert the training labels to integer codes.
y_train = le.transform(train_df['target'])

In [33]:
# Peek at the encoded training labels.
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [34]:
# Save the encoded training labels to disk.
with open('y_train.pkl', 'wb') as y_train_file:
    pickle.dump(y_train, y_train_file)

In [35]:
# Encode the test labels with the encoder fitted on the training target (no refit).
y_test = le.transform(test_df['target'])

In [36]:
# Save the encoded test labels to disk.
with open('y_test.pkl', 'wb') as y_test_file:
    pickle.dump(y_test, y_test_file)

In [37]:
# Save the fitted label encoder itself to disk so the same
# label-to-code mapping can be reloaded later.
with open('y_label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)