In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
 # explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
import pickle

In [14]:
# Load the raw training set; the path is resolved relative to the working directory.
raw_df = pd.read_csv(filepath_or_buffer='exercise_20_train.csv')

In [15]:
# Preview the first five rows to check column names and value formats.
raw_df.head(n=5)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
0,0.963686,6.627185,-45.224008,9.477531,-3.216532,13.216874,9.754747,5.245851,-1.102918,-2.867482,...,0.988829,0.313772,asia,1.380664,-16.388994,5.32673,4.187294,0.045549,-3.646841,0
1,-1.770062,-23.610459,-0.964003,-31.981497,-10.294599,-10.240251,-1.518888,-1.675208,0.498134,-0.61439,...,-2.162863,1.809807,asia,2.50059,4.338834,-1.583225,-1.172417,0.011216,0.09718,0
2,9.962401,-8.349849,23.248891,-24.196879,8.93748,10.965,-7.490596,-3.025094,0.595807,0.382732,...,1.77966,9.528113,asia,1.396475,7.839188,10.402396,1.288991,0.008209,-4.132316,0
3,-5.780709,-25.261584,1.383115,-11.786929,7.993078,-11.245752,-2.607351,-3.513896,-0.614235,-1.453979,...,-0.203206,4.892248,asia,0.744317,7.380982,7.599323,-8.022884,-0.067624,-1.796198,0
4,1.211541,1.119963,7.512938,21.987312,-5.155392,10.339416,3.04518,-0.61923,-0.928068,0.405024,...,0.248724,18.69499,asia,1.703196,-11.552129,0.381768,-3.550471,-0.05518,-3.34449,0


In [16]:
# Separate the categorical (object-dtype) columns.
# `np.object` was only an alias of the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so compare against `object` directly.
# `.copy()` gives an independent frame, so the column assignments made in later
# cells act on category_df itself rather than on a selection of raw_df.
category_df = raw_df.loc[:, raw_df.dtypes == object].copy()

In [17]:
# Separate the float64 columns (integer columns are not selected by this mask).
# `.copy()` gives an independent frame: a later cell adds the converted
# 'x41'/'x45' columns to num_df, which should not be done on a selection of raw_df.
num_df = raw_df.loc[:, raw_df.dtypes == np.float64].copy()

In [18]:
# Summarize the categorical variables (count / unique / top / freq).
# Uses the builtin `object` because the `np.object` alias was removed in NumPy 1.24.
category_df.describe(include=[object])

Unnamed: 0,x34,x35,x41,x45,x68,x93
count,39992,39990,39996,39993,39991,39993
unique,10,8,37817,10,12,3
top,volkswagon,wed,$-370.55,0.01%,July,asia
freq,12622,14820,4,9610,11114,35384


In [19]:
# Convert the string-encoded numeric columns to float:
#   x41 carries a '$' sign (e.g. "$-370.55"), x45 carries a '%' sign (e.g. "0.01%").
# regex=False is explicit because '$' is a regex metacharacter (end-of-string
# anchor) and the default of `regex` differs between pandas versions; a literal
# replacement is what is intended here.
category_df['x41'] = category_df['x41'].str.replace('$', '', regex=False).astype(float)
category_df['x45'] = category_df['x45'].str.replace('%', '', regex=False).astype(float)

In [20]:
# Move the freshly converted columns from the categorical frame to the numeric one.
converted_cols = ['x41', 'x45']
for converted in converted_cols:
    num_df[converted] = category_df[converted]
# Dropped in place so category_df keeps only true categorical columns.
category_df.drop(columns=converted_cols, inplace=True)

In [21]:
# Show the frequency of every level (including NaN) for each categorical column.
for col in category_df.columns:
    level_counts = category_df[col].value_counts(dropna=False)
    print(level_counts, '\n')

volkswagon    12622
Toyota        10968
bmw            7262
Honda          5174
tesla          2247
chrystler      1191
nissan          326
ford            160
mercades         31
chevrolet        11
NaN               8
Name: x34, dtype: int64 

wed          14820
thurday      13324
wednesday     5938
thur          4428
tuesday        884
friday         517
monday          53
fri             26
NaN             10
Name: x35, dtype: int64 

July       11114
Jun         9317
Aug         8170
May         4744
sept.       3504
Apr         1629
Oct          885
Mar          407
Nov          145
Feb           48
Dev           16
January       12
NaN            9
Name: x68, dtype: int64 

asia       35384
america     3167
euorpe      1442
NaN            7
Name: x93, dtype: int64 



In [22]:
# Boolean masks for the weekday spellings that duplicate another level of x35.
x35_values = category_df['x35']
day_mask1 = x35_values.eq('wednesday')
day_mask2 = x35_values.eq('fri')
day_mask3 = x35_values.eq('thur')

In [23]:
# Collapse duplicate weekday spellings onto the level already used for that day.
# A single .loc[row_mask, column] assignment replaces the chained form
# category_df['x35'][mask] = ..., which may write to a temporary copy (and never
# reaches the frame when pandas Copy-on-Write is enabled).
category_df.loc[day_mask1, 'x35'] = 'wed'
category_df.loc[day_mask2, 'x35'] = 'friday'
category_df.loc[day_mask3, 'x35'] = 'thurday'

In [24]:
# Inductive, single imputation of the numeric frame (no multiple imputation attempted).
# The fitted imputer is kept in `imp` so it can be saved and reused later.
imp = IterativeImputer()
imp_num_df = pd.DataFrame(
    imp.fit_transform(num_df),
    index=num_df.index,
    columns=num_df.columns,
)

In [25]:
# Persist the fitted imputer to the file 'imp'.
with open('imp', 'wb') as imp_file:
    pickle.dump(imp, imp_file, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# Impute missing categorical values with the most frequent level of each column.
for col in category_df:
    # value_counts() sorts by descending frequency and ignores NaN, so the
    # first index entry is the most common observed level.
    max_freq = category_df[col].value_counts().index[0]
    # One .loc assignment instead of the chained category_df[col][mask] = ...,
    # which may write to a temporary copy and leave the NaNs in place.
    category_df.loc[pd.isna(category_df[col]), col] = max_freq

In [27]:
# Map every categorical column to integer codes, keeping one fitted
# LabelEncoder per column in le_dict so the mapping can be reused later.
le_dict = {}
encoded_columns = {}
for col in category_df.columns:
    le = LabelEncoder()
    encoded_columns[col] = le.fit_transform(category_df[col])
    le_dict[col] = le
label_df = pd.DataFrame(encoded_columns)

In [28]:
# Persist the per-column label encoders to the file 'le_dict'.
with open('le_dict', 'wb') as le_file:
    pickle.dump(le_dict, le_file, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
# Expand the integer-coded categories into one binary column per level,
# across all categorical columns.
enc = OneHotEncoder()
onehot_matrix = enc.fit_transform(label_df)
# Densify before wrapping in a DataFrame; columns keep their default integer labels.
onehot_df = pd.DataFrame(onehot_matrix.toarray())

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [21]:
# Persist the fitted one-hot encoder to the file 'enc'.
with open('enc', 'wb') as enc_file:
    pickle.dump(enc, enc_file, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# Standardize the imputed numeric features; the fitted scaler stays in `scalar`
# so it can be saved and reused later.
scalar = StandardScaler()
scaled_values = scalar.fit_transform(imp_num_df)
# The resulting frame uses default integer row and column labels.
scaled_num_df = pd.DataFrame(scaled_values)

In [23]:
# Persist the fitted scaler to the file 'scalar'.
with open('scalar', 'wb') as scaler_file:
    pickle.dump(scalar, scaler_file, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# Build the feature matrix: scaled numeric columns followed by the one-hot columns.
# NOTE(review): both inputs are built from bare arrays and so carry default
# integer column labels; the concatenated frame therefore has repeated labels
# (0, 1, ... appear in both halves). Select columns by position, not by label.
x_df = pd.concat(objs=[scaled_num_df, onehot_df], axis='columns')

In [25]:
# Persist the feature matrix to the file 'x_df'.
with open('x_df', 'wb') as features_file:
    pickle.dump(x_df, features_file, protocol=pickle.HIGHEST_PROTOCOL)


In [26]:
# Separate the dependent binary variable 'y' and store it as integers.
y_df = raw_df.loc[:, 'y'].astype('int')


In [27]:
# Persist the target vector to the file 'y_df'.
with open('y_df', 'wb') as target_file:
    pickle.dump(y_df, target_file, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:

pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.
