In [9]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
import pandas as pd
import numpy as np
import pickle

In [10]:
# Load the raw feature table from the CSV file in the working directory.
raw_df = pd.read_csv(filepath_or_buffer='exercise_20_test.csv')

In [11]:
#Separate categorical variables
# Compare against the builtin `object`: the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, so using it raises AttributeError on current
# NumPy. .copy() makes this frame independent of raw_df, so the column edits in
# later cells do not operate on a slice of the raw data.
category_df = raw_df.loc[:, raw_df.dtypes == object].copy()


In [12]:
#Separate numerical variables
# Only float64 columns are selected. .copy() makes this frame independent of
# raw_df, so columns can be added to it later without pandas warning about
# writing to a slice of another DataFrame.
num_df = raw_df.loc[:, raw_df.dtypes == np.float64].copy()

In [13]:
#Open models
# Each file in the working directory holds one pickled preprocessing object:
#   imp     - numeric imputer (its .transform is applied to num_df)
#   le_dict - mapping of categorical column name -> encoder for that column
#   enc     - one-hot encoder applied to the label-encoded frame
#   scalar  - scaler applied to the imputed numeric frame
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('imp', 'rb') as imp_file:
    imp = pickle.load(imp_file)

with open('le_dict', 'rb') as le_dict_file:
    le_dict = pickle.load(le_dict_file)

with open('enc', 'rb') as enc_file:
    enc = pickle.load(enc_file)

with open('scalar', 'rb') as scalar_file:
    scalar = pickle.load(scalar_file)

In [14]:

#convert categorical vars ($,%) to numeric
# regex=False strips the literal character. '$' is an end-of-string anchor when
# interpreted as a regular expression, and the default value of `regex` differs
# between pandas versions, so being explicit keeps the result stable.
x41_numeric = category_df['x41'].str.replace('$', '', regex=False).astype(float)
x45_numeric = category_df['x45'].str.replace('%', '', regex=False).astype(float)

#add converted vars to numeric df
# assign() appends x41 and then x45 after the existing columns, which is the
# column order handed to imp.transform below. Both statements build new frames
# rather than mutating slices of raw_df in place.
num_df = num_df.assign(x41=x41_numeric, x45=x45_numeric)
category_df = category_df.drop(columns=['x41', 'x45'])

#align same categories with different names
# A single mapping replaces the chained-indexing writes
# (category_df['x35'][mask] = ...), which raise SettingWithCopyWarning and do
# not update the frame when pandas Copy-on-Write is enabled.
# NOTE(review): 'thurday' is kept exactly as written; presumably it matches the
# label the x35 encoder in le_dict was fitted on. Confirm before changing it.
day_aliases = {'wednesday': 'wed', 'fri': 'friday', 'thur': 'thurday'}
category_df = category_df.assign(x35=category_df['x35'].replace(day_aliases))

#perform inductive, single imputation with numerical df
#NOTE: Will not attempt multiple imputation
# imp.transform output carries no labels here, so the original index and column
# names are re-attached when the frame is built.
imp_num_df = pd.DataFrame(
    imp.transform(num_df),
    index=num_df.index,
    columns=num_df.columns,
)

In [15]:
#impute missing categorical vars as most frequent in category
# value_counts() sorts by descending frequency, so index[0] is the most common
# non-missing value of each column.
# NOTE(review): the fill value is computed from this file's own data, not taken
# from a saved training-time value; confirm that is intended.
most_frequent = {
    col: category_df[col].value_counts().index[0]
    for col in category_df
}
# fillna with a per-column mapping returns a new frame. The previous
# chained-indexing write (category_df[col][mask] = value) raises
# SettingWithCopyWarning and does not update the frame when pandas
# Copy-on-Write is enabled.
category_df = category_df.fillna(value=most_frequent)

In [16]:
#convert categories into nominal integers
# The encoder for each categorical column is looked up in le_dict by column
# name and applied to that column; the results are collected into one frame.
encoded_columns = {
    name: le_dict[name].transform(category_df[name])
    for name in category_df
}
label_df = pd.DataFrame(encoded_columns)

In [17]:
#create df of binary cols representing instance of each category across multiple columns
# enc.transform output is densified with .toarray() before being wrapped in a
# DataFrame; the resulting columns keep default integer labels.
onehot_matrix = enc.transform(label_df)
onehot_df = pd.DataFrame(onehot_matrix.toarray())

In [18]:
# Scale the imputed numeric features with the loaded scaler; the resulting
# frame gets default integer row and column labels.
scaled_values = scalar.transform(imp_num_df)
scaled_num_df = pd.DataFrame(scaled_values)

In [19]:
# Place the scaled numeric columns and the one-hot columns side by side
# (numeric block first), aligned on the row index.
feature_blocks = [scaled_num_df, onehot_df]
x_df = pd.concat(feature_blocks, axis=1)

In [20]:
#save x vars
# Persist the assembled feature frame as a pickle in the working directory,
# using the highest pickle protocol available to this interpreter.
with open('x_df_test', 'wb') as out_file:
    pickle.dump(x_df, out_file, protocol=pickle.HIGHEST_PROTOCOL)
