In [1]:
import numpy as np
import pandas as pd
import sys
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
from joblib import dump, load

In [2]:
# Load the raw training data from a CSV file in the working directory.
raw_train=pd.read_csv('exercise_26_train.csv')

In [3]:
# Work on a deep copy so raw_train stays untouched and this cell can be re-run.
train_val = raw_train.copy(deep=True)

# 1. Fixing the money and percents
# '$', '(' and ')' are regex metacharacters, so every substitution is done
# with regex=False: the characters are always matched literally, whatever the
# installed pandas version uses as its default.
# Money: drop '$' and ',', and turn accounting-style "(123)" into "-123".
for old, new in (('$', ''), (',', ''), (')', ''), ('(', '-')):
    train_val['x12'] = train_val['x12'].str.replace(old, new, regex=False)
train_val['x12'] = train_val['x12'].astype(float)

# Percent: drop the trailing '%' sign.
train_val['x63'] = train_val['x63'].str.replace('%', '', regex=False)
train_val['x63'] = train_val['x63'].astype(float)

In [4]:
# 2. Creating the train/val/test set
# First split: hold out 10% of all rows as the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    train_val.drop(columns=['y']),
    train_val['y'],
    test_size=0.1,
    random_state=13,
)
# Second split: hold out 4000 of the remaining rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=4000,
    random_state=13,
)

In [5]:
# 3. smashing sets back together
def _rejoin(features, target):
    """Put the target column back next to its features and renumber rows from 0."""
    return pd.concat([features, target], axis=1, sort=False).reset_index(drop=True)


train = _rejoin(x_train, y_train)
val = _rejoin(x_val, y_val)
test = _rejoin(x_test, y_test)

In [6]:
# 3. With mean imputation from Train set

# Everything except the target and the four categorical columns.
numeric_train = train.drop(columns=['y', 'x5', 'x31', 'x81', 'x82'])

# Fill missing values with the per-column mean of the train split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_train),
    columns=numeric_train.columns,
)

# Standardise the imputed features, again fitted on the train split.
std_scaler = StandardScaler()
train_imputed_std = pd.DataFrame(
    std_scaler.fit_transform(train_imputed),
    columns=train_imputed.columns,
)

# pickle simple imputer and Standard scaler
dump(imputer, 'imputer.joblib')
dump(std_scaler, 'std_scaler.joblib')

train_imputed_std.head()

Unnamed: 0,x0,x1,x2,x3,x4,x6,x7,x8,x9,x10,...,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99
0,-0.672775,0.049379,-0.609119,-0.021764,-0.486957,-1.191547,0.196476,-0.475264,1.466229,-0.51099,...,-0.224951,0.533302,1.473799,-1.085121,0.063268,-0.20299,-0.56774,1.19156,1.265523,-0.927283
1,-1.149312,0.168114,1.231257,0.919274,0.0,-0.6165471,0.625345,0.741159,1.640918,-1.484743,...,-0.493539,-1.620501,0.663637,0.04773944,-0.272964,0.255222,0.028684,0.432182,1.603005,-1.114497
2,-0.516135,1.788307,-1.65263,-0.316264,-1.273636,-0.2460788,1.019506,0.818925,-0.047945,1.737675,...,-0.223524,-1.199278,1.684918,-0.7872544,-0.458898,0.858933,-1.023754,-0.209117,-0.105748,0.969588
3,-1.381878,1.253223,-1.145135,-1.075081,-0.17816,0.5998355,0.429411,1.055405,1.667284,-0.718742,...,-0.318542,-0.957032,0.062536,-1.185996,-0.327383,-1.466744,0.565616,0.547652,1.124988,0.764258
4,0.077947,-0.600258,0.184995,0.808261,0.639909,-3.857178e-16,-0.277005,1.23914,-0.322919,-0.476666,...,1.051483,-1.311846,-0.47313,1.555481e-15,0.968679,0.724372,-0.242202,-0.909825,-0.227995,0.216403


In [7]:
"""
Pickle all ohe
"""

def _make_encoder():
    """Build a dense, drop-first OneHotEncoder on both old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
        return OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        # Older releases only accept `sparse`.
        return OneHotEncoder(drop='first', sparse=False)


def _fit_dummies(column):
    """Fit an encoder on train[column].

    Missing values are replaced by the literal category 'NaN' before encoding.
    Returns (fitted encoder, dense dummy matrix, dummy DataFrame) where the
    frame's columns are named '<column>_<category>' for every category except
    the first one, which drop='first' removes.
    """
    encoder = _make_encoder()
    matrix = encoder.fit_transform(train[column].fillna('NaN').values.reshape(-1, 1))
    labels = [f'{column}_{i}' for i in encoder.categories_[0][1:]]
    return encoder, matrix, pd.DataFrame(matrix, columns=labels)


# One encoder per categorical column; each one is persisted right after fitting.
dumb5_ohe, dumb5, x5_dummies = _fit_dummies('x5')
dump(dumb5_ohe, 'dumb5_ohe.joblib')
dumb31_ohe, dumb31, x31_dummies = _fit_dummies('x31')
dump(dumb31_ohe, 'dumb31_ohe.joblib')
dumb81_ohe, dumb81, x81_dummies = _fit_dummies('x81')
dump(dumb81_ohe, 'dumb81_ohe.joblib')
dumb82_ohe, dumb82, x82_dummies = _fit_dummies('x82')
dump(dumb82_ohe, 'dumb82_ohe.joblib')

# Start from the numeric columns only (train_imputed.columns) so that
# re-running this cell does not append the dummy columns and 'y' a second time.
train_imputed_std = pd.concat(
    [
        train_imputed_std[train_imputed.columns],
        x5_dummies,
        x31_dummies,
        x81_dummies,
        x82_dummies,
        train['y'],
    ],
    axis=1,
    sort=False,
)


train_imputed_std.head()

Unnamed: 0,x0,x1,x2,x3,x4,x6,x7,x8,x9,x10,...,x81_January,x81_July,x81_June,x81_March,x81_May,x81_November,x81_October,x81_September,x82_Male,y
0,-0.672775,0.049379,-0.609119,-0.021764,-0.486957,-1.191547,0.196476,-0.475264,1.466229,-0.51099,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,-1.149312,0.168114,1.231257,0.919274,0.0,-0.6165471,0.625345,0.741159,1.640918,-1.484743,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,-0.516135,1.788307,-1.65263,-0.316264,-1.273636,-0.2460788,1.019506,0.818925,-0.047945,1.737675,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,-1.381878,1.253223,-1.145135,-1.075081,-0.17816,0.5998355,0.429411,1.055405,1.667284,-0.718742,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.077947,-0.600258,0.184995,0.808261,0.639909,-3.857178e-16,-0.277005,1.23914,-0.322919,-0.476666,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [8]:
# Preview the x5 dummy matrix with positional column labels (dumb50, dumb51, ...).
dumb5_labels = [f'dumb5{position}' for position in range(dumb5.shape[1])]
pd.DataFrame(dumb5, columns=dumb5_labels)

Unnamed: 0,dumb50,dumb51,dumb52,dumb53,dumb54,dumb55
0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
31995,0.0,0.0,0.0,0.0,0.0,0.0
31996,0.0,0.0,0.0,0.0,0.0,1.0
31997,0.0,0.0,0.0,0.0,0.0,0.0
31998,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Fit an L1-penalised logistic regression (no intercept) on every feature and
# keep the 25 variables with the largest squared coefficients.
exploratory_features = train_imputed_std.drop(columns=['y'])

exploratory_LR = LogisticRegression(penalty='l1', fit_intercept=False, solver='liblinear')
exploratory_LR.fit(exploratory_features, train_imputed_std['y'])

# One row per feature: its name, fitted coefficient and squared coefficient.
exploratory_results = pd.DataFrame({'name': exploratory_features.columns})
exploratory_results['coefs'] = exploratory_LR.coef_[0]
exploratory_results['coefs_squared'] = exploratory_results['coefs'] ** 2

var_reduced = exploratory_results.nlargest(25, 'coefs_squared')

In [10]:
"""
Save this variable in file
"""
# Persist the names of the selected variables as a NumPy array.
selected_names = var_reduced['name'].values
dump(selected_names, 'var_reduced.joblib')

['var_reduced.joblib']

In [11]:
# Refit a statsmodels Logit on the selected variables only, persist the fitted
# result, and show its summary table.
variables = var_reduced['name'].to_list()
logit_target = train_imputed_std['y']
logit_features = train_imputed_std[variables]

logit = sm.Logit(logit_target, logit_features)
result = logit.fit()
dump(result, 'result.joblib')

result.summary()

Optimization terminated successfully.
         Current function value: 0.527885
         Iterations 6


0,1,2,3
Dep. Variable:,y,No. Observations:,32000.0
Model:,Logit,Df Residuals:,31975.0
Method:,MLE,Df Model:,24.0
Date:,"Sat, 18 Jul 2020",Pseudo R-squ.:,0.2384
Time:,12:06:52,Log-Likelihood:,-16892.0
converged:,True,LL-Null:,-22181.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x31_japan,1.7274,0.061,28.219,0.000,1.607,1.847
x5_saturday,-1.4688,0.043,-34.490,0.000,-1.552,-1.385
x31_america,1.1869,0.037,31.991,0.000,1.114,1.260
x5_sunday,-1.1221,0.041,-27.040,0.000,-1.203,-1.041
x5_monday,-0.9770,0.041,-23.626,0.000,-1.058,-0.896
x91,0.7813,0.014,54.504,0.000,0.753,0.809
x5_tuesday,-0.6713,0.041,-16.376,0.000,-0.752,-0.591
x31_germany,0.6559,0.029,22.657,0.000,0.599,0.713
x53,-0.6310,0.014,-45.143,0.000,-0.658,-0.604


# Testing

In [12]:
import numpy as np
import pandas as pd
import joblib

In [13]:
# Load the test CSV and keep only the leading rows returned by .head().
raw_test=pd.read_csv('exercise_26_test.csv').head()

In [18]:

# Build a list of {column: value} records from raw_test.
# Only the first row is exported; use range(len(raw_test)) to export every row.
raw = [
    {column: raw_test[column].values[row] for column in raw_test.columns.values}
    for row in range(1)
]

In [19]:
import json

# Show the sample payload as a JSON string.
raw_as_json = json.dumps(raw)
print(raw_as_json)

[{"x0": 0.042317, "x1": -3.3447210000000003, "x2": 4.635124212161472, "x3": -0.5983959993003629, "x4": -0.6477715045570444, "x5": "monday", "x6": 0.184902, "x7": 46.690015, "x8": 3.034132, "x9": 0.364704, "x10": 14.260732999999998, "x11": -1.559332, "x12": "$5,547.78", "x13": 0.520324, "x14": 31.212255, "x15": 4.891671, "x16": 0.357763, "x17": 14.766366, "x18": -17.467243, "x19": 0.22462800000000002, "x20": 0.096752, "x21": 1.305564, "x22": 0.353632, "x23": 3.9090279999999997, "x24": -91.273052, "x25": 1.396952, "x26": 4.401593, "x27": 0.443086, "x28": 14.048786999999999, "x29": -0.932243, "x30": 5.255472, "x31": "germany", "x32": 0.54199153, "x33": 2.98948039, "x34": -1.78334189, "x35": 0.80127315, "x36": -2.60231221, "x37": 3.39682926, "x38": -1.22322646, "x39": -2.20977636, "x40": -68.69, "x41": 522.25, "x42": -428.69, "x43": 381.37, "x44": 0.0197503, "x45": 0.75116479, "x46": 0.8630479007977094, "x47": -1.0383166613479036, "x48": -0.27261876352216863, "x49": -0.3430207259042951, "x

In [20]:
def _parse_number(column, replacements):
    """Turn a formatted text column (currency, percent) into floats.

    Each ``(old, new)`` pair in ``replacements`` is applied as a literal,
    non-regex substitution. Missing entries come out as NaN, and values that
    are already numeric survive the round trip through text unchanged.
    Text that still is not a number after the substitutions raises ValueError.
    """
    # Map None to NaN first so astype(str) can never produce the text 'None'.
    text = column.where(column.notna()).astype(str)
    for old, new in replacements:
        text = text.str.replace(old, new, regex=False)
    return text.astype(float)


def predict(data):
    """Score raw records with the artifacts saved by the training cells.

    Parameters
    ----------
    data : dict, or list of dict, or anything ``pd.DataFrame`` accepts
        Raw records keyed by column name. A single dict is scored as one row.

    Returns
    -------
    list of dict
        One entry per input row, in input order, with keys:
        ``'class'``   -- 1 if the model output is above 0.5, otherwise 0;
        ``'proba'``   -- the model output for class 1, or one minus it for class 0;
        ``'columns'`` -- the variable names loaded from 'var_reduced.joblib'.

    Notes
    -----
    The artifacts are read with ``joblib.load``, which unpickles them; only
    load files written by this notebook.
    """
    if isinstance(data, dict):
        raw_test = pd.DataFrame(data, index=[0])
    else:
        # A fresh 0..n-1 index keeps the rows aligned with the dummy frames
        # concatenated below, and the copy leaves the caller's data untouched.
        raw_test = pd.DataFrame(data).reset_index(drop=True)

    # Same clean-up as in training: "$1,234.50" -> 1234.5, "(12)" -> -12, "5%" -> 5.
    raw_test['x12'] = _parse_number(
        raw_test['x12'], [('$', ''), (',', ''), (')', ''), ('(', '-')]
    )
    raw_test['x63'] = _parse_number(raw_test['x63'], [('%', '')])

    test = raw_test.drop(columns=['x5', 'x31', 'x81', 'x82'])

    # File name matches what the training cell writes: dump(imputer, 'imputer.joblib').
    imputer = joblib.load('imputer.joblib')
    std_scaler = joblib.load('std_scaler.joblib')

    # When the fitted imputer recorded its feature names, select exactly those
    # columns in that order, so neither key order nor extra keys in the payload
    # can shift values into the wrong feature.
    fitted_names = getattr(imputer, 'feature_names_in_', None)
    if fitted_names is not None:
        test = test[list(fitted_names)]

    test_std = pd.DataFrame(imputer.transform(test), columns=test.columns)
    test_std = pd.DataFrame(std_scaler.transform(test_std), columns=test.columns)

    # One persisted encoder per categorical column. Dummy columns are labelled
    # with every category except the first, mirroring the training cell.
    dummy_frames = []
    for column, encoder_path in (
        ('x5', 'dumb5_ohe.joblib'),
        ('x31', 'dumb31_ohe.joblib'),
        ('x81', 'dumb81_ohe.joblib'),
        ('x82', 'dumb82_ohe.joblib'),
    ):
        encoder = joblib.load(encoder_path)
        encoded = encoder.transform(raw_test[column].fillna('NaN').values.reshape(-1, 1))
        labels = [f'{column}_{i}' for i in encoder.categories_[0][1:]]
        dummy_frames.append(pd.DataFrame(encoded, columns=labels))
    test_std = pd.concat([test_std] + dummy_frames, axis=1, sort=False)

    var_reduced = joblib.load('var_reduced.joblib')
    result = joblib.load('result.joblib')

    # A plain array gives positional iteration regardless of the index type.
    y_pred = np.asarray(result.predict(test_std[var_reduced]), dtype=float)

    to_be_returned = []
    for p_one in y_pred:
        label = int(p_one > 0.5)
        to_be_returned.append({
            'class': label,
            'proba': float(p_one if label == 1 else 1 - p_one),
            'columns': var_reduced,
        })

    return to_be_returned

In [21]:
# Score the sample payload passed as a list of records.
predict(raw)

[{'class': 0,
  'proba': 0.6364502399895486,
  'columns': array(['x31_japan', 'x5_saturday', 'x31_america', 'x5_sunday',
         'x5_monday', 'x91', 'x5_tuesday', 'x31_germany', 'x53', 'x44',
         'x5_wednesday', 'x81_July', 'x81_December', 'x12', 'x62',
         'x81_October', 'x81_January', 'x81_February', 'x81_August',
         'x5_thursday', 'x81_June', 'x81_May', 'x58', 'x56', 'x60'],
        dtype=object)}]

In [6]:
# Score the same record passed as a single dict instead of a list.
predict(raw[0])

[{'class': 0,
  'proba': 0.6364502399895486,
  'columns': array(['x31_japan', 'x5_saturday', 'x31_america', 'x5_sunday',
         'x5_monday', 'x91', 'x5_tuesday', 'x31_germany', 'x53', 'x44',
         'x5_wednesday', 'x81_July', 'x81_December', 'x12', 'x62',
         'x81_October', 'x81_January', 'x81_February', 'x81_August',
         'x5_thursday', 'x81_June', 'x81_May', 'x58', 'x56', 'x60'],
        dtype=object)}]