In [32]:
import zipfile as zp
from math import ceil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score, GroupShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [4]:
df_list = []
for file in ["train.csv", "test.csv"]:
    with zp.ZipFile("./data.zip") as myzip:
        with myzip.open(file) as myfile:
            df_list.append(pd.read_csv(myfile))
            
train_df, test_df = df_list

In [5]:
# Correcting variables
train_df.rename({"Parents or siblings infected": "Parents_Children_Infected", 
                 "Wife/Husband or children infected": "Partner_Siblings_Infected"}, axis=1, inplace=True)
test_df.rename({"Parents or siblings infected": "Parents_Children_Infected", 
                "Wife/Husband or children infected": "Partner_Siblings_Infected"}, axis=1, inplace=True)

In [6]:
train_df.set_index("Patient_ID", inplace=True)
test_df.set_index("Patient_ID", inplace=True)

In [9]:
X_train = train_df.drop('Deceased', axis=1)
y_train=train_df['Deceased']

In [10]:
X_train.City.fillna(value="Santa Fe", inplace=True)

In [11]:
X_train.Medical_Tent = X_train.Medical_Tent.fillna(value="NK")
test_df.Medical_Tent = test_df.Medical_Tent.fillna(value="NK")

In [12]:
X_train['Title'] = X_train['Name'].str.split('\\W', 1, expand=True)[0]
test_df['Title'] = test_df['Name'].str.split('\\W', 1, expand=True)[0]

In [13]:
X_train['Title_binary'] = X_train['Title'].apply(lambda x: 1 if x in ["Master","Miss"] else 0)
test_df['Title_binary'] = test_df['Title'].apply(lambda x: 1 if x in ["Master","Miss"] else 0)

In [14]:
def scale_function(X_data, columns_to_norm, columns_not_norm, scaler_type, scaler=None):
    '''Function to apply normalization'''
    
    if scaler_type=='MinMaxScaler':
        if scaler is None:
            scaler = MinMaxScaler().fit(X_data[columns_to_norm])
        X_scaled = scaler.transform(X_data[columns_to_norm])
        
    else:
        if scaler is None:
            scaler = StandardScaler().fit(X_data[columns_to_norm])
        X_scaled = scaler.transform(X_data[columns_to_norm])
        
    X = np.append(X_data[columns_not_norm], X_scaled, axis=1)
    X = pd.DataFrame(X, columns = columns_not_norm+columns_to_norm)

    return X, scaler

In [15]:
scaler_type="MinMaxScaler"

In [16]:
def knn_imputer_birthday(X_train, test, scaler_type):
    cols_drop = ['Family_Case_ID','Name','Medical_Expenses_Family','Medical_Tent','City']
    columns_to_norm = ['Severity','Parents_Children_Infected','Partner_Siblings_Infected']
    columns_not_norm=['Birthday_year','Title_binary']
    
    X_use = X_train.copy()
    X_test = test.copy()
    
    fill_birthday_train, scaler = scale_function(X_use.drop(columns=cols_drop), columns_to_norm=columns_to_norm,
                                                 columns_not_norm=columns_not_norm, scaler_type=scaler_type)
    fill_birthday_test, scaler = scale_function(X_test.drop(columns=cols_drop), columns_to_norm=columns_to_norm,
                                                 columns_not_norm=columns_not_norm, scaler_type=scaler_type)
    
    imputer = KNNImputer(n_neighbors=3, weights='distance').fit(fill_birthday_train)
    
    X_imputted_birthday_train = imputer.transform(fill_birthday_train)
    X_imputted_birthday_test = imputer.transform(fill_birthday_test)
    
    X_use.Birthday_year = list(pd.DataFrame(X_imputted_birthday_train).iloc[:,0]) # 0 is the index of Birthday_year
    X_test.Birthday_year = list(pd.DataFrame(X_imputted_birthday_test).iloc[:,0]) # 0 is the index of Birthday_year
    
    return X_use, X_test

In [17]:
X_train, test_df = knn_imputer_birthday(X_train, test_df, scaler_type=scaler_type)

In [18]:
X_train.Birthday_year = X_train.Birthday_year.round(0).astype(int)
test_df.Birthday_year = test_df.Birthday_year.round(0).astype(int)

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineering(BaseEstimator, TransformerMixin):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self):
        pass    
        
    #Return self nothing else to do here
    def fit( self, X, y):
        self.familycount_ = X.Family_Case_ID.value_counts().to_dict()
        X_train_w_dec = pd.concat([X, y], axis=1)
        self.deceasedfamcount_ = X_train_w_dec.loc[X_train_w_dec['Deceased']==1].Family_Case_ID.value_counts().to_dict()
        return self

    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
        
        X2 = X.copy()
        
        # CREATE NEW VARIABLES
        X2['Age'] = X2['Birthday_year'].map(lambda x: 2020 - x)
        X2['Gender'] = X2['Title'].map(lambda x: "Male" if x in ["Mr", "Master"] else "Female")
        X2['Parents_Children_Infected_Binary'] = X2['Parents_Children_Infected'].map(lambda x: 0 if x==0 else 1)  
        X2['Partner_Siblings_Infected_Binary'] = X2['Partner_Siblings_Infected'].map(lambda x: 0 if x==0 else 1)
        #X2["Pediatric_Binary"] = X2["Age"].map(lambda x: 1 if x < 18 else 0)
        #X2["3rd_Age_Binary"] = X2["Age"].map(lambda x: 1 if x >= 65 else 0)
        X2["Family_Infected"] = X2["Family_Case_ID"].map(self.familycount_)
        X2["Family_Infected_Binary"] = X2["Family_Infected"].map(lambda x: 0 if x==1 else 1)
        X2["Family_Deceased"] = (X2["Family_Case_ID"].map(self.deceasedfamcount_)-1).fillna(0)
        X2["Family_Deceased_Reduced"] = X2['Family_Deceased'].map(lambda x: x if x==0 else (2 if x>=3 else 1))
        #X2["Dead_infected_ratio_family"] = X2["Family_Deceased"] / X2["Family_Infected"]
        X2["Medical_Expenses_Individual"] = X2["Medical_Expenses_Family"] / X2["Family_Infected"]
        
        #FURTHER TRANSFORMATIONS
        X2["Medical_Expenses_Individual"][X2["Medical_Expenses_Individual"] > 3000] = 3000
        
        return X2 

In [42]:
class Transformations(BaseEstimator, TransformerMixin):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self):
        pass
        
    #Return self nothing else to do here
    def fit( self, X, y):
        self.lambda_ = boxcox(X["Medical_Expenses_Individual"] + 1)[1]
        return self

    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
        
        X["Medical_Expenses_Individual"] = boxcox(x=X["Medical_Expenses_Individual"]+1, alpha=self.lambda_)[0]
        
        #DROP INTERMEDIATE VARIABLES
        X = X.drop(["Family_Case_ID", "Name", "Birthday_year", "Title", "Title_binary", "Partner_Siblings_Infected", 
                "Medical_Tent", "Parents_Children_Infected", "Partner_Siblings_Infected_Binary", 
                "Parents_Children_Infected_Binary", "Family_Deceased", "Family_Infected", "Medical_Expenses_Family"], 
               axis=1)
        
        return X 

In [43]:
rf = Pipeline(steps = [ ('engineering', FeatureEngineering()),
                       ('transformation', Transformations())])

rf.fit(X_train, y_train)
X_class = rf.transform(X_train)

In [44]:
X_class

Unnamed: 0_level_0,Severity,City,Age,Gender,Family_Infected_Binary,Family_Deceased_Reduced,Medical_Expenses_Individual
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,Santa Fe,15,Female,0,0.0,10.782511
2,1,Albuquerque,54,Female,0,0.0,19.604952
3,3,Santa Fe,38,Male,0,0.0,10.720469
4,3,Santa Fe,23,Male,0,0.0,10.704824
5,3,Santa Fe,26,Female,0,0.0,10.736060
...,...,...,...,...,...,...,...
896,3,Santa Fe,22,Female,1,0.0,9.880166
897,3,Santa Fe,14,Male,0,0.0,11.264519
898,3,Taos,30,Female,0,0.0,10.609794
899,2,Santa Fe,26,Male,1,0.0,11.436989


In [24]:
X_train.head()

Unnamed: 0_level_0,Family_Case_ID,Severity,Name,Birthday_year,Parents_Children_Infected,Partner_Siblings_Infected,Medical_Expenses_Family,Medical_Tent,City,Title,Title_binary
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,4696,3,Miss Linda Betty,2005,0,0,225,NK,Santa Fe,Miss,1
2,21436,1,Ms. Ramona Elvira,1966,0,1,1663,NK,Albuquerque,Ms,0
3,7273,3,Mr. Mario Vernon,1982,0,0,221,NK,Santa Fe,Mr,0
4,8226,3,Mr. Hector Joe,1997,0,0,220,NK,Santa Fe,Mr,0
5,19689,3,Ms. Jennie Debra,1994,0,0,222,NK,Santa Fe,Ms,0


In [36]:
boxcox1, lambd = boxcox(x=X_train["Medical_Expenses_Family"]+1)
boxcox2 = boxcox(x=(X_train["Medical_Expenses_Family"]+1), alpha=lambd)[0]

In [37]:
boxcox1

array([ 8.29335234, 13.44665392,  8.25375037,  8.24375485,  8.26370756,
        0.        ,  9.82539017,  9.39625591,  9.85885546, 10.7382352 ,
       10.29558628,  9.91377963,  8.66813014, 12.28084603,  8.21353512,
        8.15201606, 20.99809385, 11.57285872, 10.58631735,  8.15201606,
       16.01158644, 11.07674833, 10.5206788 ,  9.81977449, 10.81142232,
        8.25375037,  0.        ,  8.21353512, 12.75525225,  8.25375037,
        9.78015346,  8.75203272,  9.65192557,  8.89675431, 12.75525225,
       15.65311109, 13.92156699,  9.48619387,  8.22364755,  8.06763831,
       13.92156699, 13.26663469,  8.39925503,  8.05689354,  8.21353512,
       11.23122765,  8.21353512,  8.89675431, 11.63510982, 14.35321037,
        8.18295861, 10.1032914 ,  8.51010722,  8.59905599,  8.21353512,
       16.48041675,  8.06763831,  8.29335234,  8.22364755, 13.08668403,
        8.29335234, 11.55710318,  8.46448479, 11.55710318, 12.28084603,
        8.29335234, 10.48507331,  8.00248274, 18.40044092, 11.13

In [38]:
boxcox2

array([ 8.29335234, 13.44665392,  8.25375037,  8.24375485,  8.26370756,
        0.        ,  9.82539017,  9.39625591,  9.85885546, 10.7382352 ,
       10.29558628,  9.91377963,  8.66813014, 12.28084603,  8.21353512,
        8.15201606, 20.99809385, 11.57285872, 10.58631735,  8.15201606,
       16.01158644, 11.07674833, 10.5206788 ,  9.81977449, 10.81142232,
        8.25375037,  0.        ,  8.21353512, 12.75525225,  8.25375037,
        9.78015346,  8.75203272,  9.65192557,  8.89675431, 12.75525225,
       15.65311109, 13.92156699,  9.48619387,  8.22364755,  8.06763831,
       13.92156699, 13.26663469,  8.39925503,  8.05689354,  8.21353512,
       11.23122765,  8.21353512,  8.89675431, 11.63510982, 14.35321037,
        8.18295861, 10.1032914 ,  8.51010722,  8.59905599,  8.21353512,
       16.48041675,  8.06763831,  8.29335234,  8.22364755, 13.08668403,
        8.29335234, 11.55710318,  8.46448479, 11.55710318, 12.28084603,
        8.29335234, 10.48507331,  8.00248274, 18.40044092, 11.13

In [39]:
boxcox3 = boxcox(x=test_df["Medical_Expenses_Family"]+1, alpha=lambd)[0]

In [40]:
boxcox3

array([ 6.20384793,  7.7809676 ,  7.0478716 ,  7.15171714,  7.05527135,
        7.00267962,  7.15171714,  6.29366494,  7.00267962,  6.1972118 ,
        6.29366494,  7.30288769,  7.35510187, 10.48530827,  6.31832763,
        7.67971818,  7.95721782,  9.72982404,  7.98737668,  8.49398558,
        6.93219188,  7.67971818,  8.99675499,  7.21270693, 11.29304846,
        7.98737668, 11.56976531,  6.31832763, 11.57358757,  6.44690491,
        6.1972118 , 11.21536932,  7.05527135,  6.70726701, 11.29304846,
        7.9713656 ,  7.98737668,  6.70726701,  6.31832763,  7.00267962,
        6.2998704 ,  7.30288769,  8.08009715,  7.00267962,  6.1972118 ,
        0.        ,  8.6049313 ,  9.53950822,  8.01695564,  8.26705953,
        9.57573226,  6.29366494,  8.70606474,  6.83263972,  5.12377684,
        6.31832763,  6.04282564, 11.57358757,  6.34257457,  8.01695564,
        6.31832763,  6.2998704 ,  7.98737668,  8.86373805,  6.2998704 ,
        8.01695564,  6.2998704 ,  9.45434532,  7.29348152,  8.96