In [1]:
import zipfile as zp
from math import ceil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score, GroupShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
warnings.filterwarnings("ignore")

In [2]:
df_list = []
for file in ["train.csv", "test.csv"]:
    with zp.ZipFile("./data.zip") as myzip:
        with myzip.open(file) as myfile:
            df_list.append(pd.read_csv(myfile))
            
train_df, test_df = df_list

In [3]:
# Correcting variables
train_df.rename({"Parents or siblings infected": "Parents_Children_Infected", 
                 "Wife/Husband or children infected": "Partner_Siblings_Infected"}, axis=1, inplace=True)
test_df.rename({"Parents or siblings infected": "Parents_Children_Infected", 
                "Wife/Husband or children infected": "Partner_Siblings_Infected"}, axis=1, inplace=True)

In [4]:
train_df.set_index("Patient_ID", inplace=True)
test_df.set_index("Patient_ID", inplace=True)

In [5]:
X_train = train_df.drop('Deceased', axis=1)
y_train = train_df['Deceased']
X_test = test_df.copy()

In [6]:
X_train.City.fillna(value="Santa Fe", inplace=True)

In [7]:
X_train.Medical_Tent = X_train.Medical_Tent.fillna(value="NK")
test_df.Medical_Tent = test_df.Medical_Tent.fillna(value="NK")

In [8]:
X_train['Title'] = X_train['Name'].str.split('\\W', 1, expand=True)[0]
test_df['Title'] = test_df['Name'].str.split('\\W', 1, expand=True)[0]

In [9]:
X_train['Title_binary'] = X_train['Title'].apply(lambda x: 1 if x in ["Master","Miss"] else 0)
test_df['Title_binary'] = test_df['Title'].apply(lambda x: 1 if x in ["Master","Miss"] else 0)

In [10]:
def scale_function(X_data, columns_to_norm, columns_not_norm, scaler_type, scaler=None):
    '''Function to apply normalization'''
    
    if scaler_type=='MinMaxScaler':
        if scaler is None:
            scaler = MinMaxScaler().fit(X_data[columns_to_norm])
        X_scaled = scaler.transform(X_data[columns_to_norm])
        
    else:
        if scaler is None:
            scaler = StandardScaler().fit(X_data[columns_to_norm])
        X_scaled = scaler.transform(X_data[columns_to_norm])
        
    X = np.append(X_data[columns_not_norm], X_scaled, axis=1)
    X = pd.DataFrame(X, columns = columns_not_norm+columns_to_norm)

    return X, scaler

In [11]:
scaler_type="MinMaxScaler"

In [12]:
def knn_imputer_birthday(X_train, test, scaler_type):
    cols_drop = ['Family_Case_ID','Name','Medical_Expenses_Family','Medical_Tent','City']
    columns_to_norm = ['Severity','Parents_Children_Infected','Partner_Siblings_Infected']
    columns_not_norm=['Birthday_year','Title_binary']
    
    X_use = X_train.copy()
    X_test = test.copy()
    
    fill_birthday_train, scaler = scale_function(X_use.drop(columns=cols_drop), columns_to_norm=columns_to_norm,
                                                 columns_not_norm=columns_not_norm, scaler_type=scaler_type)
    fill_birthday_test, scaler = scale_function(X_test.drop(columns=cols_drop), columns_to_norm=columns_to_norm,
                                                 columns_not_norm=columns_not_norm, scaler_type=scaler_type)
    
    imputer = KNNImputer(n_neighbors=3, weights='distance').fit(fill_birthday_train)
    
    X_imputted_birthday_train = imputer.transform(fill_birthday_train)
    X_imputted_birthday_test = imputer.transform(fill_birthday_test)
    
    X_use.Birthday_year = list(pd.DataFrame(X_imputted_birthday_train).iloc[:,0]) # 0 is the index of Birthday_year
    X_test.Birthday_year = list(pd.DataFrame(X_imputted_birthday_test).iloc[:,0]) # 0 is the index of Birthday_year
    
    return X_use, X_test

In [13]:
X_train, test_df = knn_imputer_birthday(X_train, test_df, scaler_type=scaler_type)

In [14]:
X_train.Birthday_year = X_train.Birthday_year.round(0).astype(int)
test_df.Birthday_year = test_df.Birthday_year.round(0).astype(int)

In [15]:
X_train.head()

Unnamed: 0_level_0,Family_Case_ID,Severity,Name,Birthday_year,Parents_Children_Infected,Partner_Siblings_Infected,Medical_Expenses_Family,Medical_Tent,City,Title,Title_binary
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,4696,3,Miss Linda Betty,2005,0,0,225,NK,Santa Fe,Miss,1
2,21436,1,Ms. Ramona Elvira,1966,0,1,1663,NK,Albuquerque,Ms,0
3,7273,3,Mr. Mario Vernon,1982,0,0,221,NK,Santa Fe,Mr,0
4,8226,3,Mr. Hector Joe,1997,0,0,220,NK,Santa Fe,Mr,0
5,19689,3,Ms. Jennie Debra,1994,0,0,222,NK,Santa Fe,Ms,0


# Pipeline

In [49]:
# This cell holds all the Custom Transformers we designed to pipeline the raw data to the model
class PrepImpute(BaseEstimator, TransformerMixin):
    """
    Creates Title
    Creates Title_binary
    Creates Medical_Tent
    Imputes City with most frequent value
    """
    #Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.city_imputer = SimpleImputer(strategy="most_frequent")
        
    #Return self nothing else to do here
    def fit(self, X, y=None):
        self.city_imputer.fit(X[["City"]])
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X):
        
        X2 = X.copy()
        
        # CREATE NEW VARIABLES
        X2['Title'] = X2['Name'].str.split('\\W', 1, expand=True)[0]
        X2['Title_binary'] = X2['Title'].apply(lambda x: 1 if x in ["Master","Miss"] else 0)
        X2["Medical_Tent"] = X2["Medical_Tent"].fillna(value="NK")
        X2['City'] = self.city_imputer.transform(X2[["City"]])
        
        return X2 
    

class KNNImputerScaled(KNNImputer):
    """
    KNNImputer subclass that scales variables before applying imputation
    Impute Birthday_year using scaled and non-scaled variables with KNNImputer
    """
    def __init__(self, missing_values=np.nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False,
                 columns_to_norm=['Severity','Parents_Children_Infected','Partner_Siblings_Infected'], columns_not_to_norm=['Birthday_year','Title_binary'],
                 scaler=MinMaxScaler(), **scaler_args):
        super().__init__(missing_values=missing_values, 
                         n_neighbors=n_neighbors, 
                         weights=weights,
                         metric=metric,
                         copy=copy, 
                         add_indicator=add_indicator)
        self.scaler = scaler.set_params(**scaler_args)
        self.columns_to_norm = columns_to_norm
        self.columns_not_to_norm = columns_not_to_norm
        
    def fit(self, X, y=None):
        X_scaled = self.scaler.fit_transform(X[self.columns_to_norm])
        X_trans = np.concatenate([X_scaled, X[self.columns_not_to_norm]], axis=1)
        super().fit(X_trans)
        return self
    
    def transform(self, X, y=None):
        X_scaled = self.scaler.transform(X[self.columns_to_norm])
        X_trans = np.concatenate([X_scaled, X[self.columns_not_to_norm]], axis=1)
        X_imputed_scaled = pd.DataFrame(super().transform(X_trans), index=X.index, columns=self.columns_to_norm + self.columns_not_to_norm)
        X_imputed = X.copy()
        X_imputed["Birthday_year"] = X_imputed_scaled["Birthday_year"].round(0).astype(int)
        
        return X_imputed
    
    
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Creates Age
    Creates Gender
    Ceates Parents_Children_Infected_Binary
    Creates Partner_Siblings_Infected_Binary
    Creates Medical_Tent_Binary
    Creates Family_Infected
    Creates Family_Infected_Binary
    Creates Family_Deceased
    Creates Family_Deceased_Reduced
    Creates Medical_Expenses_Individual
    """
    #Class constructor method that takes in a list of values as its argument
    def __init__(self):
        pass    
        
    #Return self nothing else to do here
    def fit(self, X, y):
        self.familycount_ = X.Family_Case_ID.value_counts().to_dict()
        X_train_w_dec = pd.concat([X, y], axis=1)
        self.deceasedfamcount_ = X_train_w_dec.loc[X_train_w_dec['Deceased']==1].Family_Case_ID.value_counts().to_dict()
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X):
        X2 = X.copy()
        # CREATE NEW VARIABLES
        X2['Age'] = X2['Birthday_year'].map(lambda x: 2020 - x)
        X2['Gender'] = X2['Title'].map(lambda x: "Male" if x in ["Mr", "Master"] else "Female")
        X2['Parents_Children_Infected_Binary'] = X2['Parents_Children_Infected'].map(lambda x: 0 if x==0 else 1)  
        X2['Partner_Siblings_Infected_Binary'] = X2['Partner_Siblings_Infected'].map(lambda x: 0 if x==0 else 1)
        X2["Medical_Tent_Binary"] = X2["Medical_Tent"].map(lambda x: x if x == "NK" else "K")
        #X2["Pediatric_Binary"] = X2["Age"].map(lambda x: 1 if x < 18 else 0)
        #X2["3rd_Age_Binary"] = X2["Age"].map(lambda x: 1 if x >= 65 else 0)
        X2["Family_Infected"] = X2["Family_Case_ID"].map(self.familycount_)
        X2["Family_Infected_Binary"] = X2["Family_Infected"].map(lambda x: 0 if x==1 else 1)
        X2["Family_Deceased"] = (X2["Family_Case_ID"].map(self.deceasedfamcount_)-1).fillna(0)
        X2["Family_Deceased_Reduced"] = X2['Family_Deceased'].map(lambda x: x if x==0 else (2 if x>=3 else 1))
        #X2["Dead_infected_ratio_family"] = X2["Family_Deceased"] / X2["Family_Infected"]
        X2["Medical_Expenses_Individual"] = X2["Medical_Expenses_Family"] / X2["Family_Infected"]
        # FURTHER TRANSFORMATIONS
        X2["Medical_Expenses_Individual"][X2["Medical_Expenses_Individual"] > 3000] = 3000
        
        return X2 
    
    
class Transformations(BaseEstimator, TransformerMixin):
    """
    Transforms Medical_Expenses_Individual by applying Box-Cox power transformation
    Drops unnecessary variables
    """
    #Class constructor method that takes in a list of values as its argument
    def __init__(self):
        pass
        
    #Return self nothing else to do here
    def fit(self, X, y=None):
        self.lambda_ = boxcox(X["Medical_Expenses_Individual"] + 1)[1]
        return self

    #Transformer method we wrote for this transformer 
    def transform(self, X):
        X["Medical_Expenses_Individual"] = boxcox(X["Medical_Expenses_Individual"] + 1, alpha=self.lambda_)[0]
        # DROP INTERMEDIATE VARIABLES
        X = X.drop(["Family_Case_ID", "Name", "Birthday_year", "Title", "Title_binary", "Partner_Siblings_Infected", 
                    "Medical_Tent", "Parents_Children_Infected", "Partner_Siblings_Infected_Binary", 
                    "Parents_Children_Infected_Binary", "Family_Deceased", "Family_Infected", "Medical_Expenses_Family"], axis=1)
        
        return X 

In [17]:
X_train_pipe = train_df.drop('Deceased', axis=1)
y_train_pipe = train_df['Deceased']

In [18]:
# pi = PrepImpute()
# X_trans = pi.fit_transform(X_train_pipe)
# X_trans

In [19]:
# knnimp = KNNImputerScaled(n_neighbors=3, weights='distance', scaler=MinMaxScaler)
# knnimp.fit(X_trans)
# X_scaled = knnimp.transform(X_trans)
# X_scaled

In [20]:
# # Checking that KNNImputerScaled gets the same output as the KNNImputer used outside pipeline
# (X_train.loc[train_df["Birthday_year"].isna(), "Birthday_year"] == X_scaled.loc[train_df["Birthday_year"].isna(), "Birthday_year"]).all()

In [21]:
# boxcox1, lambd = boxcox(x=X_train["Medical_Expenses_Family"]+1)
# boxcox2 = boxcox(x=(X_train["Medical_Expenses_Family"]+1), alpha=lambd)[0]
# boxcox1, boxcox2

In [22]:
# boxcox3 = boxcox(x=test_df["Medical_Expenses_Family"]+1, alpha=lambd)[0]
# boxcox3

In [35]:
# Data Cleaning Pipeline
clean_pipeline = Pipeline([
    ('prepimpute', PrepImpute()),
    ('knnimputer', KNNImputerScaled()),  # tune: n_neighbors, weights, scaler
    ('engineering', FeatureEngineering()),
#     ('transformation', Transformations())
])

# Spliting features
ohe_columns = ["City", "Gender", "Medical_Tent_Binary", "Family_Infected_Binary"]
scale_columns = ["Severity", "Age", "Family_Deceased_Reduced", "Medical_Expenses_Individual"]

# Combining features and applying different transformations
join_pipeline = ColumnTransformer([('scaler', "passthrough", scale_columns),
                                   ('ohe', OneHotEncoder(sparse=False), ohe_columns)])  # tune: drop

# Feature Selection
fselect1 = SelectFromModel(LogisticRegression(penalty="l1", max_iter=400, multi_class="multinomial", solver="saga", n_jobs=-1, random_state=1))  # tune: estimator__C
fselect2 = NeighborhoodComponentsAnalysis(max_iter=25, tol=0.005, random_state=1)  # tune: n_components, tol=0.005 (it takes to long)
fselect3 = SelectFromModel(LogisticRegression(penalty="l2", max_iter=400,  multi_class="multinomial", solver="saga", n_jobs=-1, random_state=1))  # tune: estimator__C, threshold

# Full Preprocessing Pipeline
prep_pipeline = Pipeline([
    ("clean", clean_pipeline),
    ("join", join_pipeline),
    ("fselect1", "passthrough"),
    ("fselect2", "passthrough")
])

# Model
lr = LogisticRegression(max_iter=400, n_jobs=-1, random_state=1)
knn = KNeighborsClassifier(n_jobs=-1)

# Full Model Pipeline
full_pipeline = Pipeline([
    ("prep", prep_pipeline),
    ("model", "passthrough")
])

grid = [
    {"prep__clean__knnimputer__n_neighbors": [3, 5, 10, 15],
     "prep__clean__knnimputer__weights": ["uniform", "distance"],
     "prep__clean__knnimputer__scaler": [StandardScaler(), MinMaxScaler()],
     "prep__join__scaler": [StandardScaler(), MinMaxScaler()],
     "prep__join__ohe__drop": [None, "first"],  # define categories to drop
     "prep__fselect1": [None],
     "prep__fselect2": [None],
     "model": [knn],
     "model__n_neighbors": [5, 10, 20],
     "model__weights": ["uniform", "distance"]
    }
]

In [42]:
clean_pipeline.fit(X_train_pipe, y_train_pipe)
X_class = clean_pipeline.transform(X_train_pipe)
# X_class = pd.DataFrame(X_class, index=X_train_pipe.index, columns=scale_columns + list(full_pipeline.named_steps["prep"].named_steps["join"].named_transformers_["ohe"].get_feature_names(ohe_columns)))
X_class

Unnamed: 0_level_0,Family_Case_ID,Severity,Name,Birthday_year,Parents_Children_Infected,Partner_Siblings_Infected,Medical_Expenses_Family,Medical_Tent,City,Title,...,Age,Gender,Parents_Children_Infected_Binary,Partner_Siblings_Infected_Binary,Medical_Tent_Binary,Family_Infected,Family_Infected_Binary,Family_Deceased,Family_Deceased_Reduced,Medical_Expenses_Individual
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4696,3,Miss Linda Betty,2003,0,0,225,NK,Santa Fe,Miss,...,17,Female,0,0,NK,1,0,0.0,0.0,225.000000
2,21436,1,Ms. Ramona Elvira,1966,0,1,1663,NK,Albuquerque,Ms,...,54,Female,0,1,NK,1,0,0.0,0.0,1663.000000
3,7273,3,Mr. Mario Vernon,1982,0,0,221,NK,Santa Fe,Mr,...,38,Male,0,0,NK,1,0,0.0,0.0,221.000000
4,8226,3,Mr. Hector Joe,1997,0,0,220,NK,Santa Fe,Mr,...,23,Male,0,0,NK,1,0,0.0,0.0,220.000000
5,19689,3,Ms. Jennie Debra,1994,0,0,222,NK,Santa Fe,Ms,...,26,Female,0,0,NK,1,0,0.0,0.0,222.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,6253,3,Ms. Linda Wilcox,1998,1,1,344,NK,Santa Fe,Ms,...,22,Female,1,1,NK,2,1,0.0,0.0,172.000000
897,6483,3,Mr. Haiden Vance,2006,0,0,258,NK,Santa Fe,Mr,...,14,Male,0,0,NK,1,0,0.0,0.0,258.000000
898,981,3,Miss Anaiya Love,1990,0,0,214,NK,Taos,Miss,...,30,Female,0,0,NK,1,0,0.0,0.0,214.000000
899,16418,2,Mr. Robert Williams,1994,1,1,812,NK,Santa Fe,Mr,...,26,Male,1,1,NK,3,1,0.0,0.0,270.666667


In [43]:
X_class.isna().sum()

Family_Case_ID                      0
Severity                            0
Name                                0
Birthday_year                       0
Parents_Children_Infected           0
Partner_Siblings_Infected           0
Medical_Expenses_Family             0
Medical_Tent                        0
City                                0
Title                               0
Title_binary                        0
Age                                 0
Gender                              0
Parents_Children_Infected_Binary    0
Partner_Siblings_Infected_Binary    0
Medical_Tent_Binary                 0
Family_Infected                     0
Family_Infected_Binary              0
Family_Deceased                     0
Family_Deceased_Reduced             0
Medical_Expenses_Individual         0
dtype: int64

In [44]:
X_test.isna().sum()

Family_Case_ID                 0
Severity                       0
Name                           0
Birthday_year                 82
Parents_Children_Infected      0
Partner_Siblings_Infected      0
Medical_Expenses_Family        0
Medical_Tent                 311
City                           0
dtype: int64

In [46]:
X_test_class = clean_pipeline.transform(X_test)
X_test_class

Unnamed: 0_level_0,Family_Case_ID,Severity,Name,Birthday_year,Parents_Children_Infected,Partner_Siblings_Infected,Medical_Expenses_Family,Medical_Tent,City,Title,...,Age,Gender,Parents_Children_Infected_Binary,Partner_Siblings_Infected_Binary,Medical_Tent_Binary,Family_Infected,Family_Infected_Binary,Family_Deceased,Family_Deceased_Reduced,Medical_Expenses_Individual
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
901,49242,3,Mr. Jody Pedro,1991,0,0,203,NK,Santa Fe,Mr,...,29,Male,0,0,NK,,1,0.0,0.0,
902,10400,3,Mr. Kevin Brent,1988,0,0,631,NK,Santa Fe,Mr,...,32,Male,0,0,NK,1.0,0,0.0,0.0,631.0
903,10795,3,Mr. Frankie Cary,1981,1,0,376,NK,Albuquerque,Mr,...,39,Male,1,0,NK,1.0,0,0.0,0.0,376.0
904,62440,3,Mr. Rick Pete,1994,0,1,405,NK,Albuquerque,Mr,...,26,Male,0,1,NK,,1,0.0,0.0,
905,81311,2,Mr. Matthew Erick,1996,0,0,378,NK,Santa Fe,Mr,...,24,Male,0,0,NK,,1,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,110522,3,Mr. Luther Rogelio,1991,0,0,221,NK,Santa Fe,Mr,...,29,Male,0,0,NK,,1,0.0,0.0,
1297,118768,3,Mr. Emanuel Ruben,1991,0,0,202,NK,Albuquerque,Mr,...,29,Male,0,0,NK,,1,0.0,0.0,
1298,86158,1,Mrs. Misty Camille,1994,0,1,3830,C,Albuquerque,Mrs,...,26,Female,0,1,K,,1,0.0,0.0,
1299,18523,3,Master Gustavo Jordan,2007,2,0,567,NK,Santa Fe,Master,...,13,Male,1,0,NK,2.0,1,0.0,0.0,283.5


In [48]:
X_test_class.isna().sum()

Family_Case_ID                        0
Severity                              0
Name                                  0
Birthday_year                         0
Parents_Children_Infected             0
Partner_Siblings_Infected             0
Medical_Expenses_Family               0
Medical_Tent                          0
City                                  0
Title                                 0
Title_binary                          0
Age                                   0
Gender                                0
Parents_Children_Infected_Binary      0
Partner_Siblings_Infected_Binary      0
Medical_Tent_Binary                   0
Family_Infected                     253
Family_Infected_Binary                0
Family_Deceased                       0
Family_Deceased_Reduced               0
Medical_Expenses_Individual         253
dtype: int64

In [27]:
# Instatiating GridSearch
gscv = GridSearchCV(full_pipeline, grid, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1), scoring='accuracy', verbose=1, n_jobs=-1)

# Get ID of grid search
id_num = input("Insert GridSearch ID number: ")

# Grid Search and model training
gscv.fit(X_train_pipe, y_train_pipe)

# Obtain outputs from Grid Search
# gs_outputs(id_num, gscv, grid)

Insert GridSearch ID number:  1


Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').