In [44]:
# Import libraries

# Data manipulation
import numpy as np
import pandas as pd

# Visualisation
import matplotlib.pyplot as plt

# Imputation
from sklearn.impute import KNNImputer
from sklearn.impute import MissingIndicator

# Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

# Model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# Metrics
from sklearn.metrics import accuracy_score

In [45]:
# Helper functions

# Imputation

def knn_imputer(df):
    """Fill missing numeric values with a 5-nearest-neighbours imputer.

    Only ``int64``/``float64`` columns are passed to ``KNNImputer``.
    ``object`` columns are carried over unchanged, so their missing values
    are NOT imputed. Columns of any other dtype are not part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed numeric columns followed by the untouched object
        columns, on the same row index as ``df``.
    """
    # Split predictors into numeric and categorical parts
    numeric_predictors = df.select_dtypes(include=["int64", "float64"])
    categorical_predictors = df.select_dtypes(include="object")

    # KNN imputation of the numeric part (returns a bare numpy array)
    imputed_values = KNNImputer(n_neighbors=5).fit_transform(numeric_predictors)

    # Rebuild the frame on the ORIGINAL index. Without it the imputed part
    # gets a fresh 0..n-1 index and the column-wise concat below would
    # misalign rows (and create NaN rows) whenever df.index is not 0..n-1,
    # e.g. after dropna() or a train/test split.
    numeric_imputed = pd.DataFrame(
        imputed_values,
        columns=numeric_predictors.columns,
        index=df.index,
    )
    return pd.concat([numeric_imputed, categorical_predictors], axis=1)


def knn_imputer_ind(df):
    """KNN-impute numeric columns and append missing-value indicator columns.

    Only ``int64``/``float64`` columns are imputed (5 nearest neighbours).
    For every numeric column that contains at least one missing value, an
    extra boolean column named ``<column>_was_misssing`` is added. The
    spelling of that suffix is kept exactly as it was, since it is part of
    the returned column names. ``object`` columns are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed numeric columns, then the indicator columns, then the
        untouched object columns, all on the same row index as ``df``.
    """
    # Split predictors into numeric and categorical parts
    numeric_predictors = df.select_dtypes(include=["int64", "float64"])
    categorical_predictors = df.select_dtypes(include="object")

    # One mask column per numeric feature that has missing values
    indicator = MissingIndicator(features="missing-only")
    missing_mask = indicator.fit_transform(numeric_predictors)

    # Names of the numeric columns with at least one missing value,
    # in column order (the same order as the mask columns)
    has_missing = numeric_predictors.isna().any()
    miss_list = [
        f"{col}_was_misssing" for col in numeric_predictors.columns[has_missing]
    ]

    # Keep the caller's index so the column-wise concats below align row by
    # row even when df.index is not the default 0..n-1 range.
    indicator_df = pd.DataFrame(missing_mask, columns=miss_list, index=df.index)

    # KNN imputation of the numeric part (returns a bare numpy array)
    imputed_values = KNNImputer(n_neighbors=5).fit_transform(numeric_predictors)
    numeric_imputed = pd.DataFrame(
        imputed_values,
        columns=numeric_predictors.columns,
        index=df.index,
    )

    return pd.concat([numeric_imputed, indicator_df, categorical_predictors], axis=1)


# Convert categorical features into numeric encodings (dummy columns or integer labels)

def get_dummies_fun(df):
    """Encode ``df`` with ``pd.get_dummies(..., drop_first=True)`` and return the result."""
    return pd.get_dummies(df, drop_first=True)

def label_encoder_fun(df):
    """Replace every ``object`` column with integer codes from ``LabelEncoder``.

    Each object column is encoded independently (the encoder is fitted on
    that column alone). Columns of other dtypes are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; the encoding is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which object columns hold integer label codes.
    """
    # Work on a copy so the caller's frame is not changed as a side effect
    # (this also avoids assigning into a slice of another frame).
    encoded = df.copy()
    for predictor in encoded.columns:
        if encoded[predictor].dtype == object:
            encoder = preprocessing.LabelEncoder()
            encoded[predictor] = encoder.fit_transform(encoded[predictor])
    return encoded


# Scaling

def standardization(x_train, x_test):
    """Standardize predictors using statistics learned from the training set.

    A single ``StandardScaler`` is fitted on ``x_train`` and then applied to
    both frames. Fitting a second scaler on ``x_test`` would scale the two
    sets with different means/variances and make them incomparable.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training predictors; the scaler is fitted on these.
    x_test : pandas.DataFrame
        Test predictors with the same columns as ``x_train``; only transformed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_scaled, x_test_scaled)``, each keeping the columns and
        index of its input.
    """
    scaler = StandardScaler()

    # Learn mean/std on the training data and scale it
    x_train_scaled = pd.DataFrame(
        scaler.fit_transform(x_train),
        columns=x_train.columns,
        index=x_train.index,
    )

    # Reuse the training statistics for the test data (no re-fitting)
    x_test_scaled = pd.DataFrame(
        scaler.transform(x_test),
        columns=x_test.columns,
        index=x_test.index,
    )
    return x_train_scaled, x_test_scaled

def normalization(x_train, x_test):
    """Min-max scale predictors using the value ranges of the training set.

    A single ``MinMaxScaler`` is fitted on ``x_train`` and then applied to
    both frames. Fitting a second scaler on ``x_test`` would map the two
    sets with different minima/maxima and make them incomparable. Test
    values outside the training range therefore fall outside [0, 1].

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training predictors; the scaler is fitted on these.
    x_test : pandas.DataFrame
        Test predictors with the same columns as ``x_train``; only transformed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_scaled, x_test_scaled)``, each keeping the columns and
        index of its input.
    """
    scaler = MinMaxScaler()

    # Learn min/max on the training data and scale it
    x_train_scaled = pd.DataFrame(
        scaler.fit_transform(x_train),
        columns=x_train.columns,
        index=x_train.index,
    )

    # Reuse the training ranges for the test data (no re-fitting)
    x_test_scaled = pd.DataFrame(
        scaler.transform(x_test),
        columns=x_test.columns,
        index=x_test.index,
    )
    return x_train_scaled, x_test_scaled

def no_scaling_fun(x_train, x_test):
    """Pass-through scaler: return both inputs untouched as ``(x_train, x_test)``."""
    unscaled = (x_train, x_test)
    return unscaled


# Features selection

# Number of C values sampled by the randomized search in predictors_selector
random_search__n_iter = 10
def predictors_selector(x_train, y_train, random_state=None):
    """Keep only the predictors with a non-zero L1 logistic-regression coefficient.

    The regularisation strength ``C`` is chosen by a randomized search over
    ``np.linspace(0.00001, 0.1)`` (``random_search__n_iter`` candidates).
    An L1-penalised logistic regression is then fitted with the best ``C``,
    and every column whose coefficient is exactly zero is dropped.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training predictors.
    y_train : array-like
        Training response.
    random_state : int or None, default None
        Seed passed to the randomized search and to both logistic-regression
        models. ``None`` keeps the previous unseeded behaviour; pass an int
        to make the selection reproducible.

    Returns
    -------
    pandas.DataFrame
        ``x_train`` restricted to the selected columns. If every coefficient
        is zero, the frame has no columns.
    """
    search_model = LogisticRegression(
        penalty="l1", max_iter=500, solver="liblinear", random_state=random_state
    )
    model_params = {
        "C": np.linspace(0.00001, 0.1)
    }
    rand_search = RandomizedSearchCV(
        search_model,
        model_params,
        n_iter=random_search__n_iter,
        random_state=random_state,
    )
    rand_search.fit(x_train, y_train)
    best_c = rand_search.best_params_["C"]

    # Refit with the chosen C and a larger iteration budget
    log_reg_model = LogisticRegression(
        penalty="l1",
        C=best_c,
        max_iter=10000,
        solver="liblinear",
        random_state=random_state,
    )
    log_reg_model.fit(x_train, y_train)

    # coef_ is 2-D with one column per feature. A feature is kept when any
    # of its coefficients is non-zero; for a single-row coef_ this is the
    # same as testing row 0, and it also covers coef_ with several rows.
    non_zero_mask = (log_reg_model.coef_ != 0).any(axis=0)
    return x_train.loc[:, non_zero_mask]

def no_feature_selector(x_train, y_train):
    """Pass-through selector: return ``x_train`` unchanged; ``y_train`` is ignored."""
    selected = x_train
    return selected

In [46]:
# Import data
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")
# Combined train + test frame. ignore_index gives every row a unique label;
# a plain concat would repeat the labels 0..n-1 of each file.
df_combine = pd.concat([df_train, df_test], ignore_index=True)

In [47]:
# First look at data: preview the first rows of the combined (train + test) frame
df_combine.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [48]:
# Shape of the combined data as (rows, columns)
df_combine.shape

(1309, 12)

In [49]:
# Distribution of numerical data: summary statistics of the numeric columns
df_combine.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [50]:
# Distribution of object data: count, number of unique values, most frequent value and its frequency
df_combine.describe(include=object)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,1309,1309,1309,295,1307
unique,1307,2,929,186,3
top,"Kelly, Mr. James",male,CA. 2343,C23 C25 C27,S
freq,2,843,11,6,914


In [51]:
# Data type of each column
df_combine.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [52]:
# Missing data: number of missing values per column in the combined frame
df_combine.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [53]:
# Drop columns - the same four columns are removed from the train, test and combined frames:
#   Name        - has no meaning
#   Cabin       - high percentage of missing data
#   Ticket      - lot of unique string data type
#   PassengerId - has no meaning
df_train, df_test, df_combine = (
    frame.drop(["Name", "Cabin", "Ticket", "PassengerId"], axis=1)
    for frame in (df_train, df_test, df_combine)
)

In [54]:
# Missing data: number of missing values per column after dropping the unused columns
df_combine.isna().sum()

Survived    418
Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
Embarked      2
dtype: int64

In [55]:
# Imputation of missing numeric values (e.g. Age) with the KNN helper;
# object columns such as Embarked are passed through without imputation.
# NOTE(review): knn_imputer feeds every int64/float64 column to the imputer, so for
# df_train the target `Survived` takes part in the neighbour search - confirm this is intended.
# NOTE(review): train and test are imputed by two independently fitted imputers - confirm.
df_train = knn_imputer(df_train)
df_test = knn_imputer(df_test)

In [56]:
# Missing train data: number of missing values per column after KNN imputation
df_train.isna().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Embarked    2
dtype: int64

In [57]:
# Missing test data: number of missing values per column after KNN imputation
df_test.isna().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Embarked    0
dtype: int64

In [58]:
# Drop the remaining rows that still contain missing data.
# Only a few rows are affected: per the counts printed above, Embarked is
# missing in 2 train rows and the test data has no missing values left.
df_train = df_train.dropna()
df_test = df_test.dropna()

In [59]:
# Missing data: check the per-column missing counts of the train data after dropna
df_train.isna().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Embarked    0
dtype: int64

In [60]:
# Convert categorical (object) features into integer label codes
# NOTE(review): label_encoder_fun fits its encoders on the frame it is given, so train
# and test are encoded independently; the codes only agree if both frames contain the
# same set of categories in every object column - confirm.
df_train = label_encoder_fun(df_train)
df_test = label_encoder_fun(df_test)

In [61]:
# Separate the response (Survived) from the predictors
y_train = df_train["Survived"]
x_train = df_train.drop(columns="Survived")

# The test frame is used as x_test as-is
x_test = df_test

In [62]:
# Scaling: standardize the predictors with the `standardization` helper
# (every column is scaled, including the label-encoded Sex and Embarked)
x_train, x_test = standardization(x_train, x_test)

In [63]:
# Set the scoring metric (the variable name keeps its original spelling, `scorring`)
scorring = "accuracy"

In [65]:
# Features selection: keep only the predictors retained by predictors_selector
# (those with a non-zero L1 logistic-regression coefficient).
# NOTE(review): only x_train is reduced here; x_test still has every column - confirm it is
# restricted to x_train.columns before it is passed to a model fitted on x_train.
x_train = predictors_selector(x_train, y_train)