## Dropping Age on further models to see if it truly increases Accuracy

In [1]:
import pandas as pd
import numpy as np
import plotly.express as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [3]:
from sklearn.model_selection import train_test_split
df=pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns="Survived"), df["Survived"], test_size = 0.33, random_state=1)

X_train["CabinNull?"] = X_train.Cabin.isnull()
X_test["CabinNull?"] = X_test.Cabin.isnull()

#Drop the unique columns that won't help our model. Both in the test and train datasets
X_train.drop(columns = ["Name","Ticket", "PassengerId","Cabin","Age"], inplace = True)
X_test.drop(columns = ["Name","Ticket", "PassengerId","Cabin","Age"], inplace = True)

#Impute the numerical variables with mean
X_train["Fare"] = X_train["Fare"].replace(np.NaN, X_train["Fare"].mean())
X_test["Fare"] = X_test["Fare"].replace(np.NaN, X_train["Fare"].mean())

#Change Passenger class to a string variable instead of numerical
X_train.Pclass = X_train.Pclass.astype(str)
X_test.Pclass = X_test.Pclass.astype(str)

#Impute the Categorical variables
#X_train.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])
#X_test.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])

X_train.Embarked = X_train.Embarked.fillna(X_train['Embarked'].value_counts().index[0])
X_test.Embarked = X_test.Embarked.fillna(X_train['Embarked'].value_counts().index[0])

X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)

In [45]:
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)

In [46]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7661016949152543

This was a further investigation of Age didn't matter and it seems to hold up with some of our later models. I think that dropping Age is the way to go.

## Imputing Age through KNN Model

In [2]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
imputer = KNNImputer()

In [3]:
from sklearn.model_selection import train_test_split
df=pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns="Survived"), df["Survived"], test_size = 0.33, random_state=1)

X_train["CabinNull?"] = X_train.Cabin.isnull()
X_test["CabinNull?"] = X_test.Cabin.isnull()

#Drop the unique columns that won't help our model. Both in the test and train datasets
X_train.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)
X_test.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)

#Impute the numerical variables with mean
X_train["Fare"] = X_train["Fare"].replace(np.NaN, X_train["Fare"].mean())
X_test["Fare"] = X_test["Fare"].replace(np.NaN, X_train["Fare"].mean())



#Change Passenger class to a string variable instead of numerical
X_train.Pclass = X_train.Pclass.astype(str)
X_test.Pclass = X_test.Pclass.astype(str)

#Impute the Categorical variables
#X_train.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])
#X_test.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])

X_train.Embarked = X_train.Embarked.fillna(X_train['Embarked'].value_counts().index[0])
X_test.Embarked = X_test.Embarked.fillna(X_train['Embarked'].value_counts().index[0])

X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)

In [4]:
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

In [5]:
X_train = pd.DataFrame(imputer.fit_transform(X_train),columns = X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

In [6]:
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)

In [7]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7694915254237288

## Stratified K-fold Cross Validation

In [12]:
from sklearn.model_selection import cross_val_score,StratifiedKFold
stratifiedkf=StratifiedKFold(n_splits=5)
score=cross_val_score(model,X_train,y_train,cv=stratifiedkf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores are [0.76666667 0.69747899 0.81512605 0.76470588 0.78991597]
Average Cross Validation score :0.7667787114845938


Pros: Runs faster. It is stratified. Won't be a perfect cross-validation check, but does the trick so that you aren't just testing against one subset of the data.

Cons: You are only doing K sections. So with k/100% of your data not being used, you might be losing some of your valuable data within each split that will help train your model.

## Leave P out cross-validation

In [13]:
from sklearn.model_selection import LeavePOut,cross_val_score
lpo=LeavePOut(p=2)
lpo.get_n_splits(X_train)
score=cross_val_score(model,X_train,y_train,cv=lpo)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores are [0.5 1.  1.  ... 0.5 1.  0.5]
Average Cross Validation score :0.7842930460774914


Pros: All the data samples get used as both training and validation samples

Cons: High computation time

## Leave One Out Cross-Validation


Leave one out cross-validation is a special case of Leave P out cross-validation where P=1

In [14]:
from sklearn.model_selection import LeaveOneOut,cross_val_score
loo=LeaveOneOut()
score=cross_val_score(model,X_train,y_train,cv=loo)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores are [1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0.
 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1.
 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.
 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 1. 1. 1

## Monte Carlo Cross-Validation (Shuffle Split Cross Validation)

The datasets get randomly partitioned into training and validation sets. I think that I like this method the most.

Cons: There is the possibility that you get samples that are not selected for either training or validation sets.

In [8]:
from sklearn.model_selection import ShuffleSplit,cross_val_score
shuffle_split=ShuffleSplit(test_size=0.3,n_splits=10000)
scores=cross_val_score(model,X_train,y_train,cv=shuffle_split)
print("cross Validation scores:n {}".format(scores))
print("Average Cross Validation score :{}".format(scores.mean()))

cross Validation scores:n [0.77094972 0.75977654 0.81005587 ... 0.77653631 0.81005587 0.78212291]
Average Cross Validation score :0.7749357541899441


## Feature Engineering

In [2]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
imputer = KNNImputer()

In [9]:
from sklearn.model_selection import train_test_split
df=pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns="Survived"), df["Survived"], test_size = 0.33, random_state=1)

X_train["CabinNull?"] = X_train.Cabin.isnull()
X_test["CabinNull?"] = X_test.Cabin.isnull()

#Drop the unique columns that won't help our model. Both in the test and train datasets
X_train.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)
X_test.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)

#Impute the numerical variables with mean
X_train["Fare"] = X_train["Fare"].replace(np.NaN, X_train["Fare"].mean())
X_test["Fare"] = X_test["Fare"].replace(np.NaN, X_train["Fare"].mean())



#Change Passenger class to a string variable instead of numerical
X_train.Pclass = X_train.Pclass.astype(str)
X_test.Pclass = X_test.Pclass.astype(str)

#Impute the Categorical variables
#X_train.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])
#X_test.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])

X_train.Embarked = X_train.Embarked.fillna(X_train['Embarked'].value_counts().index[0])
X_test.Embarked = X_test.Embarked.fillna(X_train['Embarked'].value_counts().index[0])

X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)

In [11]:
X_train.Fare.describe()

count    596.000000
mean      31.276104
std       46.497266
min        0.000000
25%        7.895800
50%       14.054150
75%       30.017700
max      512.329200
Name: Fare, dtype: float64

In [4]:
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

In [5]:
X_train = pd.DataFrame(imputer.fit_transform(X_train),columns = X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

In [6]:
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)

In [7]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7694915254237288

## Using GridSearch and Pipelines to hypertune our models (I think I'm jumping the gun a bit here.

In [19]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

splitter = ['best','random']
max_features = ["auto", "sqrt", "log2"]

In [20]:
dt_pipe = Pipeline(
    [('dt', DecisionTreeClassifier())])
param_grid = [{'dt__max_features':max_features,
              'dt__max_depth':max_depth,
              'dt__min_samples_split':min_samples_split,
              'dt__min_samples_leaf':min_samples_leaf,
              'dt__splitter':splitter}]
gs = GridSearchCV(dt_pipe, param_grid)
gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('dt', DecisionTreeClassifier())]),
             param_grid=[{'dt__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                            100, 110, None],
                          'dt__max_features': ['auto', 'sqrt', 'log2'],
                          'dt__min_samples_leaf': [1, 2, 4],
                          'dt__min_samples_split': [2, 5, 10],
                          'dt__splitter': ['best', 'random']}])

In [21]:
gs.best_params_

{'dt__max_depth': 40,
 'dt__max_features': 'sqrt',
 'dt__min_samples_leaf': 2,
 'dt__min_samples_split': 10,
 'dt__splitter': 'random'}

In [25]:
dt = DecisionTreeClassifier(max_depth = 40, max_features = 'sqrt', min_samples_leaf = 2, min_samples_split = 10, splitter = 'random')
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)

0.735593220338983

In [27]:
from sklearn.model_selection import ShuffleSplit,cross_val_score
shuffle_split=ShuffleSplit(test_size=0.3,n_splits=100)
scores=cross_val_score(dt,X_train,y_train,cv=shuffle_split)
print("cross Validation scores:n {}".format(scores))
print("Average Cross Validation score :{}".format(scores.mean()))

cross Validation scores:n [0.80446927 0.81564246 0.7877095  0.79888268 0.84357542 0.81564246
 0.83240223 0.79329609 0.81564246 0.7877095  0.73184358 0.82681564
 0.79329609 0.81564246 0.76536313 0.72067039 0.82681564 0.7877095
 0.80446927 0.77094972 0.81564246 0.87150838 0.83798883 0.81005587
 0.79329609 0.82681564 0.7877095  0.77094972 0.82122905 0.83240223
 0.82122905 0.77094972 0.8603352  0.82681564 0.8547486  0.77094972
 0.79888268 0.83240223 0.79888268 0.79888268 0.8603352  0.80446927
 0.77094972 0.83798883 0.8547486  0.82122905 0.82122905 0.82122905
 0.73743017 0.74301676 0.83798883 0.84357542 0.82122905 0.83798883
 0.75977654 0.79888268 0.84916201 0.77653631 0.81564246 0.83798883
 0.81005587 0.74301676 0.82681564 0.84916201 0.84357542 0.84357542
 0.84357542 0.83240223 0.77653631 0.82681564 0.80446927 0.79329609
 0.79888268 0.78212291 0.83240223 0.78212291 0.81005587 0.83798883
 0.81564246 0.8547486  0.8547486  0.7877095  0.8547486  0.77653631
 0.79329609 0.82122905 0.70391061 0.7

In [12]:
X_train

Unnamed: 0,Age,SibSp,Parch,Fare,CabinNull?,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
496,54.0,1,0,78.2667,False,0,0,0,0,0
14,14.0,0,0,7.8542,True,0,1,0,0,1
82,,0,0,7.7875,True,0,1,0,1,0
657,32.0,1,1,15.5000,True,0,1,0,1,0
388,,0,0,7.7292,True,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...
715,19.0,0,0,7.6500,False,0,1,1,0,1
767,30.5,0,0,7.7500,True,0,1,0,1,0
72,21.0,0,0,73.5000,True,1,0,1,0,1
235,,0,0,7.5500,True,0,1,0,0,1
