In [59]:
import pandas as pd
import numpy as np
import plotly.express as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [60]:
gender=pd.read_csv("gender_submission.csv")
test=pd.read_csv("test.csv")
train=pd.read_csv("train.csv")

# Logistic Imputation of age

The idea here is that since the magnitude of age increases with time (difference between 5-10 yr old is more significant than 65-70 yr old), a logistic regression might be more accurate.

In [104]:

from sklearn.model_selection import train_test_split
df=pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns="Survived"), df["Survived"], test_size = 0.33, random_state=1)

#Drop the unique columns that won't help our model. Both in the test and train datasets
X_train.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)
X_test.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)

#Impute the numerical variables with mean
X_train["Fare"] = X_train["Fare"].replace(np.NaN, X_train["Fare"].mean())
X_test["Fare"] = X_test["Fare"].replace(np.NaN, X_train["Fare"].mean())

#Change Passenger class to a string variable instead of numerical
X_train.Pclass = X_train.Pclass.astype(str)
X_test.Pclass = X_test.Pclass.astype(str)

#adding log of Age
X_train["Age"] = X_train["Age"].apply(np.log)
X_test["Age"] = X_test["Age"].apply(np.log)

#Binning - Fare
X_train['Fare_bin'] = pd.cut(X_train['Fare'], bins=[np.percentile(X_train.Fare,0),np.percentile(X_train.Fare,33.3),np.percentile(X_train.Fare,66.6),
                              np.percentile(X_train.Fare,100)], labels=["Low", "Mid", "High"])
X_test['Fare_bin'] = pd.cut(X_train['Fare'], bins=[np.percentile(X_train.Fare,0),np.percentile(X_train.Fare,33.3),np.percentile(X_train.Fare,66.6),
                              np.percentile(X_train.Fare,100)], labels=["Low", "Mid", "High"])

#Impute the Categorical variables
#X_train.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])
#X_test.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])

X_train.Embarked = X_train.Embarked.fillna(X_train['Embarked'].value_counts().index[0])
X_test.Embarked = X_test.Embarked.fillna(X_train['Embarked'].value_counts().index[0])

X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)

In [108]:
X_train.Age.describe()

count    596.000000
mean       3.219256
std        0.743836
min       -0.400478
25%        3.044522
50%        3.357861
75%        3.658407
max        4.382027
Name: Age, dtype: float64

In [106]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
imp = IterativeImputer(estimator=rf,missing_values=np.nan, max_iter=10, verbose=2, imputation_order='roman',random_state=1)
X_train=pd.DataFrame(imp.fit_transform(X_train),columns = X_train.columns)
X_test=pd.DataFrame(imp.transform(X_test),columns = X_test.columns)

[IterativeImputer] Completing matrix with shape (596, 11)
[IterativeImputer] Ending imputation round 1/10, elapsed time 1.41
[IterativeImputer] Change: 2.151003515095147, scaled tolerance: 0.5123292 
[IterativeImputer] Ending imputation round 2/10, elapsed time 2.82
[IterativeImputer] Change: 0.18878348870863215, scaled tolerance: 0.5123292 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (295, 11)
[IterativeImputer] Ending imputation round 1/2, elapsed time 0.01
[IterativeImputer] Ending imputation round 2/2, elapsed time 0.02


In [107]:
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func = chi2, k = 10)
fit = bestfeatures.fit(X_train, y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis = 1)
featureScores.columns = ['Specs','Score']
featureScores

ValueError: Input X must be non-negative.

In [95]:
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)

In [96]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7186440677966102

Monte Carlo Validation

In [97]:
from sklearn.model_selection import ShuffleSplit,cross_val_score
shuffle_split=ShuffleSplit(test_size=0.3,n_splits=1000)
scores=cross_val_score(model,X_train,y_train,cv=shuffle_split)
print("cross Validation scores:n {}".format(scores))
print("Average Cross Validation score :{}".format(scores.mean()))

cross Validation scores:n [0.81564246 0.78212291 0.83240223 0.83798883 0.75418994 0.80446927
 0.78212291 0.80446927 0.77094972 0.79888268 0.77653631 0.7877095
 0.81005587 0.77094972 0.79329609 0.77094972 0.77653631 0.73743017
 0.75977654 0.78212291 0.77094972 0.77653631 0.7877095  0.81564246
 0.77653631 0.77653631 0.75418994 0.77094972 0.76536313 0.82122905
 0.72625698 0.74860335 0.79888268 0.77653631 0.82681564 0.75977654
 0.7877095  0.77094972 0.74301676 0.76536313 0.78212291 0.77653631
 0.78212291 0.80446927 0.79329609 0.79329609 0.81005587 0.75977654
 0.7877095  0.79329609 0.82681564 0.73743017 0.78212291 0.74860335
 0.79888268 0.81005587 0.78212291 0.79329609 0.75977654 0.82122905
 0.77653631 0.77094972 0.78212291 0.76536313 0.76536313 0.74860335
 0.81005587 0.77094972 0.77653631 0.7877095  0.76536313 0.77094972
 0.74860335 0.75977654 0.7877095  0.81564246 0.79888268 0.82681564
 0.81005587 0.74860335 0.75977654 0.7877095  0.75977654 0.75977654
 0.76536313 0.7877095  0.81005587 0.8

In [110]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
imputer = KNNImputer()

In [124]:
from sklearn.model_selection import train_test_split
df=pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns="Survived"), df["Survived"], test_size = 0.33, random_state=1)

X_train["CabinNull?"] = X_train.Cabin.isnull()
X_test["CabinNull?"] = X_test.Cabin.isnull()

#Drop the unique columns that won't help our model. Both in the test and train datasets
X_train.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)
X_test.drop(columns = ["Name","Ticket", "PassengerId","Cabin"], inplace = True)

#Impute the numerical variables with mean
X_train["Fare"] = X_train["Fare"].replace(np.NaN, X_train["Fare"].mean())
X_test["Fare"] = X_test["Fare"].replace(np.NaN, X_train["Fare"].mean())

#adding log of Age
X_train["Age"] = X_train["Age"].apply(np.log)
X_test["Age"] = X_test["Age"].apply(np.log)

#Binning - Fare
X_train['Fare_bin'] = pd.cut(X_train['Fare'], bins=[np.percentile(X_train.Fare,0),np.percentile(X_train.Fare,33.3),np.percentile(X_train.Fare,66.6),
                              np.percentile(X_train.Fare,100)], labels=["Low", "Mid", "High"])
X_test['Fare_bin'] = pd.cut(X_train['Fare'], bins=[np.percentile(X_train.Fare,0),np.percentile(X_train.Fare,33.3),np.percentile(X_train.Fare,66.6),
                              np.percentile(X_train.Fare,100)], labels=["Low", "Mid", "High"])

#total family members
X_train['total_fam'] = X_train['SibSp'] + X_train['Parch']
X_test['total_fam'] = X_test['SibSp'] + X_test['Parch']

#Change Passenger class to a string variable instead of numerical
X_train.Pclass = X_train.Pclass.astype(str)
X_test.Pclass = X_test.Pclass.astype(str)

#Impute the Categorical variables
#X_train.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])
#X_test.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])

X_train.Embarked = X_train.Embarked.fillna(X_train['Embarked'].value_counts().index[0])
X_test.Embarked = X_test.Embarked.fillna(X_train['Embarked'].value_counts().index[0])

X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)

In [125]:
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

In [126]:
X_train = pd.DataFrame(imputer.fit_transform(X_train),columns = X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

In [127]:
from sklearn.feature_selection import SelectKBest, chi2
bestfeatures = SelectKBest(score_func = chi2, k = 10)
fit = bestfeatures.fit(X_train, y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis = 1)
featureScores.columns = ['Specs','Score']
featureScores

Unnamed: 0,Specs,Score
0,Age,0.428833
1,SibSp,0.078864
2,Parch,3.147825
3,Fare,5.68632
4,CabinNull?,13.376564
5,total_fam,0.402681
6,Pclass_2,6.653592
7,Pclass_3,30.249451
8,Sex_male,67.45491
9,Embarked_Q,0.042507


In [70]:
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)

In [71]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7966101694915254

In [72]:
from sklearn.model_selection import ShuffleSplit,cross_val_score
shuffle_split=ShuffleSplit(test_size=0.3,n_splits=10000)
scores=cross_val_score(model,X_train,y_train,cv=shuffle_split)
print("cross Validation scores:n {}".format(scores))
print("Average Cross Validation score :{}".format(scores.mean()))

cross Validation scores:n [0.7877095  0.75977654 0.73184358 ... 0.77094972 0.69832402 0.77653631]
Average Cross Validation score :0.7723486033519552
