In [106]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.metrics import accuracy_score

In [115]:
from sklearn.model_selection import train_test_split
df=pd.read_csv("train.csv")
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns="Survived"), df["Survived"], test_size = 0.33)

X_train["CabinNull?"] = X_train.Cabin.isnull()
X_test["CabinNull?"] = X_test.Cabin.isnull()
X_train["Total_Fam"] = X_train.Parch + X_train.SibSp
X_test["Total_Fam"] = X_test.Parch + X_test.SibSp
X_train['Age_bin'] = pd.cut(X_train['Age'], bins=[0,18,65,np.percentile(X_train.Age,100)], labels=["Youth","Adult", "Elderly"])
X_test['Age_bin'] = pd.cut(X_test['Age'], bins=[0,18,65,np.percentile(X_test.Age,100)], labels=["Youth","Adult", "Elderly"])


#Drop the unique columns that won't help our model. Both in the test and train datasets
X_train.drop(columns = ["Name","Ticket", "PassengerId","Cabin","Age"], inplace = True)
X_test.drop(columns = ["Name","Ticket", "PassengerId","Cabin","Age"], inplace = True)

#Impute the numerical variables with mean
X_train["Fare"] = X_train["Fare"].replace(np.NaN, X_train["Fare"].mean())
X_test["Fare"] = X_test["Fare"].replace(np.NaN, X_train["Fare"].mean())

#Binning - Fare
X_train['Fare_bin'] = pd.cut(X_train['Fare'], bins=[np.percentile(X_train.Fare,0),np.percentile(X_train.Fare,25),
                                                    np.percentile(X_train.Fare,50),
                                                    np.percentile(X_train.Fare,75),
                                                    np.percentile(X_train.Fare,100)], labels=["Lowest","Low-to-Mid", "Mid", "High"])
X_test['Fare_bin'] = pd.cut(X_test['Fare'], bins=[np.percentile(X_train.Fare,0),np.percentile(X_train.Fare,25),
                                                   np.percentile(X_train.Fare,50),
                                                   np.percentile(X_train.Fare,75),
                                                   np.percentile(X_train.Fare,100)], labels=["Lowest", "Low-to-Mid", "Mid", "High"])


#Change Passenger class to a string variable instead of numerical
X_train.Pclass = X_train.Pclass.astype(str)
X_test.Pclass = X_test.Pclass.astype(str)

#Impute the Categorical variables
#X_train.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])
#X_test.Cabin = X_train.Cabin.fillna(X_train['Cabin'].value_counts().index[0])

X_train.Embarked = X_train.Embarked.fillna(X_train['Embarked'].value_counts().index[0])
X_test.Embarked = X_test.Embarked.fillna(X_train['Embarked'].value_counts().index[0])

X_train = pd.get_dummies(X_train, drop_first = True)
X_test = pd.get_dummies(X_test, drop_first = True)

# Feature Selection
## Recursive Feature Elimination
It works by recursively removing attributes and building a model on those attributes that remain. It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

In [116]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
rfe = RFECV(estimator=DecisionTreeClassifier())
rfe = rfe.fit(X_train, y_train)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

[False False  True False  True False False  True False False False False
 False False False]
[ 5  8  1  4  1 11  2  1 12  6  3 13  7  9 10]


In [117]:
for i in range(X_train.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

Column: 0, Selected False, Rank: 5.000
Column: 1, Selected False, Rank: 8.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected False, Rank: 4.000
Column: 4, Selected True, Rank: 1.000
Column: 5, Selected False, Rank: 11.000
Column: 6, Selected False, Rank: 2.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 12.000
Column: 9, Selected False, Rank: 6.000
Column: 10, Selected False, Rank: 3.000
Column: 11, Selected False, Rank: 13.000
Column: 12, Selected False, Rank: 7.000
Column: 13, Selected False, Rank: 9.000
Column: 14, Selected False, Rank: 10.000


In [118]:
X_train = pd.DataFrame(rfe.fit_transform(X_train,y_train), columns = X_train.loc[:,list(rfe.support_)].columns)

In [119]:
X_test = pd.DataFrame(rfe.transform(X_test), columns = X_test.loc[:,list(rfe.support_)].columns)

In [120]:
model = DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [121]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7932203389830509

In [None]:
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)