QUESTION- We will be using a decision tree to make predictions about the Titanic data set from
Kaggle. This data set provides information on the Titanic passengers and can be used to
predict whether a passenger survived or not.

In [66]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [67]:
url="https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"

titanic = pd.read_csv(url)

In [68]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [93]:
titanic.shape

(891, 7)

In [69]:
# filling all the NAN values
titanic.fillna(0,inplace=True)

In [70]:
#Cross checking if there are no NAN in any table
titanic.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [71]:
# Choosing only the columns that are directed as per the question
titanic=titanic[['Survived','Pclass','Sex','Age','SibSp','Parch','Fare']]

In [72]:
titanic.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [73]:
#Since decision tree will not run on any column like Sex that contains string value so we map those values to 0 and 1
s = {'male': 1,'female':0}
titanic[['Sex']]= titanic['Sex'].map(s)
titanic.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [74]:
# Separating the features(X) and the target(y) column
X=titanic[['Pclass','Sex','Age','SibSp','Parch','Fare']].values
y=titanic[['Survived']].values

In [79]:
# Setting up the train test split on the data with 30% test data (out of 891 rows 268 will be test data)
X_Train,X_Test,Y_Train,Y_Test = train_test_split(X,y,test_size = 0.3,random_state = 3)

In [80]:
# Fitting the Decision Tree algorithm to the data
from sklearn.metrics import accuracy_score,recall_score,precision_score
m1 = DecisionTreeClassifier()
m1.fit(X_Train,Y_Train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [107]:
# Checking the accuracy on the training set (gives 74.1% accuracy)
m1.score(X_Train,Y_Train)

0.7415730337078652

In [87]:
#Checking for the most influential feature ( shows out to be Pclass)
print(m1.feature_importances_)
print(titanic.columns)

[0.08327428 0.29329827 0.22297749 0.03169375 0.02958368 0.33917253]
Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


In [88]:
# Checking what percentage of the passengers survived
y.mean()

0.3838383838383838

In [89]:
y_pred=m1.predict(X_Test)

In [90]:
y_pred

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0], dtype=int64)

In [92]:
# Following shows that 38.1% survived
pd.value_counts(y_pred)

0    166
1    102
dtype: int64

In [101]:
pd.crosstab(np.ravel(Y_Test),y_pred)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,164,0
1,2,102


In [106]:
print('Train Accuracy is '+str(100*m1.score(X_Train,Y_Train)))

Train Accuracy is 74.15730337078652


In [102]:
print('Test Accuracy is '+str(100*((164+102)/len(y_pred))))

Test Accuracy is 99.25373134328358


Shows underfitting