In [None]:
import pandas as pd

In [None]:
#Load the passenger data; the path is relative to the notebook's working directory
data = pd.read_csv('titanic.csv')
data

In [None]:
data.head(10) #Preview the first 10 rows

# Implementing Decision Tree Model

* Data Wrangling/Data Preparation
* Training and testing
* Evaluating Model : Checking Model Prediction Accuracy

In [None]:
data.dtypes #Column dtypes, to see which columns are numeric and which are text

In [None]:
#Count the missing values per column; the gaps are imputed in later cells
data.isnull().sum()

In [None]:
data['Survived'].value_counts() #Class balance of the target. The data is moderately imbalanced: class 0 (died) is the majority, as the per-class counts used later in this notebook show.

In [None]:
#Impute missing ages with the mean age. The result is assigned back to the column:
#fillna(inplace=True) on `data.Age` acts on an intermediate Series, which pandas
#deprecates (chained assignment) and which leaves `data` unchanged under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [None]:
data.shape #(rows, columns) of the frame

In [None]:
data.groupby('Cabin').count() #Non-null counts per distinct Cabin value (rows with a missing Cabin are left out by groupby)

In [None]:
data['Cabin_Group'] = data['Cabin'].str[0] #First character of the cabin string (presumably the deck letter); a missing Cabin stays NaN

In [None]:
data.groupby(['Cabin_Group', 'Survived']).count() #Non-null counts per (Cabin_Group, Survived) pair

In [None]:
#Survival breakdown for passengers without a recorded cabin.
#At this point the missing cabin groups have not yet been replaced by 'NP', so comparing
#against 'NP' alone matches no rows; treat NaN and 'NP' alike so the cell works both
#before and after the imputation cell has run.
no_cabin = data['Cabin_Group'].isna() | (data['Cabin_Group'] == 'NP')
data[no_cabin].groupby('Survived').count()

In [None]:
data.groupby('Cabin_Group').count() #Non-null counts per cabin group (rows with a missing group are left out by groupby)

In [None]:
#Imputing missing values: 'NP' marks passengers with no recorded cabin.
#Assign back to the column instead of fillna(inplace=True) on `data.Cabin_Group`, which
#acts on an intermediate Series (deprecated; no effect on `data` under Copy-on-Write).
data['Cabin_Group'] = data['Cabin_Group'].fillna('NP')

In [None]:
#Imputing missing values: fill the missing embarkation ports with 'S'.
#Assign back to the column instead of fillna(inplace=True) on `data.Embarked`, which
#acts on an intermediate Series (deprecated; no effect on `data` under Copy-on-Write).
data['Embarked'] = data['Embarked'].fillna('S')

In [None]:
data.columns #Column names, now including the derived Cabin_Group

In [None]:
data.groupby('Pclass').count() #Non-null counts per passenger class

In [None]:
data.groupby('Sex').count() #Non-null counts per sex

In [None]:
data.groupby('Survived').count() #Non-null counts per target class

In [None]:
data.groupby('SibSp').count() #Non-null counts per SibSp value

In [None]:
data.groupby('Parch').count() #Non-null counts per Parch value

In [None]:
data.groupby('Cabin_Group').count() #Non-null counts per cabin group, now including the imputed 'NP' group

In [None]:
data.groupby('Embarked').count() #Non-null counts per embarkation port

In [None]:
#Remove the columns that are not used for the survival prediction (in place)
data.drop(
    labels = ['PassengerId', 'Name', 'Ticket', 'Cabin'],
    axis = 'columns',
    inplace = True,
)

In [None]:
#Segment the fare into five labelled groups/bins.
bins = [0,100,200,300,400,513]
labels = ['Low', 'Medium', 'High', 'Expensive', 'Exorbitant']
#pd.cut intervals are (left, right] by default, so a fare of exactly 0 would fall outside
#the first bin and become NaN; include_lowest=True closes the first interval on the left.
#NOTE: this overwrites the numeric Fare column, so the cell cannot be re-run on its own output.
data['Fare'] = pd.cut(data['Fare'], bins = bins, labels = labels, include_lowest = True)

In [None]:
data['Age'] = data['Age'].round() #rounding the age to 0 decimals, i.e. to a whole number (the values stay float)

Training and Testing

In [None]:
from sklearn.preprocessing import OneHotEncoder
#One hot encoding of Sex and Embarked.
#The encoder is left on its default sparse output and densified explicitly: the `sparse=`
#keyword was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so neither
#spelling works on every version, whereas .toarray() does.
ohe = OneHotEncoder()
encoded = ohe.fit_transform(data[['Sex', 'Embarked']]).toarray()
#Output columns follow ohe.categories_ (sorted within each feature); the names below assume
#the categories are female/male and C/Q/S with no missing values left.
data[['Female', 'Male', 'C', 'Q', 'S']] = encoded
data




In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#Label encoding. LabelEncoder expects a 1-D input, so pass the column as a Series
#(data['Fare']) rather than a one-column frame (data[['Fare']]), which only works
#through an implicit reshape and raises a DataConversionWarning.
#Codes follow the sorted class names, so the Fare codes are alphabetical, not ordered by price.
data['Fare'] = le.fit_transform(data['Fare'])
data['Cabin_Group'] = le.fit_transform(data['Cabin_Group']) #`le` now holds the Cabin_Group classes
data.drop(columns = ['Sex', 'Embarked'], inplace = True) #replaced by the one-hot columns
print('Made a Data Analytical Model, and it is ready for training and testing.')
#Cast every column (including the target) to str. The later cells rely on string labels:
#they convert y_test and the predictions between str and int32 explicitly.
data = data.astype('str')

In [None]:
#Features are every column after the first; the target is the first column.
#Assumes 'Survived' is column 0 at this point -- TODO confirm against the CSV column order.
x = data.iloc[:, 1:]
y = data.iloc[:, 0]

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0) #hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
y_test = y_test.astype('int32') #every column was cast to str earlier, so convert the test labels back to integers; y_train stays str, so the fitted model predicts str labels

In [None]:
from sklearn.tree import DecisionTreeClassifier
#Decision tree with default hyperparameters. random_state is fixed because the tree permutes
#the features at every split, so equally good splits can otherwise be chosen differently
#from run to run and change the reported numbers.
dtc = DecisionTreeClassifier(random_state = 0)
dtc.fit(x_train, y_train) #fit Decision Tree Model
pred_test = dtc.predict(x_test)
pred_train = dtc.predict(x_train)
pred_test = pred_test.astype('int32') #predictions are str (y_train is str); match y_test's int32 dtype

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
target_names = ['Died : 0', 'Survived : 1']
#classification_report takes (y_true, y_pred); passing the predictions first swaps
#precision and recall in the printed table.
print(classification_report(y_test, pred_test, target_names = target_names)) #Prediction quality on test data

In [None]:
#classification_report takes (y_true, y_pred); passing the predictions first swaps
#precision and recall in the printed table.
print(classification_report(y_train, pred_train, target_names = target_names)) #Prediction quality on train data

In [None]:
#Overall accuracy on the train data (the score is the same whichever argument comes first)
print(accuracy_score(y_pred = y_train, y_true = pred_train))

In [None]:
#Overall accuracy on the test data (the score is the same whichever argument comes first)
print(accuracy_score(y_pred = y_test, y_true = pred_test))

In [None]:
#Correctly classified and misclassified counts for the train data:
#rows are the predicted labels, columns the actual labels
pd.crosstab(index = pred_train, columns = y_train)

In [None]:
#Per-class accuracy (recall) on the train data: the share of actual class-0 (died) and
#class-1 (survived) passengers that the model labels correctly. It is computed from the
#labels rather than from counts typed in by hand from an earlier run's crosstab, so the
#printed figures always match the current model.
from sklearn.metrics import recall_score
#Compare as str so the calculation works whichever dtype the labels currently have.
died_recall, survived_recall = recall_score(
    pd.Series(y_train).astype(str),
    pd.Series(pred_train).astype(str),
    labels = ['0', '1'],
    average = None,
)
print(f"Model's accuracy on train data when predicting person died or 0 : {died_recall}") #share of people who died (0) predicted correctly
print(f"Model's accuracy on train data when predicting person survived or 1 :{survived_recall}") #share of people who survived (1) predicted correctly
#State the inference from the computed values instead of assuming which class does better.
if died_recall >= survived_recall:
    better, worse = 'died i.e. 0', 'survived i.e. 1'
else:
    better, worse = 'survived i.e. 1', 'died i.e. 0'
print(f'Inference : Model is more accurate at predicting persons who {better} than persons who {worse}')

In [None]:
#Correctly classified and misclassified counts for the test data:
#rows are the predicted labels, columns the actual labels
pd.crosstab(index = pred_test, columns = y_test)

In [None]:
#Per-class accuracy (recall) on the test data: the share of actual class-0 (died) and
#class-1 (survived) passengers that the model labels correctly. It is computed from the
#labels rather than from counts typed in by hand from an earlier run's crosstab, so the
#printed figures always match the current model.
from sklearn.metrics import recall_score
#Compare as str so the calculation works whichever dtype the labels currently have.
died_recall, survived_recall = recall_score(
    pd.Series(y_test).astype(str),
    pd.Series(pred_test).astype(str),
    labels = ['0', '1'],
    average = None,
)
print(f"Model's accuracy on test data when predicting person died or 0 : {died_recall}") #share of people who died (0) predicted correctly
print(f"Model's accuracy on test data when predicting person survived or 1 :{survived_recall}") #share of people who survived (1) predicted correctly
#State the inference from the computed values instead of assuming which class does better.
if died_recall >= survived_recall:
    better, worse = 'died i.e. 0', 'survived i.e. 1'
else:
    better, worse = 'survived i.e. 1', 'died i.e. 0'
print(f'Inference : Model is more accurate at predicting persons who {better} than persons who {worse}')

In [None]:
#confusion_matrix takes (y_true, y_pred): rows are the actual classes, columns the predicted
#classes. Passing the predictions first transposes the matrix.
#With survived (1) as the positive class, cell [0][1] holds the false positives (Type I errors)
#and cell [1][0] the false negatives (Type II errors); the diagonal holds the correct predictions.
confusion_matrix(y_train, pred_train)

In [None]:
#confusion_matrix takes (y_true, y_pred): rows are the actual classes, columns the predicted
#classes. Passing the predictions first transposes the matrix.
#With survived (1) as the positive class, cell [0][1] holds the false positives (Type I errors)
#and cell [1][0] the false negatives (Type II errors); the diagonal holds the correct predictions.
confusion_matrix(y_test, pred_test)

In [None]:
dtc.get_depth() #Depth of the fitted decision tree

In [None]:
dtc.get_params() #All hyperparameters the decision tree was constructed with

In [None]:
dtc.predict_proba(x_test) #Returns, for every test observation, the class probabilities [P(class 0 = died), P(class 1 = survived)] instead of a hard 0/1 label. A heterogeneous (mixed) leaf node gives fractional probabilities; a pure leaf gives exactly 0 or 1.
#If we increase the depth of the tree, more leaf nodes are created and most of the values become exactly 0 or 1.
#Each row sums to 1: e.g. [0.3, 0.7] means the probability of class 0 is 0.3 and of class 1 is 0.7.
#[0.8767507 , 0.1232493 ] (1st test observation in an earlier run): the probability of class 0 is about 0.88 and of class 1 about 0.12.
#[0.92857143, 0.07142857] (3rd test observation in an earlier run): the probability of class 0 is about 0.93 and of class 1 about 0.07.
#This notebook quotes 0.63 as the threshold read from its ROC curve: an observation is labelled class 1 when its class-1 probability
#(the 2nd element of the row) is above 0.63, and class 0 otherwise.
#For the 3rd observation above, the class-1 probability 0.07 is well below 0.63, so it is labelled 0.
#Usually the threshold is 0.5; when the dataset is imbalanced a different threshold can work better, and the ROC curve is one way to choose it.

In [None]:
dtc.feature_importances_ #Impurity-based importance of each feature, in the column order of x_train; the feature with the highest value contributed the largest total impurity reduction

In [None]:
import matplotlib.pyplot as plt #graphical representation of decision tree
from sklearn import tree
plt.figure(figsize = (10,15))
#class_names must list one display name per class, in ascending class order (0 = died,
#1 = survived). Passing the whole data['Survived'] column only worked by coincidence of
#its first two values. feature_names is converted to a plain list because some
#scikit-learn versions reject a pandas Index here.
tree.plot_tree(
    dtc,
    feature_names = list(x_train.columns),
    class_names = ['Died', 'Survived'],
    filled = True,
)
plt.show()

In [None]:
from sklearn.tree import export_text
#Text dump of the tree rules. Passing the column names labels each split with the real
#feature name instead of the generic feature_0, feature_1, ...
print(export_text(dtc, feature_names = list(x_train.columns)))

In [None]:
#Predicting new records coming from outside of train and test.
#The records are built as a DataFrame with the training column names: the model was fitted
#on a DataFrame, so bare lists trigger scikit-learn's "X does not have valid feature names"
#warning and hide which value belongs to which feature.
#NOTE(review): the first value (presumably Pclass) is 4 and the fifth (presumably the Fare
#code) is 5 -- confirm that these are valid encoded values.
new_records = pd.DataFrame(
    [
        [4, 12.0, 1, 0, 5, 7, 0, 1, 0, 0, 1], #Gender is Male
        [4, 12.0, 1, 0, 5, 7, 1, 0, 0, 0, 1], #Gender is Female
    ],
    columns = x_train.columns,
)
for row in range(len(new_records)):
    new_record = new_records.iloc[[row]] #double brackets keep a one-row DataFrame (2-D input)
    prediction = dtc.predict(new_record)
    print('Prediction for new record :', prediction)

In [None]:
#Finding threshold using ROC Curve and AUC
from sklearn.metrics import roc_curve, auc
#A ROC curve needs continuous scores. Hard 0/1 predictions give only one operating point,
#so no threshold can be read from them; use the predicted probability of class 1 instead.
survived_proba = dtc.predict_proba(x_test)[:, 1] #column 1 corresponds to dtc.classes_[1], i.e. label 1
#Cast the labels to int so the positive class is 1 whichever dtype y_test currently has.
fpr, tpr, threshold = roc_curve(y_test.astype('int32'), survived_proba)
roc_auc = auc(fpr, tpr)
#Youden's J statistic (tpr - fpr) picks the threshold furthest above the chance diagonal.
best_idx = (tpr - fpr).argmax()
print('AUC :', roc_auc)
print('Threshold maximising tpr - fpr :', threshold[best_idx])
plt.plot(fpr, tpr, label = f'ROC (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], linestyle = '--', label = 'Chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()


In [None]:
x #Display the feature matrix used for training and testing

# Tuning the model's hyperparameters manually and with GridSearchCV

In [None]:
#Tuning the decision tree's hyperparameters manually, then checking the classification report and other performance metrics

In [None]:
#Manually tuned tree: entropy criterion, at most 3 levels deep, at least 3 samples per leaf
dtcmanual = DecisionTreeClassifier(
    criterion = 'entropy',
    max_depth = 3,
    min_samples_leaf = 3,
).fit(x_train, y_train)
#Predict the test split first, then the train split
pred_test1, pred_train2 = (dtcmanual.predict(split) for split in (x_test, x_train))

In [None]:
y_test = y_test.astype('str') #the model was fitted on str labels, so cast the test labels back to str to match its predictions
print(classification_report(y_test, pred_test1, target_names = target_names)) 
#The figures below were noted from an earlier run and may differ when the notebook is re-executed:
#Recall: 95% of the people who died (class 0) are classified correctly,
#whereas only 65% of the people who survived (class 1) are.
#Precision: 81% of the "died" predictions are correct,
#whereas 90% of the "survived" predictions are correct.

In [None]:
print(classification_report(y_train, pred_train2, target_names = target_names))
#The figures below were noted from an earlier run and may differ when the notebook is re-executed:
#Recall: 95% of the people who died (class 0) are classified correctly,
#whereas only 64% of the people who survived (class 1) are.
#Precision: 81% of the "died" predictions are correct,
#whereas 89% of the "survived" predictions are correct.

In [None]:
#confusion_matrix takes (y_true, y_pred): rows are the actual classes, columns the predicted
#classes. Passing the predictions first transposes the matrix.
confusion_matrix(y_test, pred_test1)

In [None]:
#confusion_matrix takes (y_true, y_pred): rows are the actual classes, columns the predicted
#classes. Passing the predictions first transposes the matrix.
confusion_matrix(y_train, pred_train2)

In [None]:
#Accuracy of the manually tuned tree on the test data
accuracy_score(y_pred = y_test, y_true = pred_test1)

In [None]:
#Accuracy of the manually tuned tree on the train data
accuracy_score(y_pred = y_train, y_true = pred_train2)

Tuning the model's hyperparameters using GridSearchCV

In [None]:
# import time
# start_time = time.time()
# from sklearn.model_selection import GridSearchCV
# dtcwithgridsearchcv = DecisionTreeClassifier()
# params = {'criterion' : ['gini', 'entropy', 'log_loss'], 'max_depth' : list(range(2,50)), 'min_samples_leaf' : list(range(3,10)), 'min_samples_split' : list(range(2,20))}
# gridsearchcvobject = GridSearchCV(dtcwithgridsearchcv, params, cv = 5, scoring = 'accuracy')
# gridsearchcvobject.fit(x_train, y_train)
# gridsearchcvobject.predict(x_test)
# end_time = time.time()
# run_time = end_time - start_time
# run_time


In [None]:
#-----------------------------------------------------

In [None]:
#-----------------------------------------------------

In [None]:
#-----------------------------------------------------

# Random Forest with default parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier
#Random forest with default hyperparameters. The forest is built from random bootstrap
#samples and random feature subsets, so fix random_state to make the reported metrics
#reproducible from run to run (same seed as the train/test split).
rf = RandomForestClassifier(random_state = 0)
rf.fit(x_train, y_train)
pred_xtrain = rf.predict(x_train)
pred_xtest = rf.predict(x_test)

In [None]:
#classification_report takes (y_true, y_pred); passing the predictions first swaps
#precision and recall. Both reports now use the same, correctly spelled class names.
rf_target_names = ['Not Survived', 'Survived']
print(classification_report(y_train, pred_xtrain, target_names = rf_target_names)) #train data
print(classification_report(y_test, pred_xtest, target_names = rf_target_names)) #test data

In [None]:
#Crosstab for the test data: rows are the actual labels, columns the predicted labels
pd.crosstab(index = y_test, columns = pred_xtest)

In [None]:
#Crosstab for the train data: rows are the actual labels, columns the predicted labels
pd.crosstab(index = y_train, columns = pred_xtrain)

In [None]:
#Random forest accuracy on the train data
accuracy_score(y_true = y_train, y_pred = pred_xtrain)

In [None]:
#Random forest accuracy on the test data
accuracy_score(y_true = y_test, y_pred = pred_xtest)

# Tuning the Random Forest's hyperparameters using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
#Exhaustive grid search over random-forest hyperparameters with 5-fold cross-validation.
#NOTE: despite its name, `dtcwithgridsearchcv` is a random forest, not a decision tree.
#random_state is fixed so that differences between grid points come from the
#hyperparameters, not from the forest's own randomness, and the search is reproducible.
dtcwithgridsearchcv = RandomForestClassifier(random_state = 0)
#Two redundant axes of the original grid are collapsed; the set of distinct models searched
#is unchanged but the number of fits drops by a factor of 6:
# - 'log_loss' and 'entropy' are the same criterion (Shannon information gain) in scikit-learn.
# - a node is split only if it has at least 2 * min_samples_leaf samples; with
#   min_samples_leaf >= 3 that is >= 6, so min_samples_split values 2..5 never take effect.
#This is still 10,920 combinations x 5 folds, so expect a long run even with n_jobs = -1.
params = {'n_estimators' : list(range(20,50)),
          'criterion' : ['gini', 'entropy'],
          'max_features' : ['sqrt', 'log2'],
          'max_depth' : list(range(2,15)),
          'min_samples_leaf' : list(range(3,10)),
          'min_samples_split' : [2]
         }
gridsearchcvobject = GridSearchCV(dtcwithgridsearchcv, params, cv = 5, scoring = 'accuracy', n_jobs = -1)
gridsearchcvobject.fit(x_train, y_train)
#predict() uses the best estimator, refitted on the whole training split
pred_xtest1 = gridsearchcvobject.predict(x_test)
pred_xtrain1 = gridsearchcvobject.predict(x_train)



In [None]:
#Classification report of the tuned random forest on the test data
print(classification_report(y_true = y_test, y_pred = pred_xtest1))

In [None]:
#Classification report of the tuned random forest on the train data
print(classification_report(y_true = y_train, y_pred = pred_xtrain1))

In [None]:
#Tuned random forest accuracy on the test data
accuracy_score(y_true = y_test, y_pred = pred_xtest1)

In [None]:
#Tuned random forest accuracy on the train data
accuracy_score(y_true = y_train, y_pred = pred_xtrain1)

In [None]:
gridsearchcvobject.best_params_ #Hyperparameter combination with the best mean cross-validated score

In [None]:
gridsearchcvobject.best_score_ #Mean cross-validated accuracy of the best combination

In [None]:
#Crosstab for the test data: rows are the predicted labels, columns the actual labels
pd.crosstab(index = pred_xtest1, columns = y_test)

In [None]:
#Crosstab for the train data: rows are the predicted labels, columns the actual labels
pd.crosstab(index = pred_xtrain1, columns = y_train)

In [None]:
gridsearchcvobject.best_estimator_ #The estimator refitted with the best hyperparameters

In [None]:
gridsearchcvobject.param_grid #The hyperparameter grid that was searched