In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Titanic training data from train.csv into a DataFrame.
titanic_dt = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first five rows of the freshly loaded data.
titanic_dt.head(n=5)

In [None]:
# Count the missing values in every column.
titanic_dt.isnull().sum(axis=0)

In [None]:
# Heatmap of the boolean null mask, to see which columns contain missing entries.
sns.heatmap(data=titanic_dt.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Class balance of the target: number of rows for each Survived value.
sns.set_style(style='whitegrid')
sns.countplot(data=titanic_dt, x='Survived')

In [None]:
# Survival counts split by sex (bar heights are counts, not rates).
sns.set_style(style='whitegrid')
sns.countplot(data=titanic_dt, x='Survived', hue='Sex', palette='RdBu_r')

In [None]:
# Survival counts split by passenger class (bar heights are counts, not rates).
sns.set_style(style='whitegrid')
sns.countplot(data=titanic_dt, x='Survived', hue='Pclass', palette='rainbow')

In [None]:
# Histogram of passenger ages using 30 bins.
titanic_dt.loc[:, 'Age'].hist(alpha=0.3, color='darkred', bins=30)

In [None]:
# Number of passengers for each SibSp (siblings and spouses) value.
sns.countplot(data=titanic_dt, x='SibSp')

In [None]:
# Histogram of ticket fares using 40 bins.
titanic_dt.loc[:, 'Fare'].hist(figsize=(8, 4), bins=40, color='green')

In [None]:
# Handling null values in the dataset
# Box plot of passenger age per class; the line inside each box is the median age.
plt.figure(figsize=(12, 7))
sns.boxplot(data=titanic_dt, x='Pclass', y='Age', palette='winter')

In [None]:
# defining age according to passenger class
def impute_age(cols):
    """Return a passenger's age, filling a missing value from the passenger class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``titanic_dt[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise a fixed fill value of
    35 for class 1, 28 for class 2 and 23 for any other class.
    """
    # Read the two values by position explicitly. A row handed over by
    # DataFrame.apply is a Series labelled 'Age'/'Pclass', and plain cols[0]
    # on such a Series is deprecated positional access in recent pandas.
    values = cols.iloc if hasattr(cols, 'iloc') else cols
    Age = values[0]
    Pclass = values[1]

    if not pd.isnull(Age):
        return Age  # a known age is kept unchanged

    # Fixed per-class fill values; the notebook's notes describe them as
    # sitting just under class averages of roughly 37, 29 and 24.
    if Pclass == 1:
        return 35
    elif Pclass == 2:
        return 28
    else:
        return 23

In [None]:
# Replace the null ages by applying impute_age to every (Age, Pclass) row.
titanic_dt['Age'] = titanic_dt.loc[:, ['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
# Redraw the null-mask heatmap to compare against the one drawn before imputing Age.
sns.heatmap(data=titanic_dt.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Cabin has too many missing entries to be useful, so remove the column.
titanic_dt.drop(columns=['Cabin'], inplace=True)

In [None]:
# Confirm that the Cabin column is gone.
titanic_dt.head(n=5)

In [None]:
# One-hot encoding preview: dummy columns for Embarked with the first level dropped.
titanic_dt['Embarked'].pipe(pd.get_dummies, drop_first=True).head()

In [None]:
# One-hot encode Sex and Embarked (first level dropped) into separate frames.
sex, embark = (
    pd.get_dummies(titanic_dt[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

In [None]:
# Remove the raw text columns; Sex and Embarked are already held as dummy frames.
titanic_dt.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'], inplace=True)

In [None]:
# Confirm that the dropped features are no longer present.
titanic_dt.head(n=5)

In [None]:
# Append the Sex and Embarked dummy columns to the remaining features.
titanic_dt = pd.concat(objs=[titanic_dt, sex, embark], axis='columns')
titanic_dt.head(n=5)

In [None]:
# Preview the independent features: everything except the dependent Survived column.
titanic_dt.drop(columns='Survived').head()

In [None]:
# Preview the target column.
titanic_dt.loc[:, 'Survived'].head()

In [None]:
from sklearn.model_selection import train_test_split

#model validation
from sklearn.metrics import log_loss,roc_auc_score,precision_score,f1_score,recall_score,roc_curve,auc
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,fbeta_score,matthews_corrcoef
from sklearn import metrics

# cross validation
from sklearn.model_selection import StratifiedKFold

# machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier,ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC 

In [None]:
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    titanic_dt.drop(columns='Survived'),
    titanic_dt['Survived'],
    random_state=5,
    test_size=0.30,
)


In [None]:
#Decision Tree Model

In [None]:
# Fit a decision tree on the training split and predict the test split.
# random_state is fixed (same seed as the train/test split) so the tree,
# and every metric derived from it, is identical on each Run All.
decc = DecisionTreeClassifier(random_state=5)
decc.fit(X_train,y_train)
y_pred_decc = decc.predict(X_test)

In [None]:
# Evaluate the decision tree on the held-out test split.
CM=confusion_matrix(y_test,y_pred_decc)
sns.heatmap(CM, annot=True)

# confusion_matrix rows are true labels and columns are predicted labels.
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
specificity = TN/(TN+FP)

# Log loss and ROC AUC are defined on scores, so they get the predicted
# probability of class 1 rather than the thresholded 0/1 predictions.
y_proba_decc = decc.predict_proba(X_test)[:, 1]
loss_log = log_loss(y_test, y_proba_decc)
roc=roc_auc_score(y_test, y_proba_decc)

# The remaining metrics are defined on the hard class predictions.
acc= accuracy_score(y_test, y_pred_decc)
prec = precision_score(y_test, y_pred_decc)
rec = recall_score(y_test, y_pred_decc)
f1 = f1_score(y_test, y_pred_decc)

mathew = matthews_corrcoef(y_test, y_pred_decc)
model_results =pd.DataFrame([['Decision Tree',acc, prec,rec,specificity, f1,roc, loss_log,mathew]],
               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score','ROC','Log_Loss','mathew_corrcoef'])

model_results

In [None]:
#Naive Bayes Model

In [None]:
# Gaussian Naive Bayes: fit on the training split, then predict the test split.
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB().fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

In [None]:
# Evaluate Gaussian Naive Bayes on the held-out test split.
CM=confusion_matrix(y_test,y_pred_gb)
sns.heatmap(CM, annot=True)

# confusion_matrix rows are true labels and columns are predicted labels.
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
specificity = TN/(TN+FP)

# Log loss and ROC AUC are defined on scores, so they get the predicted
# probability of class 1 rather than the thresholded 0/1 predictions.
y_proba_gb = gb.predict_proba(X_test)[:, 1]
loss_log = log_loss(y_test, y_proba_gb)
roc=roc_auc_score(y_test, y_proba_gb)

# The remaining metrics are defined on the hard class predictions.
acc= accuracy_score(y_test, y_pred_gb)
prec = precision_score(y_test, y_pred_gb)
rec = recall_score(y_test, y_pred_gb)
f1 = f1_score(y_test, y_pred_gb)

mathew = matthews_corrcoef(y_test, y_pred_gb)
model_results =pd.DataFrame([['Naive Bayes',acc, prec,rec,specificity, f1,roc, loss_log,mathew]],
               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score','ROC','Log_Loss','mathew_corrcoef'])

model_results

In [None]:
#SVM Model

In [None]:
# Linear support vector classifier with probability estimates enabled.
# probability=True fits the probability calibration on internally shuffled
# folds, so random_state is fixed (same seed as the train/test split) to
# keep predict_proba repeatable between runs.
svc = SVC(kernel='linear',gamma='auto',probability=True,random_state=5)
svc.fit(X_train,y_train)
y_pred_svc = svc.predict(X_test)

In [None]:
# Evaluate the linear SVC on the held-out test split.
CM=confusion_matrix(y_test,y_pred_svc)
sns.heatmap(CM, annot=True)

# confusion_matrix rows are true labels and columns are predicted labels.
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
specificity = TN/(TN+FP)

# Log loss and ROC AUC are defined on scores, so they get the predicted
# probability of class 1 rather than the thresholded 0/1 predictions.
y_proba_svc = svc.predict_proba(X_test)[:, 1]
loss_log = log_loss(y_test, y_proba_svc)
roc=roc_auc_score(y_test, y_proba_svc)

# The remaining metrics are defined on the hard class predictions.
acc= accuracy_score(y_test, y_pred_svc)
prec = precision_score(y_test, y_pred_svc)
rec = recall_score(y_test, y_pred_svc)
f1 = f1_score(y_test, y_pred_svc)

mathew = matthews_corrcoef(y_test, y_pred_svc)
model_results =pd.DataFrame([['Support Vector Classifier Linear',acc, prec,rec,specificity, f1,roc, loss_log,mathew]],
               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score','ROC','Log_Loss','mathew_corrcoef'])

model_results

In [None]:
#Random Forest Model

In [None]:
# Random forest of 100 entropy-criterion trees.
# random_state is fixed (same seed as the train/test split) so the forest,
# and every metric derived from it, is identical on each Run All.
rf_ent = RandomForestClassifier(criterion='entropy',n_estimators=100,random_state=5)
rf_ent.fit(X_train, y_train)
y_pred_rfe = rf_ent.predict(X_test)

In [None]:
# Evaluate the random forest on the held-out test split.
CM=confusion_matrix(y_test,y_pred_rfe)
sns.heatmap(CM, annot=True)

# confusion_matrix rows are true labels and columns are predicted labels.
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
specificity = TN/(TN+FP)

# Log loss and ROC AUC are defined on scores, so they get the predicted
# probability of class 1 rather than the thresholded 0/1 predictions.
y_proba_rfe = rf_ent.predict_proba(X_test)[:, 1]
loss_log = log_loss(y_test, y_proba_rfe)
roc=roc_auc_score(y_test, y_proba_rfe)

# The remaining metrics are defined on the hard class predictions.
acc= accuracy_score(y_test, y_pred_rfe)
prec = precision_score(y_test, y_pred_rfe)
rec = recall_score(y_test, y_pred_rfe)
f1 = f1_score(y_test, y_pred_rfe)

# Score the random forest's own predictions; the logistic-regression
# predictions are not defined until a later cell.
mathew = matthews_corrcoef(y_test, y_pred_rfe)
model_results =pd.DataFrame([['Random Forest',acc, prec,rec,specificity, f1,roc, loss_log,mathew]],
               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score','ROC','Log_Loss','mathew_corrcoef'])

model_results

In [None]:
#Logistic Regression Model

In [None]:
# L2-penalised logistic regression: fit on the training split, predict the test split.
lg_reg = LogisticRegression(penalty='l2').fit(X_train, y_train)
y_pred_lgreg = lg_reg.predict(X_test)

In [None]:
# Evaluate logistic regression on the held-out test split.
CM=confusion_matrix(y_test,y_pred_lgreg)
sns.heatmap(CM, annot=True)

# confusion_matrix rows are true labels and columns are predicted labels.
TN = CM[0][0]
FN = CM[1][0]
TP = CM[1][1]
FP = CM[0][1]
specificity = TN/(TN+FP)

# Log loss and ROC AUC are defined on scores, so they get the predicted
# probability of class 1 rather than the thresholded 0/1 predictions.
y_proba_lgreg = lg_reg.predict_proba(X_test)[:, 1]
loss_log = log_loss(y_test, y_proba_lgreg)
roc=roc_auc_score(y_test, y_proba_lgreg)

# The remaining metrics are defined on the hard class predictions.
acc= accuracy_score(y_test, y_pred_lgreg)
prec = precision_score(y_test, y_pred_lgreg)
rec = recall_score(y_test, y_pred_lgreg)
f1 = f1_score(y_test, y_pred_lgreg)

mathew = matthews_corrcoef(y_test, y_pred_lgreg)
model_results =pd.DataFrame([['Logistic Regression',acc, prec,rec,specificity, f1,roc, loss_log,mathew]],
               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score','ROC','Log_Loss','mathew_corrcoef'])

model_results

In [None]:
# ROC curves
def roc_auc_plot(y_true, y_proba, label=' ', l='-', lw=1.0):
    """Draw one ROC curve on the module-level ``ax`` and put its AUC in the legend label.

    y_true  -- true class labels
    y_proba -- two-dimensional array of scores; column 1 is used as the score
    label   -- legend text, suffixed with the area under the curve
    l, lw   -- line style and line width passed on to ``ax.plot``
    """
    from sklearn.metrics import roc_auc_score, roc_curve

    positive_scores = y_proba[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, positive_scores)
    area = roc_auc_score(y_true, positive_scores)
    # ``ax`` is looked up globally at call time, so it must exist before the first call.
    ax.plot(false_pos_rate, true_pos_rate, linestyle=l, linewidth=lw,
            label="%s (area=%.3f)" % (label, area))

# One figure holding the ROC curve of every fitted model.
f, ax = plt.subplots(figsize=(12,8))

# Curves are drawn in this order; each label is reused verbatim in the legend.
for fitted_model, curve_label in (
    (decc, 'Decision Tree '),
    (gb, 'Gaussian Naive Bayes '),
    (svc, 'Linear SVM'),
    (lg_reg, 'Logistic Regression'),
    (rf_ent, 'Random Forest'),
):
    roc_auc_plot(y_test, fitted_model.predict_proba(X_test), label=curve_label, l='-')

# Dashed diagonal as a reference line (unlabelled, so it stays out of the legend).
ax.plot([0,1], [0,1], color='k', linewidth=0.5, linestyle='--')
ax.legend(loc="lower right")
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    xlim=[0, 1],
    ylim=[0, 1],
    title='Receiver Operator Characteristic curves',
)
sns.despine()