<h2>Titanic - Machine Learning from Disaster</h2>

<h3>Importing libraries</h3>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

<h3>Loading dataset</h3>

In [2]:
titanic_df = pd.read_csv("./data/train.csv")
print("Number of rows:", len(titanic_df))

Number of rows: 891


<h3>Dropping columns</h3>

In [3]:
titanic_df = titanic_df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
print(titanic_df)

     Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0           0       3    male  22.0      1      0   7.2500        S
1           1       1  female  38.0      1      0  71.2833        C
2           1       3  female  26.0      0      0   7.9250        S
3           1       1  female  35.0      1      0  53.1000        S
4           0       3    male  35.0      0      0   8.0500        S
..        ...     ...     ...   ...    ...    ...      ...      ...
886         0       2    male  27.0      0      0  13.0000        S
887         1       1  female  19.0      0      0  30.0000        S
888         0       3  female   NaN      1      2  23.4500        S
889         1       1    male  26.0      0      0  30.0000        C
890         0       3    male  32.0      0      0   7.7500        Q

[891 rows x 8 columns]


In [4]:
titanic_df = pd.get_dummies(titanic_df, columns=["Sex", "Embarked"], dtype=np.int_)

# Avoid dummy trap
titanic_df = titanic_df.drop(columns=["Sex_male", "Embarked_S"]) 
print(titanic_df)

     Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Embarked_C  \
0           0       3  22.0      1      0   7.2500           0           0   
1           1       1  38.0      1      0  71.2833           1           1   
2           1       3  26.0      0      0   7.9250           1           0   
3           1       1  35.0      1      0  53.1000           1           0   
4           0       3  35.0      0      0   8.0500           0           0   
..        ...     ...   ...    ...    ...      ...         ...         ...   
886         0       2  27.0      0      0  13.0000           0           0   
887         1       1  19.0      0      0  30.0000           1           0   
888         0       3   NaN      1      2  23.4500           1           0   
889         1       1  26.0      0      0  30.0000           0           1   
890         0       3  32.0      0      0   7.7500           0           0   

     Embarked_Q  
0             0  
1             0  
2        

<h3>Handling the NaNs</h3>

In [5]:
print("NaN Count:")
print(titanic_df.isna().sum())

NaN Count:
Survived        0
Pclass          0
Age           177
SibSp           0
Parch           0
Fare            0
Sex_female      0
Embarked_C      0
Embarked_Q      0
dtype: int64


<h4>Taking care of the Age Column</h4>

In [6]:
from sklearn.impute import SimpleImputer

age_imputer = SimpleImputer()
titanic_df["Age"] = age_imputer.fit_transform(titanic_df["Age"].values.reshape(-1, 1))
print(titanic_df.isna().sum())

Survived      0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_female    0
Embarked_C    0
Embarked_Q    0
dtype: int64


<h3>Splitting data into training & testing datasets</h3>

In [7]:
X = np.array(titanic_df.iloc[:, 1:].values)
y = np.array(titanic_df.iloc[:, 0].values)

In [8]:
print(X[0])

[ 3.   22.    1.    0.    7.25  0.    0.    0.  ]


In [9]:
print(y[0])

0


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [11]:
print(X_train[0])

[ 1.  22.   1.   0.  66.6  1.   0.   0. ]


In [12]:
print(X_test[0])


[ 3.         29.69911765  0.          0.          7.7375      1.
  0.          1.        ]


In [13]:
print(y_train[0])


1


In [14]:
print(y_test[0])


1


<h3>Training the models</h3>

In [15]:
models = []

<h4>Logistic Regression</h4>

In [16]:
from sklearn.linear_model import LogisticRegression

models.append(LogisticRegression(max_iter=1000))

<h4>K-Nearest Neighbors</h4>

In [17]:
from sklearn.neighbors import KNeighborsClassifier

models.append(KNeighborsClassifier())

<h4>Support Vector Classifier</h4>

In [18]:
from sklearn.svm import SVC

models.append(SVC())

<h4>Naive Bayes</h4>

In [19]:
from sklearn.naive_bayes import GaussianNB

models.append(GaussianNB())

<h4>Decision Tree Classifier</h4>

In [20]:
from sklearn.tree import DecisionTreeClassifier

models.append(DecisionTreeClassifier())

<h4>Random Forest Classifier</h4>

In [21]:
from sklearn.ensemble import RandomForestClassifier

models.append(RandomForestClassifier())

<h3>Evaluating the models<h3>

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(type(model))
    print("Accuracy score:", accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("\n")

<class 'sklearn.linear_model._logistic.LogisticRegression'>
Accuracy score: 0.7486033519553073
[[93 22]
 [23 41]]


<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Accuracy score: 0.7262569832402235
[[93 22]
 [27 37]]


<class 'sklearn.svm._classes.SVC'>
Accuracy score: 0.7318435754189944
[[106   9]
 [ 39  25]]


<class 'sklearn.naive_bayes.GaussianNB'>
Accuracy score: 0.7988826815642458
[[98 17]
 [19 45]]


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Accuracy score: 0.7877094972067039
[[96 19]
 [19 45]]


<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Accuracy score: 0.7821229050279329
[[93 22]
 [17 47]]


