In [1]:
import pandas as pd

# Load the training CSV into a DataFrame; the relative path is resolved against
# the notebook's current working directory.
titanic_data = pd.read_csv("datasets/train.csv")

In [2]:
# Preview the first rows to see which columns are available.
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Columns removed before modelling; none of them is used as a feature below.
drop_columns = ["PassengerId", "Name", "Ticket"]

# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not raise a KeyError.
titanic_data = titanic_data.drop(columns=drop_columns, errors="ignore")

titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [4]:
# Inspect dtypes and non-null counts to spot columns with missing values.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [5]:
# Cabin is populated for only 204 of the 891 rows, so drop the column instead
# of trying to impute it.
# Reassignment plus errors="ignore" keeps the cell safe to re-run (no KeyError
# once Cabin is already gone).
titanic_data = titanic_data.drop(columns=["Cabin"], errors="ignore")
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [6]:
# Age has quite a few null values (177 of 891); fill them with the average age.
# NOTE(review): the mean is computed over every row, before the train/test
# split performed later in this notebook, so test rows influence the fill value.
titanic_data = titanic_data.fillna({"Age": titanic_data["Age"].mean()})
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [7]:
# The Embarked column has two null values; drop just those rows.
# Restricting dropna to Embarked keeps the cell from silently discarding rows
# because of nulls in any other column.
titanic_data = titanic_data.dropna(subset=["Embarked"])
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


In [8]:
# Two object-dtype columns remain (Sex and Embarked); look at Embarked's
# distinct values first so they can be encoded as numbers.
titanic_data["Embarked"].unique()

array(['S', 'C', 'Q'], dtype=object)

In [9]:
# How often each Embarked value occurs.
titanic_data["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [10]:
# Embarked only takes three values, so encode each one as an integer label.
# Caution: running this cell a second time turns the column into NaN, because
# Series.map yields NaN for values that are not keys of the mapping.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
titanic_data["Embarked"] = titanic_data["Embarked"].map(embarked_codes)

titanic_data["Embarked"].value_counts()

Embarked
0    644
1    168
2     77
Name: count, dtype: int64

In [11]:
# Distinct values of the remaining object column before encoding it.
titanic_data["Sex"].unique()

array(['male', 'female'], dtype=object)

In [12]:
# How often each Sex value occurs.
titanic_data["Sex"].value_counts()

Sex
male      577
female    312
Name: count, dtype: int64

In [13]:
# Sex only takes two values, so encode them as 0/1 integer labels.
# Caution: running this cell a second time turns the column into NaN, because
# Series.map yields NaN for values that are not keys of the mapping.
sex_codes = {'male': 0, 'female': 1}
titanic_data["Sex"] = titanic_data["Sex"].map(sex_codes)

titanic_data["Sex"].value_counts()

Sex
0    577
1    312
Name: count, dtype: int64

In [14]:
# Confirm that every column is now numeric and has no missing values.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    int64  
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 62.5 KB


In [15]:
# All columns are numeric now, so describe() reports summary statistics for
# each of them.
titanic_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,2.311586,0.350956,29.653446,0.524184,0.382452,32.096681,0.362205
std,0.48626,0.8347,0.477538,12.968366,1.103705,0.806761,49.697504,0.636157
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.8958,0.0
50%,0.0,3.0,0.0,29.699118,0.0,0.0,14.4542,0.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [16]:
# Pairwise correlation between all columns, including the Survived target.
titanic_data.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.335549,0.541585,-0.074673,-0.03404,0.083151,0.25529,0.108669
Pclass,-0.335549,1.0,-0.127741,-0.327954,0.081656,0.016824,-0.548193,0.043835
Sex,0.541585,-0.127741,1.0,-0.089434,0.116348,0.247508,0.179958,0.118593
Age,-0.074673,-0.327954,-0.089434,1.0,-0.231875,-0.178232,0.088604,0.009499
SibSp,-0.03404,0.081656,0.116348,-0.231875,1.0,0.414542,0.160887,-0.060606
Parch,0.083151,0.016824,0.247508,-0.178232,0.414542,1.0,0.217532,-0.07932
Fare,0.25529,-0.548193,0.179958,0.088604,0.160887,0.217532,1.0,0.063462
Embarked,0.108669,0.043835,0.118593,0.009499,-0.060606,-0.07932,0.063462,1.0


In [17]:
# Separate the target column (Survived) from the feature columns.
Y = titanic_data["Survived"]
X = titanic_data.drop(columns=["Survived"])

# Plain-text preview of both pieces.
print(X.head())
print(Y.head())

   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0       3    0  22.0      1      0   7.2500         0
1       1    1  38.0      1      0  71.2833         1
2       3    1  26.0      0      0   7.9250         0
3       1    1  35.0      1      0  53.1000         0
4       3    0  35.0      0      0   8.0500         0
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Report the number of rows in each partition, one per line.
print(
    f"X_train size: {len(X_train)}",
    f"X_test size: {len(X_test)}",
    f"Y_train size: {len(Y_train)}",
    f"Y_test size: {len(Y_test)}",
    sep="\n",
)

X_train size: 711
X_test size: 178
Y_train size: 711
Y_test size: 178


In [19]:
from sklearn.metrics import accuracy_score

In [21]:
from sklearn.linear_model import LogisticRegression

def logistic_model(X_train, Y_train, X_test, Y_test, max_iter=100):
    """Fit a logistic regression on the training split and score it on the test split.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels passed to ``LogisticRegression.fit``.
    X_test, Y_test
        Held-out features and labels used to compute the score.
    max_iter : int, default 100
        Iteration cap forwarded to ``LogisticRegression``. The default equals
        scikit-learn's own default, so existing four-argument calls behave
        exactly as before; raise it if the solver reports non-convergence.

    Returns
    -------
    The value returned by ``accuracy_score`` for the predictions on ``X_test``.
    """
    logreg = LogisticRegression(max_iter=max_iter)
    logreg.fit(X_train, Y_train)
    Y_pred_logreg = logreg.predict(X_test)

    return accuracy_score(Y_test, Y_pred_logreg)

In [22]:
from sklearn.ensemble import RandomForestClassifier
def random_forest_model(X_train, Y_train, X_test, Y_test, n_estimators, max_depth):
    """Fit a random forest on the training split and return its test accuracy.

    ``n_estimators`` and ``max_depth`` are forwarded to
    ``RandomForestClassifier``; the seed is fixed at 42 so repeated calls with
    the same arguments build the same forest.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
    ).fit(X_train, Y_train)

    return accuracy_score(Y_test, forest.predict(X_test))

In [30]:
from sklearn.neighbors import KNeighborsClassifier
def knn_model(X_train, Y_train, X_test, Y_test, n_neighbors=5):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels passed to ``KNeighborsClassifier.fit``.
    X_test, Y_test
        Held-out features and labels used to compute the score.
    n_neighbors : int, default 5
        Number of neighbours forwarded to ``KNeighborsClassifier``. The default
        equals scikit-learn's own default, so existing four-argument calls
        behave exactly as before.

    Returns
    -------
    The value returned by ``accuracy_score`` for the predictions on ``X_test``.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, Y_train)
    Y_pred_knn = knn.predict(X_test)

    return accuracy_score(Y_test, Y_pred_knn)

In [32]:
# Baseline: logistic regression on the features as they come out of the split
# (no scaling applied yet).
logreg_accuracy = logistic_model(X_train, Y_train, X_test, Y_test)
print(f"Logistic Regression Accuracy: {logreg_accuracy:.4f}")

Logistic Rregression Accuracy: 0.7865


In [40]:
# Test different hyperparameters in Random Forest: (n_estimators, max_depth).
# Looping over the pairs keeps each printed label tied to the arguments that
# were actually used; rf_accuracy ends up holding the score of the last pair.
# NOTE(review): every configuration is scored on the test split, so choosing
# the best one from these numbers tunes the model on test data.
for n_estimators, max_depth in [(200, 10), (300, 10), (200, 5), (300, 5)]:
    rf_accuracy = random_forest_model(X_train, Y_train, X_test, Y_test, n_estimators, max_depth)
    print(f'Random Forest Accuracy ({n_estimators}, {max_depth}): {rf_accuracy:.4f}')

Random Forest Accuracy (200, 10): 0.7978
Random Forest Accuracy (300, 10): 0.7921
Random Forest Accuracy (200, 5): 0.8146
Random Forest Accuracy (300, 5): 0.8034


In [41]:
# Refit the configuration that scored highest in the comparison (200 trees,
# depth 5) so that rf_accuracy holds its score for the summary printed later.
rf_accuracy = random_forest_model(
    X_train, Y_train, X_test, Y_test, n_estimators=200, max_depth=5
)
print(f'Random Forest Accuracy (200, 5): {rf_accuracy:.4f}')

Random Forest Accuracy (200, 5): 0.8146


In [34]:
# KNN baseline on the unscaled features.
knn_accuracy = knn_model(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
    Y_test=Y_test,
)
print(f'KNN Accuracy: {knn_accuracy:.4f}')

KNN Accuracy: 0.7022


In [36]:
# Side-by-side test accuracy of the three models before normalization.

print(
    f'Logistic Regression Accuracy: {logreg_accuracy:.4f}',
    f'Random Forest Accuracy: {rf_accuracy:.4f}',
    f'KNN Accuracy: {knn_accuracy:.4f}',
    sep="\n",
)

Logistic Regression Accuracy: 0.7865
Random Forest Accuracy: 0.8146
KNN Accuracy: 0.7022


In [37]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so the test rows do not influence it.
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [48]:
# Logistic regression again, this time on the scaled features.
logreg_accuracy_norm = logistic_model(X_train_norm, Y_train, X_test_norm, Y_test)
print(f"Logistic Regression Normalized Accuracy: {logreg_accuracy_norm:.4f}")

Logistic Rregression Normalized Accuracy: 0.7753


In [49]:
# Test different hyperparameters in Random Forest on the scaled features:
# (n_estimators, max_depth).
# Looping over the pairs keeps each printed label tied to the arguments that
# were actually used; rf_accuracy_norm ends up holding the score of the last pair.
# NOTE(review): every configuration is scored on the test split, so choosing
# the best one from these numbers tunes the model on test data.
for n_estimators, max_depth in [(200, 10), (300, 10), (200, 5), (300, 5)]:
    rf_accuracy_norm = random_forest_model(X_train_norm, Y_train, X_test_norm, Y_test, n_estimators, max_depth)
    print(f'Random Forest Normalized Accuracy ({n_estimators}, {max_depth}): {rf_accuracy_norm:.4f}')

Random Forest Normalized Accuracy (200, 10): 0.7978
Random Forest Normalized Accuracy (300, 10): 0.7921
Random Forest Normalized Accuracy (200, 5): 0.8146
Random Forest Normalized Accuracy (300, 5): 0.8034


In [50]:
# Refit the 200-tree, depth-5 configuration on the scaled features so that
# rf_accuracy_norm holds its score for the summary printed later.
rf_accuracy_norm = random_forest_model(
    X_train_norm, Y_train, X_test_norm, Y_test, n_estimators=200, max_depth=5
)
print(f'Random Forest Normalized Accuracy (200, 5): {rf_accuracy_norm:.4f}')

Random Forest Normalized Accuracy (200, 5): 0.8146


In [51]:
# KNN again, this time on the scaled features.
knn_accuracy_norm = knn_model(
    X_train=X_train_norm,
    Y_train=Y_train,
    X_test=X_test_norm,
    Y_test=Y_test,
)
print(f'KNN Normalized Accuracy: {knn_accuracy_norm:.4f}')

KNN Normalized Accuracy: 0.7865


In [52]:
# Side-by-side test accuracy of the three models after normalization.

print(
    f'Logistic Regression Normalized Accuracy: {logreg_accuracy_norm:.4f}',
    f'Random Forest Normalized Accuracy: {rf_accuracy_norm:.4f}',
    f'KNN Accuracy Normalized: {knn_accuracy_norm:.4f}',
    sep="\n",
)

Logistic Regression Normalized Accuracy: 0.7753
Random Forest Normalized Accuracy: 0.8146
KNN Accuracy Normalized: 0.7865
