In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
models={
    'LR':LogisticRegression(),
    'KNN':KNeighborsClassifier(),
    'DT':DecisionTreeClassifier(),
    'SVC':SVC(),
    'NB':GaussianNB(),
    'XGC':XGBClassifier(),
    'RF':RandomForestClassifier() 
}

In [19]:
df = pd.read_csv("Assignment Datasets/Titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
df.shape

(891, 12)

In [21]:
df.drop(["PassengerId","Name","Ticket","Cabin"], axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [23]:
df = pd.get_dummies(data=df, columns=["Sex","Embarked"], drop_first=True)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         714 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   Sex_male    891 non-null    uint8  
 7   Embarked_Q  891 non-null    uint8  
 8   Embarked_S  891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 44.5 KB


In [25]:
df.isnull().sum()

Survived        0
Pclass          0
Age           177
SibSp           0
Parch           0
Fare            0
Sex_male        0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [28]:
imputer = KNNImputer()
df["Age"] = imputer.fit_transform(df[["Age"]])
df.isnull().sum()

Survived      0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_male      0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [30]:
df["Age"] = df["Age"].apply(lambda x : int(x))
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22,1,0,7.25,1,0,1
1,1,1,38,1,0,71.2833,0,0,0
2,1,3,26,0,0,7.925,0,0,1
3,1,1,35,1,0,53.1,0,0,1
4,0,3,35,0,0,8.05,1,0,1


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    int64  
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   Sex_male    891 non-null    uint8  
 7   Embarked_Q  891 non-null    uint8  
 8   Embarked_S  891 non-null    uint8  
dtypes: float64(1), int64(5), uint8(3)
memory usage: 44.5 KB


In [35]:
x = df.drop(["Survived"], axis=1)
y = df["Survived"]

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [37]:
y_train.value_counts()

0    436
1    276
Name: Survived, dtype: int64

In [38]:
smote = SMOTE()
x_train,y_train = smote.fit_sample(x_train, y_train)
y_train.value_counts()

1    436
0    436
Name: Survived, dtype: int64

In [39]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train

array([[-1.44617884,  2.01710686, -0.45835945, ...,  0.88913127,
        -0.30899415, -1.48522131],
       [-1.44617884,  1.48987608, -0.45835945, ..., -1.12469332,
        -0.30899415,  0.67330033],
       [-1.44617884,  0.13413981,  0.46899424, ..., -1.12469332,
        -0.30899415, -1.48522131],
       ...,
       [-1.44617884,  0.43541454, -0.45835945, ...,  0.88913127,
        -0.30899415,  0.67330033],
       [-0.26980948, -0.09181624, -0.45835945, ..., -1.12469332,
        -0.30899415,  0.67330033],
       [ 0.90655987, -0.39309097, -0.45835945, ...,  0.88913127,
        -0.30899415,  0.67330033]])

In [40]:
for name,model in  models.items():
    print(f'using {name}: ')
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(f'Training Accuracy :{accuracy_score(y_train,model.predict(x_train))}')
    print(f'Testing Accuracy :{accuracy_score(y_test,y_pred)}')
    print(f'Recall: {recall_score(y_test,y_pred)}')
    print(f'precision: {precision_score(y_test,y_pred)}')
    print(f'F1-score: {f1_score(y_test,y_pred)}')
    print('-'*60)

using LR: 
Training Accuracy :0.7958715596330275
Testing Accuracy :0.8379888268156425
Recall: 0.8636363636363636
precision: 0.7402597402597403
F1-score: 0.7972027972027972
------------------------------------------------------------
using KNN: 
Training Accuracy :0.8589449541284404
Testing Accuracy :0.8379888268156425
Recall: 0.8787878787878788
precision: 0.7341772151898734
F1-score: 0.8
------------------------------------------------------------
using DT: 
Training Accuracy :0.9805045871559633
Testing Accuracy :0.8659217877094972
Recall: 0.8181818181818182
precision: 0.8181818181818182
F1-score: 0.8181818181818182
------------------------------------------------------------
using SVC: 
Training Accuracy :0.8394495412844036
Testing Accuracy :0.8547486033519553
Recall: 0.7727272727272727
precision: 0.8225806451612904
F1-score: 0.796875
------------------------------------------------------------
using NB: 
Training Accuracy :0.7958715596330275
Testing Accuracy :0.8156424581005587
Recal



Training Accuracy :0.9655963302752294
Testing Accuracy :0.88268156424581
Recall: 0.8181818181818182
precision: 0.8571428571428571
F1-score: 0.8372093023255814
------------------------------------------------------------
using RF: 
Training Accuracy :0.9805045871559633
Testing Accuracy :0.8770949720670391
Recall: 0.8484848484848485
precision: 0.8235294117647058
F1-score: 0.8358208955223881
------------------------------------------------------------
