* This file is finding the best classification model for the titanic dataset
* imported dataset from seaborn
* did some data preprocessing
* trained listed models which were evaluated on accuracy score:
    * Logistic Regression: 0.80
    * Random Forest: 0.78
    * Gradient Booster: 0.82
    * SVM: 0.65
    * KNN: 0.69
    * Decision Tree: 0.75
    * Naive Bayes: 0.71

* thus best model is Gradient Booster


In [1]:
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic', index_col=False)

In [2]:
titanic.isna().sum() # age=177, embarked=2, embark_town=2, deck=688
titanic = titanic.drop(['class','deck','embark_town','adult_male','who','alive','alone'], axis=1)

In [3]:
# sex, pclass, embarked is categorical 
titanic['sex'] = titanic['sex'].map({'male':0,'female':1})
titanic = pd.get_dummies(titanic, columns=['embarked'], drop_first=True, dtype=int)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S
0,0,3,0,22.0,1,0,7.25,0,1
1,1,1,1,38.0,1,0,71.2833,0,0
2,1,3,1,26.0,0,0,7.925,0,1
3,1,1,1,35.0,1,0,53.1,0,1
4,0,3,0,35.0,0,0,8.05,0,1


In [23]:
print(titanic.describe())
print(titanic.fare.min())

         survived      pclass         sex  ...        fare  embarked_Q  embarked_S
count  891.000000  891.000000  891.000000  ...  891.000000  891.000000  891.000000
mean     0.383838    2.308642    0.352413  ...   32.204208    0.086420    0.722783
std      0.486592    0.836071    0.477990  ...   49.693429    0.281141    0.447876
min      0.000000    1.000000    0.000000  ...    0.000000    0.000000    0.000000
25%      0.000000    2.000000    0.000000  ...    7.910400    0.000000    0.000000
50%      0.000000    3.000000    0.000000  ...   14.454200    0.000000    1.000000
75%      1.000000    3.000000    1.000000  ...   31.000000    0.000000    1.000000
max      1.000000    3.000000    1.000000  ...  512.329200    1.000000    1.000000

[8 rows x 9 columns]
0.0


In [4]:
median_age = titanic['age'].median()
titanic['age'].fillna(median_age, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['age'].fillna(median_age, inplace=True)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
y=titanic['survived']
X=titanic.drop('survived',axis=1)

train_X, test_X, train_y, test_y = train_test_split(X,y,random_state=42)

In [25]:
trial = [1,0,55,2,6,56,0,1]
train_X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S
298,1,0,28.0,0,0,30.5,0,1
884,3,0,25.0,0,0,7.05,0,1
247,2,1,24.0,0,2,14.5,0,1
478,3,0,22.0,0,0,7.5208,0,1
305,1,0,0.92,1,2,151.55,0,1


In [6]:
from sklearn.linear_model import LogisticRegression

logi_model=LogisticRegression()
logi_model.fit(train_X, train_y)
logi_pred = logi_model.predict(test_X)
print(accuracy_score(logi_pred, test_y))

0.8071748878923767


In [30]:
import numpy as np

trial_np = np.array([trial])
logi_model.predict(trial_np)



array([0])

In [7]:
from sklearn.ensemble import RandomForestClassifier

forest_model = RandomForestClassifier()
forest_model.fit(train_X, train_y)
forest_pred = forest_model.predict(test_X)
print(accuracy_score(forest_pred, test_y))

0.7802690582959642


In [8]:
from sklearn.ensemble import GradientBoostingClassifier

GradBoost_model = GradientBoostingClassifier()
GradBoost_model.fit(train_X, train_y)
GradBoost_pred = GradBoost_model.predict(test_X)
print(accuracy_score(GradBoost_pred, test_y))

0.820627802690583


In [9]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(train_X,train_y)
svm_pred = svm_model.predict(test_X)
print(accuracy_score(svm_pred, test_y))

0.6591928251121076


In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(train_X, train_y)
knn_pred = knn_model.predict(test_X)
print(accuracy_score(knn_pred, test_y))

0.695067264573991


In [11]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(train_X, train_y)
tree_pred = tree_model.predict(test_X)
print(accuracy_score(tree_pred, test_y))

0.757847533632287


In [12]:
from sklearn.naive_bayes import MultinomialNB

NB_model = MultinomialNB()
NB_model.fit(train_X, train_y)
NB_pred = NB_model.predict(test_X)
print(accuracy_score(NB_pred, test_y))

0.7174887892376681


In [19]:
import pickle

with open('best_model.pkl', 'wb') as file:
    pickle.dump(GradBoost_model, file)