 Titanic - Machine Learning

Aim is to find out What sort of people were more likely to survive the Titanic sinking?

In [129]:
! pip install kaggle



In [130]:
# creating a directory kaggle in ~ home directory
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [131]:
#kaggle.json → The Kaggle API key file that we downloaded from your Kaggle account.
#copy kaggle.json API key file to ~/.kaggle file
!cp kaggle.json ~/.kaggle/

In [132]:
! chmod 600 ~/.kaggle/kaggle.json

In [133]:
! kaggle competitions download -c titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [134]:
from zipfile import ZipFile
dataset="/content/titanic.zip"
with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print("files extracted")

files extracted


In [135]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Load dataset

In [136]:
dt=pd.read_csv("/content/train.csv")

Data preprocessing

In [137]:
dt.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [138]:
# total entries are 891 and missing values in cabin are 687 so we can drop that column
# d = d.drop("Cabin",axis=1,inplace=True)

In [139]:
#or we can set  threshold value to 50% and if null values exceed 50% then it will drop that column using dropna()
threshold = 0.5 * len(dt)
dt.dropna(thresh=threshold, axis = 1,inplace=True)

In [140]:
# Age has 177 missing values so we will replace those values with mean values
dt['Age'] = dt['Age'].fillna(dt['Age'].median())

In [141]:
#Embarked has categorical data so we can replace missing values with mode values
dt['Embarked'].fillna(dt['Embarked'].mode()[0],inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dt['Embarked'].fillna(dt['Embarked'].mode()[0],inplace = True)


In [142]:
dt.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


Handling duplicate values

In [143]:
# check if passenger id's repeated if repeated remove those rows
dt.duplicated().sum()

0

encoding

In [144]:
#Label Encoding is done here
#male = 1 and female = 0
from sklearn.preprocessing import LabelEncoder
ls=LabelEncoder()
dt['Sex'] = ls.fit_transform(dt['Sex'])

In [145]:
#one hot encoding is done here
dt = pd.get_dummies(dt,columns=['Embarked'])

In [146]:
#Feature engineering : here we create a new feature Family_mem combining SibSp and Parch
dt['Family_mem'] = dt['SibSp']+dt['Parch']

In [147]:
dt.drop(['SibSp','Parch'],axis=1,inplace=True)

In [148]:
dt = dt.drop(['Name','Ticket'],axis = 1)

In [149]:
from sklearn.model_selection import train_test_split
X =dt.drop(['Survived'],axis=1)
Y = dt['Survived']
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2, random_state=2,stratify=Y)

Model building

In [150]:
from sklearn.linear_model import LogisticRegression
ls = LogisticRegression()
ls.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [151]:
y_pred = ls.predict(X_test)

Evaluating model

In [152]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(Y_test,y_pred))

0.7932960893854749


In [153]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84       110
           1       0.79      0.64      0.70        69

    accuracy                           0.79       179
   macro avg       0.79      0.76      0.77       179
weighted avg       0.79      0.79      0.79       179



In [154]:
print(confusion_matrix(Y_test,y_pred))

[[98 12]
 [25 44]]


Hyperparameter Tuning(improve accuracy)

In [155]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [156]:
param_grid = {
    'n_estimators':  [50, 100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

# Print best parameters
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [157]:
y_pred = grid_search.predict(X_test)

In [158]:
X_test.shape

(179, 9)

In [159]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(Y_test,y_pred))

0.8324022346368715


Saving model

In [160]:
import pickle
with open('grid_search.pkl','wb') as file:
  pickle.dump(grid_search, file)

In [161]:
import pickle
with open('grid_search.pkl','rb') as file:
  grid_search = pickle.load(file)

Test data preprocessing

In [162]:
test_dt = pd.read_csv("/content/test.csv")

In [163]:
test_dt.shape

(418, 11)

In [164]:
test_dt.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


In [165]:
test_dt['Age'] = test_dt['Age'].fillna(test_dt['Age'].median())

In [166]:
test_dt['Fare'] = test_dt['Fare'].fillna(test_dt['Fare'].mean())

In [167]:
test_dt = test_dt.drop(['Name','Ticket','Cabin'],axis = 1)

In [168]:
test_dt['Family_mem'] = test_dt['SibSp']+test_dt['Parch']

In [169]:
test_dt.drop(['SibSp','Parch'],axis=1,inplace=True)

In [170]:
test_dt.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Sex,0
Age,0
Fare,0
Embarked,0
Family_mem,0


In [171]:
from sklearn.preprocessing import LabelEncoder
Le = LabelEncoder()
test_dt['Sex'] = Le.fit_transform(test_dt['Sex'])

In [172]:
test_dt = pd.get_dummies(test_dt,columns=['Embarked'])

In [173]:
test_dt.shape

(418, 9)

In [175]:
# Get the columns used during training
training_columns = X_train.columns

# Ensure test_dt has the same columns and in the same order
test_dt = test_dt.reindex(columns=training_columns, fill_value=0)

# Now you can predict using the modified test_dt
y_predict = grid_search.predict(test_dt)

creating results csv file

In [176]:
result_dt = pd.DataFrame({
    'ID' : test_dt['PassengerId'],
    'survived':y_predict
})

In [177]:
#save results to csv file
result_dt.to_csv('gender_submission2.csv',index=False)

In [178]:
z_test=pd.read_csv('/content/gender_submission.csv')

In [179]:
z_pred=pd.read_csv('/content/gender_submission2.csv')

In [183]:
accuracy = accuracy_score(z_test['Survived'],z_pred['survived'])

In [184]:
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 81.58%
