<h3> Import libraries </h3>

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

<h3> Data collection and processing </h3>

In [47]:
# Load the training data; 'train.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
trainDataset = pd.read_csv('train.csv')

In [48]:
# Preview the first rows to check that the columns were parsed as expected.
trainDataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [49]:
# (rows, columns) of the freshly loaded frame.
trainDataset.shape

(891, 12)

In [50]:
# Count the missing values in every column before any cleaning is applied.
trainDataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [51]:
# Number of rows for each distinct Pclass value.
trainDataset['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [52]:
# Class balance of the Survived column, which is used as the prediction
# target later in the notebook.
trainDataset['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [60]:
# Number of rows for each distinct Embarked value (value_counts skips NaN).
# NOTE(review): the saved output sums to all 891 rows although the raw data
# has 2 missing Embarked values, so it appears to have been produced after
# the imputation cell further down; a top-to-bottom run would show the
# pre-imputation counts here -- TODO confirm and move or re-run this cell.
trainDataset['Embarked'].value_counts()

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [53]:
# Number of rows for each distinct SibSp value.
trainDataset['SibSp'].value_counts()

SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64

In [54]:
# Remove the columns that are not used as model features. `columns=` already
# selects the column axis, so no separate `axis` argument is needed.
# Note: this rebinds trainDataset, so running the cell a second time raises a
# KeyError because the columns are already gone.
trainDataset = trainDataset.drop(
    columns=[
        'PassengerId',
        'Ticket',
        'Cabin',   # mostly missing in the null-count summary above
        'Name',
    ]
)

In [55]:
# Confirm that only the retained columns are left after the drop.
trainDataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [56]:
# Fill the missing ages with the column mean (NaN is ignored when averaging).
# NOTE(review): the mean is taken over every row, i.e. before the train/test
# split performed later, so held-out rows contribute to it -- confirm this is
# acceptable for the evaluation.
mean_age = trainDataset['Age'].mean()
trainDataset['Age'] = trainDataset['Age'].fillna(mean_age)

In [57]:
# Most frequent Embarked value; mode() returns a Series because several
# values can tie for first place.
trainDataset['Embarked'].mode()

0    S
Name: Embarked, dtype: object

In [58]:
# Fill the missing Embarked entries with the most frequent value
# (the first entry of the mode Series).
embarked_mode = trainDataset['Embarked'].mode()[0]
trainDataset['Embarked'] = trainDataset['Embarked'].fillna(embarked_mode)

In [59]:
# Re-check the per-column missing-value counts after imputation.
trainDataset.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [61]:
# Encode the two categorical features as integers using explicit per-column
# lookups. Series.map infers an integer dtype from the mapped values, so the
# result no longer depends on DataFrame.replace silently downcasting object
# columns (deprecated since pandas 2.2), and no inplace mutation of the whole
# frame is needed.
# Values that are absent from a lookup are passed through unchanged, which
# mirrors replace() and keeps the cell safe to re-run on encoded data.
sex_codes = {'male': 1, 'female': 0}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
trainDataset['Sex'] = trainDataset['Sex'].map(lambda value: sex_codes.get(value, value))
trainDataset['Embarked'] = trainDataset['Embarked'].map(lambda value: embarked_codes.get(value, value))

In [62]:
# Check that Sex and Embarked now hold the integer codes.
trainDataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,0
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.925,0
3,1,1,0,35.0,1,0,53.1,0
4,0,3,1,35.0,0,0,8.05,0


<h3> Splitting features and targets </h3>

In [64]:
# Target vector: the Survived column.
y = trainDataset['Survived']
# Feature matrix: every remaining column (`columns=` already implies the
# column axis).
X = trainDataset.drop(columns=['Survived'])
print(X, y, sep='\n')

     Pclass  Sex        Age  SibSp  Parch     Fare  Embarked
0         3    1  22.000000      1      0   7.2500         0
1         1    0  38.000000      1      0  71.2833         1
2         3    0  26.000000      0      0   7.9250         0
3         1    0  35.000000      1      0  53.1000         0
4         3    1  35.000000      0      0   8.0500         0
..      ...  ...        ...    ...    ...      ...       ...
886       2    1  27.000000      0      0  13.0000         0
887       1    0  19.000000      0      0  30.0000         0
888       3    0  29.699118      1      2  23.4500         0
889       1    1  26.000000      0      0  30.0000         1
890       3    1  32.000000      0      0   7.7500         2

[891 rows x 7 columns]
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


<h3> Standardisation </h3>

In [65]:
# Standardise every feature column. fit_transform learns the scaling
# statistics and applies them in a single call; X becomes a NumPy array.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so statistics from the held-out rows influence the training features
# -- consider fitting on the training split only.
scalar = StandardScaler()
X = scalar.fit_transform(X)
print(X)

[[ 0.82737724  0.73769513 -0.5924806  ... -0.47367361 -0.50244517
  -0.56883712]
 [-1.56610693 -1.35557354  0.63878901 ... -0.47367361  0.78684529
   1.00518113]
 [ 0.82737724 -1.35557354 -0.2846632  ... -0.47367361 -0.48885426
  -0.56883712]
 ...
 [ 0.82737724 -1.35557354  0.         ...  2.00893337 -0.17626324
  -0.56883712]
 [-1.56610693  0.73769513 -0.2846632  ... -0.47367361 -0.04438104
   1.00518113]
 [ 0.82737724  0.73769513  0.17706291 ... -0.47367361 -0.49237783
   2.57919938]]


<h3> Train Test Split </h3>

In [68]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio of the target the same in both splits; the fixed random_state makes
# the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [69]:
# Shapes of the full, training and test feature matrices, in that order.
print(*(part.shape for part in (X, Xtrain, Xtest)))

(891, 7) (712, 7) (179, 7)


<h3> Model Training </h3>

In [70]:
# Logistic regression with the library's default hyper-parameters, fitted on
# the training split only.
model = LogisticRegression()
model.fit(X=Xtrain, y=Ytrain)

<h3> Model Evaluation </h3>

In [71]:
# Accuracy on the training split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels are passed first. The value is the same either way for accuracy, but
# the call now follows the documented argument order.
trp = model.predict(Xtrain)
tra = accuracy_score(Ytrain, trp)
print(tra)

0.8033707865168539


In [72]:
# Accuracy on the held-out test split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels are passed first. The value is the same either way for accuracy, but
# the call now follows the documented argument order.
tp = model.predict(Xtest)
ta = accuracy_score(Ytest, tp)
print(ta)

0.7877094972067039
