Model That Predicts the Survivial of Passengers in the Titanic Dataset

In [1]:
# Import Necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Loading the Dataset
titanic_df = pd.read_csv('train.csv')

In [3]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# CHecking for missing data
titanic_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Filling the missing data in the 'Age' column with the mean age
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())

In [8]:
titanic_df['Age'].isnull().sum()

0

In [9]:
# Deleting the 'Cabin' column
titanic_df = titanic_df.drop('Cabin', axis =1 )

In [10]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [11]:
# Converting the 'Sex' column into dummies variable i.e 'male' = 1 and 'female' = 0
titanic_df['Sex'] = np.where(titanic_df['Sex'] == 'male', 1, 0)

In [12]:
titanic_df['Sex'].unique()

array([1, 0])

In [13]:
titanic_df['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

In [14]:
titanic_df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [15]:
# Formatting the 'Embarked' columns where 'C' = 1, 'Q' = 2, 'S' = 3 and the missing vairables to 0
titanic_df['Embarked'] = np.where(titanic_df['Embarked'] == 'C', 1, titanic_df['Embarked'])
titanic_df['Embarked'] = np.where(titanic_df['Embarked'] == 'Q', 2, titanic_df['Embarked'])
titanic_df['Embarked'] = np.where(titanic_df['Embarked'] == 'S', 3, titanic_df['Embarked'])

In [16]:
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(0)

  titanic_df['Embarked'] = titanic_df['Embarked'].fillna(0)


In [17]:
titanic_df['Embarked'].unique()

array([3, 1, 2, 0], dtype=int64)

In [18]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,3
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,3


In [19]:
# Standardizing the 'Age' and 'Fare' columns
scaler = MinMaxScaler()

In [20]:
titanic_df[['Age', 'Fare']] = scaler.fit_transform(titanic_df[['Age', 'Fare']])

In [21]:
titanic_df[['Age', 'Fare']]

Unnamed: 0,Age,Fare
0,0.271174,0.014151
1,0.472229,0.139136
2,0.321438,0.015469
3,0.434531,0.103644
4,0.434531,0.015713
...,...,...
886,0.334004,0.025374
887,0.233476,0.058556
888,0.367921,0.045771
889,0.321438,0.058556


In [22]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,0.271174,1,0,A/5 21171,0.014151,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.472229,1,0,PC 17599,0.139136,1
2,3,1,3,"Heikkinen, Miss. Laina",0,0.321438,0,0,STON/O2. 3101282,0.015469,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.434531,1,0,113803,0.103644,3
4,5,0,3,"Allen, Mr. William Henry",1,0.434531,0,0,373450,0.015713,3


In [23]:
# Droping all other none numeric column
titanic_model_df = titanic_df.drop(['Name', 'Ticket'], axis = 1)

In [24]:
titanic_model_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,0.271174,1,0,0.014151,3
1,2,1,1,0,0.472229,1,0,0.139136,1
2,3,1,3,0,0.321438,0,0,0.015469,3
3,4,1,1,0,0.434531,1,0,0.103644,3
4,5,0,3,1,0.434531,0,0,0.015713,3


In [25]:
# Spliting the dataset into train and test data
train_data, test_data = train_test_split(titanic_model_df, test_size=0.3, random_state=42)

In [26]:
x_train = train_data.drop('Survived', axis = 1)

In [27]:
y_train = train_data['Survived']

In [28]:
x_test = test_data.drop('Survived', axis = 1)

In [29]:
y_test = test_data['Survived']

In [30]:
# Creating and class instance for our LogisticRegression model
logreg = LogisticRegression()

In [31]:
# Training the model
logreg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Creating prediction using the model
prediction = logreg.predict(x_test)

Testing the Model

In [33]:
print("Classification-report: \n ", classification_report(y_test, prediction))

Classification-report: 
                precision    recall  f1-score   support

           0       0.82      0.83      0.82       157
           1       0.75      0.74      0.75       111

    accuracy                           0.79       268
   macro avg       0.78      0.78      0.78       268
weighted avg       0.79      0.79      0.79       268



In [34]:
print("Confusion-matrix: \n ", confusion_matrix(y_test, prediction))

Confusion-matrix: 
  [[130  27]
 [ 29  82]]


In [35]:
print("Accuracy-Score: \n", accuracy_score(y_test, prediction))

Accuracy-Score: 
 0.7910447761194029


Importing a  New Test Dataset Where The Survivial Column is Missing And Using The Model To Predict The Survivial Column

In [36]:
titanic_test_df = pd.read_csv('test.csv')

In [37]:
titanic_test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Preprocessing the New Test Dataset

In [38]:
titanic_test_model_df = titanic_test_df.drop(['Name', 'Ticket', 'Cabin'], axis = 1)

In [39]:
titanic_test_model_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [40]:
titanic_test_model_df['Sex'] = np.where(titanic_test_model_df['Sex'] == 'male', 1, 0)

In [41]:
titanic_test_model_df['Sex'].unique()

array([1, 0])

In [42]:
titanic_test_model_df.isnull().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [43]:
titanic_test_model_df['Age'] = titanic_test_model_df['Age'].fillna(titanic_test_model_df['Age'].mean())

In [44]:
titanic_test_model_df.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64

In [45]:
titanic_test_model_df['Fare'] = titanic_test_model_df['Fare'].fillna(titanic_test_model_df['Fare'].max())

In [46]:
titanic_test_model_df.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [47]:
titanic_test_model_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,Q
1,893,3,0,47.0,1,0,7.0,S
2,894,2,1,62.0,0,0,9.6875,Q
3,895,3,1,27.0,0,0,8.6625,S
4,896,3,0,22.0,1,1,12.2875,S


In [48]:
titanic_test_model_df['Embarked'] = np.where(titanic_test_model_df['Embarked'] == 'C', 1, titanic_test_model_df['Embarked'])
titanic_test_model_df['Embarked'] = np.where(titanic_test_model_df['Embarked'] == 'Q', 2, titanic_test_model_df['Embarked'])
titanic_test_model_df['Embarked'] = np.where(titanic_test_model_df['Embarked'] == 'S', 3, titanic_test_model_df['Embarked'])

In [49]:
titanic_test_model_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,2
1,893,3,0,47.0,1,0,7.0,3
2,894,2,1,62.0,0,0,9.6875,2
3,895,3,1,27.0,0,0,8.6625,3
4,896,3,0,22.0,1,1,12.2875,3


In [50]:
titanic_test_model_df[['Age', 'Fare']] = scaler.fit_transform(titanic_test_model_df[['Age', 'Fare']])

In [51]:
titanic_test_model_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,0.452723,0,0,0.015282,2
1,893,3,0,0.617566,1,0,0.013663,3
2,894,2,1,0.815377,0,0,0.018909,2
3,895,3,1,0.353818,0,0,0.016908,3
4,896,3,0,0.287881,1,1,0.023984,3


Predicting The Survivial Column

In [52]:
Survived = logreg.predict(titanic_test_model_df)

In [53]:
Survived

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [54]:
PassengerId = titanic_test_df['PassengerId']

In [55]:
Survived = pd.Series(Survived)

In [56]:
Survived

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Length: 418, dtype: int64

In [57]:
PassengerId = pd.Series(PassengerId)

In [58]:
PassengerId

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [59]:
Survived.shape

(418,)

In [60]:
PassengerId.shape

(418,)

In [61]:
predicted_df = pd.DataFrame({'PassengerId': PassengerId, 'Survivial': Survived} )

In [62]:
predicted_df

Unnamed: 0,PassengerId,Survivial
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
