# Predict Titanic Survival


In this project, Logistic Regression model is used to predict which passengers survived the sinking of the Titanic, based on their sex, age, and class.

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

## 1. Load the Data

In [27]:
# Load the passenger data
passengers = pd.read_csv("passengers.csv")
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
print(passengers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


## 2. Clean the Data

In [29]:
# Update sex column to numerical
passengers['Sex'] = passengers['Sex'].apply(lambda x: 1 if x == 'female' else 0)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [30]:
# Fill the nan values in the age column
print(sum(pd.isna(passengers['Age'])))
passengers.fillna(passengers['Age'].mean(), inplace = True)
print(sum(pd.isna(passengers['Age'])))

177
0


## 3. Select and Split the Data

In [31]:
# Create a first class column
passengers['FirstClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 1 else 0)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,29.6991,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,29.6991,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,29.6991,S,0


In [32]:
# Create a second class column
passengers['SecondClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 2 else 0)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,29.6991,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,29.6991,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,29.6991,S,0,0


In [33]:
# Select the desired features
features = passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival = passengers[['Survived']]

# Perform train, test, split
X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size = 0.8, random_state = 1314)

## 4. Normalize the Data

In [34]:
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## 5. Create and Evaluate the Model

In [35]:
# Create and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the model on the train data
score_train = model.score(X_train, y_train)
print(score_train)

# Score the model on the test data
score_test = model.score(X_test, y_test)
print(score_test)

# Analyze the coefficients
print(list(zip(['Sex','Age','FirstClass','SecondClass'],model.coef_[0])))

0.8146067415730337
0.7798036465638148
[('Sex', 1.4543839141740265), ('Age', -0.12435876534238305), ('FirstClass', 0.8269874621661778), ('SecondClass', 0.4558759089913142)]


Sex and age are teh most important features in predicting survival on the sinking of the Titanic.

## 6. Predict with the Model

In [36]:
# Sample passenger features
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Someone = np.array([0.0,25,0.0,2.0])

# Combine passenger arrays
sample_passengers = np.array([Jack, Rose, Someone])

# Scale the sample passenger features
sample_passengers = scaler.fit_transform(sample_passengers)
print(sample_passengers)

# Make survival predictions!
predictions = model.predict(sample_passengers)
print(predictions)

predic_proba = model.predict_proba(sample_passengers)
print(predic_proba)

[[-0.70710678 -0.20203051 -0.70710678 -0.70710678]
 [ 1.41421356 -1.1111678   1.41421356 -0.70710678]
 [-0.70710678  1.31319831 -0.70710678  1.41421356]]
[0 1 0]
[[0.92087625 0.07912375]
 [0.07597982 0.92402018]
 [0.8423324  0.1576676 ]]


Only Rose survived the sinking of Titanic.