In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [4]:
# Load passenger data
passengers=pd.read_csv(r'titanic.csv')
print(passengers.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [11]:
#Tidy the data
# Update sex column to numerical
#passengers["Sex"].replace({"male": 0, "female": 1}, inplace=True)
print(passengers.Sex)
# Fill the nan values in the age column
passengers["Age"].fillna(value=np.mean(passengers.Age), inplace=True)
print(passengers.Age)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: int64
0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64


In [13]:
#Add new class columns to perform predictions based on passenger class
# Add a first class column
passengers['FirstClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 1 else 0)

# Add a second class column
passengers['SecondClass'] = passengers['Pclass'].apply(lambda x: 1 if x == 2 else 0)
print(passengers.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex        Age  SibSp  \
0                            Braund, Mr. Owen Harris    0  22.000000      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.000000      1   
2                             Heikkinen, Miss. Laina    1  26.000000      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.000000      1   
4                           Allen, Mr. William Henry    0  35.000000      0   
5                                   Moran, Mr. James    0  29.699118      0   
6                            McCarthy, Mr. Timothy

In [14]:
# Select the desired columns/ features for prediction
features=passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival=passengers.Survived
# split the data for training / testing
train_x, test_x, train_y, test_y = train_test_split(features,survival,test_size = 0.2)
# Scale the data so it has mean of 0 and standard deviation of 1
scaler = StandardScaler()
scale_ftrain=scaler.fit_transform(train_x)
scale_ftest=scaler.fit_transform(test_x)

In [15]:
# Create and train the model for prediction
model = LogisticRegression()
model.fit(train_x, train_y)
# Score the model on the train data
print(model.score(train_x, train_y))

0.7907303370786517


In [16]:
# Score the model on the test data
print(model.score(test_x, test_y))
# Analyze the coefficients
print(model.coef_)
print(list(zip(['Sex','Age','FirstClass','SecondClass'],model.coef_[0])))

0.8212290502793296
[[ 2.39091213 -0.02743271  2.00966599  1.07580935]]
[('Sex', 2.390912134466147), ('Age', -0.027432705837715076), ('FirstClass', 2.0096659859787493), ('SecondClass', 1.0758093470660213)]


In [17]:
# Create sample passenger features
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
Belle = np.array([1.0,43.0,0.0,1.0])
# Combine passenger arrays
sample_passengers = np.array([Jack, Rose, Belle])
# Scale the sample passenger features
sample_passengers=scaler.transform(sample_passengers)
print(sample_passengers)


[[-0.83950099 -0.72815248 -0.59670814 -0.50174521]
 [ 1.19118383 -0.95535255  1.67586116 -0.50174521]
 [ 1.19118383  1.01371477 -0.59670814  1.99304346]]


In [18]:
# Predict Survival
print(model.predict(sample_passengers))
print(model.predict_proba(sample_passengers))

[0 1 1]
[[0.99427318 0.00572682]
 [0.01376623 0.98623377]
 [0.08830814 0.91169186]]
