In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:

# Read the raw passenger table from disk and preview the first rows
passengers = pd.read_csv("passengers.csv")
print(passengers.head())

# Encode Sex numerically (female -> 1, male -> 0); any value that is not one
# of these two keys becomes NaN, as Series.map does for unmapped entries
passengers = passengers.assign(
    Sex=passengers["Sex"].map({"female": 1, "male": 0})
)
print(passengers.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [3]:


# Fill missing ages with the mean age truncated to an integer.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on passengers['Age']: that chained in-place call
# operates on an intermediate object, raises a FutureWarning in pandas 2.x,
# and does not modify the DataFrame once Copy-on-Write is active, which
# would leave NaNs in the Age feature.
passengers['Age'] = passengers['Age'].fillna(int(passengers['Age'].mean()))
print(passengers['Age'].values)



[22.   38.   26.   35.   35.   29.   54.    2.   27.   14.    4.   58.
 20.   39.   14.   55.    2.   29.   31.   29.   35.   34.   15.   28.
  8.   38.   29.   19.   29.   29.   40.   29.   29.   66.   28.   42.
 29.   21.   18.   14.   40.   27.   29.    3.   19.   29.   29.   29.
 29.   18.    7.   21.   49.   29.   65.   29.   21.   28.5   5.   11.
 22.   38.   45.    4.   29.   29.   29.   19.   17.   26.   32.   16.
 21.   26.   32.   25.   29.   29.    0.83 30.   22.   29.   29.   28.
 17.   33.   16.   29.   23.   24.   29.   20.   46.   26.   59.   29.
 71.   23.   34.   34.   28.   29.   21.   33.   37.   28.   21.   29.
 38.   29.   47.   14.5  22.   20.   17.   21.   70.5  29.   24.    2.
 21.   29.   32.5  32.5  54.   12.   29.   24.   29.   45.   33.   20.
 47.   29.   25.   23.   19.   37.   16.   24.   29.   22.   24.   19.
 18.   19.   27.    9.   36.5  42.   51.   22.   55.5  40.5  29.   51.
 16.   30.   29.   29.   44.   40.   26.   17.    1.    9.   29.   45.
 29.  

In [4]:

# Create a first class column (1 when Pclass == 1, otherwise 0).
# A vectorised comparison replaces the row-by-row apply/lambda.
passengers["FirstClass"] = (passengers["Pclass"] == 1).astype("int64")

# Create a second class column (1 when Pclass == 2, otherwise 0)
passengers["SecondClass"] = (passengers["Pclass"] == 2).astype("int64")
print(passengers.head(10))

# Select the desired features
features = passengers[["Sex", "Age", "FirstClass", "SecondClass"]]
survival = passengers["Survived"]
print(features.head())

# Perform train, test, split (80% train / 20% test).
# random_state is fixed so the split, and therefore every score, coefficient
# and prediction derived from it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features, survival, test_size=0.2, random_state=42
)

# Scale the feature data so it has mean = 0 and standard deviation = 1.
# The scaler is fitted on the training split only and then applied to the
# test split, so the test rows do not influence the scaling statistics.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

# Create and train the model (a LogisticRegression classifier; the variable
# name `regressor` is kept because other cells refer to it)
regressor = LogisticRegression()
regressor.fit(X_train, y_train)

# Score the model on the train data
print(regressor.score(X_train, y_train))

# Score the model on the test data
print(regressor.score(X_test, y_test))


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   
5                                   Moran, Mr. James    0  29.0      0      0   
6                            McCarth

In [5]:

# Show the fitted coefficient matrix, then each weight of its first row
# paired with the feature name it corresponds to
print(regressor.coef_)
print([
    (label, weight)
    for label, weight in zip(['Sex','Age','FirstClass','SecondClass'], regressor.coef_[0])
])

# Hand-built sample passengers, values ordered as
# [Sex, Age, FirstClass, SecondClass]
Jack = np.array([0, 20, 0, 0], dtype=float)
Rose = np.array([1, 17, 1, 0], dtype=float)
You = np.array([0, 24, 1, 0], dtype=float)


[[ 1.25077059 -0.49831984  0.97668128  0.48889664]]
[('Sex', 1.250770594037442), ('Age', -0.49831984322532813), ('FirstClass', 0.9766812751116036), ('SecondClass', 0.4888966371842996)]


In [6]:

# Stack the three sample passengers into a single 2-D array, one row each
all_data = np.vstack((Jack, Rose, You))
print(all_data)

# Apply the already-fitted scaler to the samples before handing them to the model
sample_passengers = scale.transform(all_data)
print(sample_passengers)

# Predicted class per sample, followed by the per-class probabilities
print(regressor.predict(sample_passengers))
print(regressor.predict_proba(sample_passengers))


[[ 0. 20.  0.  0.]
 [ 1. 17.  1.  0.]
 [ 0. 24.  1.  0.]]
[[-0.74698519 -0.73596344 -0.56004744 -0.53403984]
 [ 1.33871463 -0.96426627  1.78556302 -0.53403984]
 [-0.74698519 -0.43155968  1.78556302 -0.53403984]]
[0 1 1]
[[0.88031487 0.11968513]
 [0.04661936 0.95338064]
 [0.46410981 0.53589019]]
