In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [19]:
# Load the passenger data
passengers = pd.read_csv("passengers.csv")

# Update sex column to numerical
passengers["Sex"] = passengers["Sex"].apply(lambda x: 1 if x=="male" else 0)
# print(passengers.head())
# print(passengers.dtypes)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [5]:
# Fill the nan values in the age column
passengers["Age"] = passengers["Age"].fillna(passengers["Age"].mean())

# Create a first class column
passengers["FirstClass"] = passengers["Pclass"].apply(lambda x: 1 if x ==1 else 0)
# print(passengers.head())

In [20]:
# Create a second class column
passengers["SecondClass"] = passengers["Pclass"].apply(lambda x: 1 if x==2 else 0)
passengers.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,0


In [7]:
# Select the desired features
features = passengers[["Sex","Age", "FirstClass", "SecondClass"]]
survival = passengers["Survived"]

In [8]:
# Perform train, test, split
x_train, x_test, y_train, y_test = train_test_split(features,survival, train_size = 0.8, test_size = 0.2)
# Scale the feature data so it has mean = 0 and standard deviation = 1
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
# Create and train the model
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression()

In [10]:
# Score the model on the train data
print(model.score(x_train,y_train))

0.7851123595505618


In [11]:
# Score the model on the test data
print(model.score(x_test, y_test))


0.7877094972067039


In [12]:
# Analyze the coefficients
print(model.coef_)

[[-1.18312033 -0.45932349  1.04616646  0.47236332]]


In [13]:
# Sample passenger features
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You  = np.array([0.0,23.0,1.0,0.0])


In [14]:
# Combine passenger arrays
sample_passengers =np.array([Jack,Rose,You])

In [15]:
# Scale the sample passenger features
sample_passengers =scaler.transform(sample_passengers)
print(sample_passengers)

[[-1.36783323 -0.78553971 -0.57951264 -0.52098807]
 [ 0.73108328 -1.01390656  1.72558791 -0.52098807]
 [-1.36783323 -0.55717286  1.72558791 -0.52098807]]


In [16]:
# Make survival predictions!
prediction = model.predict(sample_passengers)
print(prediction)

[1 1 1]


In [17]:
prediction_proba = model.predict_proba(sample_passengers)
print(prediction_proba)

[[0.35729586 0.64270414]
 [0.34972172 0.65027828]
 [0.05246364 0.94753636]]
