# Predict Titanic Survival

### Imports

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Load data

In [5]:
# Read the passenger records; the path is resolved relative to the
# current working directory.
passengers = pd.read_csv(filepath_or_buffer="passengers.csv")
# Plain-text dump of the frame for a first look at the available columns.
print(passengers)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

### Clean data

In [6]:
# Encode sex numerically (female -> 1, male -> 0). Values that are not keys of
# the mapping are passed through unchanged, so re-running this cell leaves an
# already-encoded column intact instead of turning it into NaN.
sex_codes = {'female': 1, 'male': 0}
passengers['Sex'] = passengers['Sex'].map(lambda sex: sex_codes.get(sex, sex))

# Fill missing ages with the mean age. The result is assigned back to the
# column: calling fillna(..., inplace=True) on the passengers['Age'] selection
# is chained assignment, which pandas warns about and which does not update
# the DataFrame once copy-on-write is enabled.
passengers['Age'] = passengers['Age'].fillna(passengers['Age'].mean())

# Indicator columns for passenger class: 1 when Pclass equals that class,
# 0 otherwise. Any other class (e.g. third) ends up with 0 in both columns.
passengers['FirstClass'] = (passengers['Pclass'] == 1).astype(int)
passengers['SecondClass'] = (passengers['Pclass'] == 2).astype(int)

### Select and Split the Data

In [7]:
# Select the desired features
features = passengers[["Sex", "Age", "FirstClass", "SecondClass"]]
survival =  passengers['Survived']

### Perform the train/test split

In [8]:
# Hold out 20% of the rows for testing. A fixed random_state makes the
# shuffle, and therefore the scores reported below, repeat on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, survival, test_size=0.2, random_state=42
)

### Normalize the data

In [9]:
# Standardize the features to mean = 0 and standard deviation = 1.
# The scaling statistics are learned from the training split only.
scaler = StandardScaler().fit(X_train)
train_features = scaler.transform(X_train)
# Reuse those same training statistics on the held-out split.
test_features = scaler.transform(X_test)

### Create and train the model

In [10]:
model = LogisticRegression()
# To .fit() the model to training data:
model.fit(X_train, y_train)

### Score the model on the train data

Scoring the model on the training data will run the data through the model and make final classifications on survival for each passenger in the training set. The score returned is the percentage of correct classifications, or the accuracy.

In [11]:
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)

# Score the model on the test data
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)

0.7963483146067416
0.7988826815642458


### Analyze the coefficients

In [12]:
# Pair each feature name with its fitted coefficient; the names are listed in
# the same order as the columns the model was trained on.
feature_names = ['Sex', 'Age', 'FirstClass', 'SecondClass']
print(list(zip(feature_names, model.coef_[0])))

[('Sex', 2.4777699639163084), ('Age', -0.027066350910048366), ('FirstClass', 2.153739030084268), ('SecondClass', 1.1348530794502656)]


### Predict with the Model

In [13]:
# Hand-written passengers, in the feature order used for training:
# [Sex, Age, FirstClass, SecondClass]
Jack = np.array([0.0, 20.0, 0.0, 0.0])
Rose = np.array([1.0, 17.0, 1.0, 0.0])
# You = np.array([___,___,___,___])

# Stack the individual rows into one 2-D array and run them through the
# scaler that was fitted on the training data.
sample_passengers = scaler.transform(np.vstack((Jack, Rose)))

# Class predictions for each sample passenger
prediction = model.predict(sample_passengers)
print(prediction)
# Class probabilities, one row per sample passenger
prediction_prob = model.predict_proba(sample_passengers)
print(prediction_prob)

[0 1]
[[0.99382368 0.00617632]
 [0.00578585 0.99421415]]


