In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import numpy as np
import os

# Load the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../data/train.csv')
# Mid-cell expression: only a cell's last expression is rendered, so this shows nothing.
df.head()

# Keep the target apart from the inputs: X holds every column except
# 'Survived', y is a one-column frame holding only 'Survived'.
X = df.drop(columns='Survived')
y = df.loc[:, ['Survived']]

In [2]:
# Hold out 20% of the rows for validation (test_size is the validation share).
# The fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [3]:
# Sanity check: the row counts (first entry of each shape) must agree within
# the train pair and within the validation pair.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((712, 11), (179, 11), (712, 1), (179, 1))

In [4]:
# Re-attach the target column to each feature table, giving one combined
# frame per split (training and validation).
df_train = pd.concat([X_train, y_train], axis='columns')
df_val = pd.concat([X_val, y_val], axis='columns')

In [5]:
# Start of the EDA: count survivors and non-survivors in the training split.
# The counts are computed once and looked up by label (1 = survived, 0 = died).
outcome_counts = df_train['Survived'].value_counts()
survived = outcome_counts[1]
died = outcome_counts[0]
labels = ['Survived', 'Died']
height = [survived, died]

#plt.bar(labels, height)
#plt.show()
#plt.close();

In [6]:
# Share of first-class passengers in the training split who survived.
class1 = df_train['Pclass'].value_counts()[1]

# One boolean mask selects rows that are both survivors and first class.
is_class1_survivor = (df_train['Survived'] == 1) & (df_train['Pclass'] == 1)
class1_survive = df_train.loc[is_class1_survivor, 'Pclass'].value_counts()[1]

class1_ratio = class1_survive / class1

# Roughly 62% of Class 1 passengers survived
# Q: How do the other 2 look?

In [7]:
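# Disabled plot: passenger counts per Sex, split by Pclass.
# (Uses df_train; no frame named `train` is defined in this notebook.)
#df_train.groupby(['Sex', 'Pclass']).size().unstack(level=1).plot(kind='bar')
#plt.show()
#plt.close();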

In [8]:
# Collect the ages of survivors and of non-survivors, as a first step towards
# comparing the average age of each group.
survived_total = df_train.loc[df_train['Survived'] == 1, 'Age']
died_total = df_train.loc[df_train['Survived'] == 0, 'Age']

In [9]:
# Stacked age histogram of the two groups for a visual comparison.
# The figure is built and then closed; with plt.show() commented out nothing
# is rendered.
fig, ax = plt.subplots()

ax.hist(
    [died_total, survived_total],
    bins=8,
    range=(0.5, 80),
    stacked=True,
)
ax.legend(('Died', 'Survived'), loc='best')
#plt.show()
plt.close();

In [10]:
# Row counts for every (Sex, Pclass, Survived) combination in the training
# split, stored as a one-column DataFrame for closer inspection.
group_sizes = df_train.groupby(['Sex', 'Pclass', 'Survived'])['Survived'].count()
train_group_by = group_sizes.to_frame()
#train_group_by

### Making the Logistic Regression

In [11]:
# Single-feature design matrix (2-D) and target vector (1-D) for the first model.
# `y` is reused by the later model-fitting cells.
X = df_train.loc[:, ['Pclass']]
y = df_train.loc[:, 'Survived']

X.shape, y.shape

((712, 1), (712,))

In [12]:
# Fit a logistic regression with default settings on the 'Pclass' feature.
# fit() returns the estimator itself, so the cell still displays the model.
model_LR = LogisticRegression().fit(X, y)
model_LR

In [13]:
# Inspect the fitted parameters of the logistic regression:
# w_1 is the coefficient row for the feature(s), w_0 the intercept.
w_1, w_0 = model_LR.coef_[0], model_LR.intercept_
print(f'Model feature coefficient : {w_1}\nModel intercept/bias: {w_0}')

Model feature coefficient : [-0.79538448]
Model intercept/bias: [1.37307183]


In [14]:
# Class predictions of the fitted model on its own training features,
# wrapped in a DataFrame for inspection.
train_labels = model_LR.predict(X)
y_pred = pd.DataFrame(train_labels)

In [15]:
# Score the single-feature model on the training and validation splits.
# We can do better...
# NOTE: this rebinds X_val, replacing the full validation feature table from
# the split with just the 'Pclass' column.
X_val = df_val.loc[:, ['Pclass']]
score_report = f'train: {model_LR.score(X,y)}, test: {round(model_LR.score(X_val,y_val),3)}'
print(score_report)

train: 0.6657303370786517, test: 0.732


### Comparing predicted probabilities against an adjustable threshold

In [16]:
# Per-class probabilities for the validation rows, rounded to 3 decimals.
# Columns are labelled with the model's class values.
X_val2 = df_val.loc[:, ['Pclass']]
estim_prob = pd.DataFrame(
    data=model_LR.predict_proba(X_val2).round(3),
    columns=model_LR.classes_,
)

In [17]:
threshold = 0.5 #<--- Change threshold here
# Label a row 1 when its class-1 probability reaches the threshold, else 0.
pred = [1 if prob >= threshold else 0 for prob in estim_prob[1]]

estim_prob['prediction']=pred

In [18]:
# Accuracy at the chosen threshold: matching rows divided by validation size.
# The validation index is reset so labels align row by row with estim_prob,
# which has a fresh 0..n-1 index.
actual = y_val['Survived'].reset_index(drop=True)
n_correct = (actual == estim_prob['prediction']).sum()
n_correct / len(y_val)

0.7318435754189944

### Trying out multiple features

In [19]:
# Two numeric features this time. Optimally other features would be used too,
# but they need to be transformed first.
two_feats = ['Pclass', 'Fare']
X1 = df_train[two_feats]
y1 = df_train['Survived']
X1_val = df_val[two_feats]
X1.shape, y1.shape

((712, 2), (712,))

In [20]:
# Fit a default logistic regression on the two-feature training matrix.
# fit() returns the estimator, so the cell still displays the model.
model_2_feats = LogisticRegression().fit(X1, y1)
model_2_feats

In [21]:
# Accuracy of the two-feature model on the training and validation splits.
two_feat_report = f'train: {model_2_feats.score(X1,y1)}, test: {round(model_2_feats.score(X1_val,y_val),3)}'
print(two_feat_report)

train: 0.6657303370786517, test: 0.732


### 2.6 Feature Engineering

In [22]:
# Column dtypes as plain text, then the first five rows as the cell's output.
column_types = df_train.dtypes
print(column_types)
df_train.head(n=5)

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Survived         int64
dtype: object


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
57,58,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C,0
717,718,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27.0,0,0,34218,10.5,E101,S,1
431,432,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S,1
633,634,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,S,0
163,164,3,"Calic, Mr. Jovo",male,17.0,0,0,315093,8.6625,,S,0


In [23]:
# Data types
# PassengerId is categorical
# Pclass is categorical
# Name is categorical
# Sex is binary (in this case)
# Age is metric
# SibSp is categorical
# Parch is categorical
# Ticket is categorical
# Fare is metric
# Cabin is categorical
# Embarked is categorical
# Survived is binary

In [24]:
# Count the missing values in each column of the training frame.
df_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            135
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          554
Embarked         2
Survived         0
dtype: int64

In [25]:
# What to do?

# Drop unneeded columns
# One-hot encode (OHE) Pclass
# Impute and bin Fare
# Impute and bin Age
# Impute and OHE Embarked
#   (note: the cells below drop Embarked and apply impute + OHE to Sex instead)

In [26]:
# Keep only the columns the preprocessing below consumes (PassengerId, Pclass,
# Sex, Age, Fare). 'Survived' is dropped as well; the target was already
# captured in `y` in an earlier cell.
# errors='ignore' makes this cell idempotent: on a re-run the columns are
# already gone and a plain drop would raise KeyError.
df_train = df_train.drop(
    columns=['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Survived'],
    errors='ignore',
)
df_train

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare
57,58,3,male,28.5,7.2292
717,718,2,female,27.0,10.5000
431,432,3,female,,16.1000
633,634,1,male,,0.0000
163,164,3,male,17.0,8.6625
...,...,...,...,...,...
369,370,1,female,24.0,69.3000
320,321,3,male,22.0,7.2500
527,528,1,male,,221.7792
125,126,3,male,12.0,11.2417


In [27]:
# Create pipeline for imputing and OHE: fill gaps with the most frequent
# category, then one-hot encode into a dense array.
# `sparse_output` replaces the `sparse` keyword, which scikit-learn deprecated
# in 1.2 and removed in 1.4. The ColumnTransformer cell already uses the new
# name for its Pclass encoder, so this keeps the notebook consistent.
impute_and_ohe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False),
)

In [28]:
# Pipeline for numeric columns: fill gaps with the median, then discretise
# into bins emitted as dense one-hot columns.
median_imputer = SimpleImputer(strategy='median')
one_hot_binner = KBinsDiscretizer(encode='onehot-dense')
impute_and_binning = make_pipeline(median_imputer, one_hot_binner)

In [29]:
# Column-wise preprocessing:
#   'ohe' -> one-hot encode Pclass
#   'i_o' -> impute + one-hot encode Sex
#   'i_b' -> impute + bin Age and Fare
# NOTE(review): remainder='passthrough' forwards every other column seen at
# fit time unchanged. With the frame fitted below that is PassengerId, which
# therefore becomes a model input; later cells read it back as
# 'remainder__PassengerId'. Consider excluding it from the features.
pclass_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
trans = ColumnTransformer(
    transformers=[
        ('ohe', pclass_encoder, ['Pclass']),
        ('i_o', impute_and_ohe, ['Sex']),
        ('i_b', impute_and_binning, ['Age', 'Fare']),
    ],
    remainder='passthrough',
)
trans

In [30]:
# Learn the preprocessing from the training frame and transform it in one
# step, then show the result with the generated feature names as columns.
df_train_tran = trans.fit_transform(df_train)
pd.DataFrame(data=df_train_tran, columns=trans.get_feature_names_out())



Unnamed: 0,ohe__Pclass_1,ohe__Pclass_2,ohe__Pclass_3,i_o__Sex_female,i_o__Sex_male,i_b__Age_0.0,i_b__Age_1.0,i_b__Age_2.0,i_b__Age_3.0,i_b__Fare_0.0,i_b__Fare_1.0,i_b__Fare_2.0,i_b__Fare_3.0,i_b__Fare_4.0,remainder__PassengerId
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,58.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,718.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,432.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,634.0
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,164.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,370.0
708,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,321.0
709,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,528.0
710,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,126.0


In [31]:
# New logistic regression model, fitted on the transformed training data.
# `y` is still the 'Survived' series taken from df_train in the earlier cell.
model_trans = LogisticRegression()
X3 = df_train_tran
model_trans.fit(X3, y)

In [32]:
# Training accuracy of the transformed-feature model, rounded to 3 decimals.
train_accuracy = model_trans.score(X3, y)
round(train_accuracy, 3)

0.794

In [33]:
# Push the validation frame through the already-fitted transformer (no refit)
# so it has the same column layout as the training matrix.
df_val_tran = trans.transform(df_val)
val_feature_names = trans.get_feature_names_out()
pd.DataFrame(df_val_tran, columns=val_feature_names)

Unnamed: 0,ohe__Pclass_1,ohe__Pclass_2,ohe__Pclass_3,i_o__Sex_female,i_o__Sex_male,i_b__Age_0.0,i_b__Age_1.0,i_b__Age_2.0,i_b__Age_3.0,i_b__Fare_0.0,i_b__Fare_1.0,i_b__Fare_2.0,i_b__Fare_3.0,i_b__Fare_4.0,remainder__PassengerId
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,591.0
1,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,132.0
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,629.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,196.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,231.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,457.0
175,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,192.0
176,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,604.0
177,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,95.0


In [34]:
# Validation accuracy of the transformed-feature model, rounded to 3 decimals.
val_accuracy = model_trans.score(df_val_tran, y_val)
round(val_accuracy, 3)

0.832

In [35]:
# Final predicted labels for the validation split, wrapped in a DataFrame.
val_labels = model_trans.predict(df_val_tran)
val_pred = pd.DataFrame(val_labels)
#val_pred

# Checking if a Random Forest would improve performance or not

In [36]:
# Shallow random forest (50 trees, depth 2) scored on the validation split for
# comparison with the logistic regression.
# random_state pins the forest's randomness so the score is reproducible
# across runs; 10 mirrors the seed used for the train/validation split.
rf = RandomForestClassifier(n_estimators=50, max_depth=2, random_state=10)
rf.fit(X3, y) # Same variables used to fit the earlier logistic regression
rf.score(df_val_tran, y_val)

0.8100558659217877

# Test data time!

In [37]:
# Load the test CSV from the same data folder as the training CSV.
test = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [38]:
# Apply the fitted transformer to the test frame and label the resulting
# columns with the transformer's feature names.
test_tran = pd.DataFrame(
    data=trans.transform(test),
    columns=trans.get_feature_names_out(),
)
# Passed-through passenger ids, kept to key the predictions.
test_tran_id = test_tran['remainder__PassengerId']

In [39]:
# Predict on the raw values: model_trans was fitted on a NumPy array, so
# passing the column-labelled DataFrame makes scikit-learn warn that X has
# feature names the estimator was fitted without. The predictions are the same.
final_pred = pd.DataFrame(model_trans.predict(test_tran.to_numpy()))
# Both sides carry the default 0..n-1 index, so an index join pairs each
# passenger id with its prediction.
final_pred = pd.merge(test_tran_id, final_pred, left_index=True, right_index=True)



In [40]:
# Give the two columns their final names: the passed-through id column and
# the unnamed (0) prediction column.
final_pred = final_pred.rename(
    columns={'remainder__PassengerId': 'PassengerId', 0: 'Survived'}
)

In [41]:
# PassengerId came out of the transformer as a float column; store it as
# integers again before display and export.
final_pred = final_pred.assign(PassengerId=final_pred['PassengerId'].astype(int))
final_pred

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [43]:
# Write the predictions to a CSV in the working directory, without the index column.
final_pred.to_csv(path_or_buf='first_prediction.csv', index=False)