In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
# Read the training and test CSV files (paths are relative to the notebook's
# working directory) into two DataFrames.
train_data, test_data = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

## Exploratory Data Analysis

In [82]:
# Number of missing (NaN) entries in each column of the training frame.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [83]:
# Number of missing (NaN) entries in each column of the test frame.
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Handling missing values in train_data

In [84]:
# Impute missing ages with the mean age of the passenger's class (Pclass).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `train_data.Age`: that chained in-place form emits a
# FutureWarning on pandas 2.x and leaves the frame unchanged when
# Copy-on-Write is enabled. Plain assignment is also safe to re-run.
train_data['Age'] = train_data['Age'].fillna(
    train_data.groupby('Pclass')['Age'].transform('mean')
)

## Implementing logistic regression from scratch

In [85]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s). Non-floating inputs are converted to float.

    Returns
    -------
    numpy scalar or ndarray
        Element-wise sigmoid with the same shape as ``z`` (a NumPy scalar
        when ``z`` is a scalar).

    Notes
    -----
    The naive formula evaluates ``exp(-z)``, which overflows (and raises a
    RuntimeWarning) for large negative ``z``. Here ``exp`` is only ever
    applied to non-positive numbers, so it cannot overflow.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)

    # exp(-|z|) lies in (0, 1], so neither branch below can overflow.
    exp_neg_abs = np.exp(-np.abs(z))
    result = np.where(
        z >= 0,
        1 / (1 + exp_neg_abs),            # z >= 0: 1 / (1 + e^-z)
        exp_neg_abs / (1 + exp_neg_abs),  # z <  0: e^z / (1 + e^z)
    )
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return result[()]

In [86]:
def calculate_loss(Y, y_hat, eps=1e-15):
    """Calculates the binary cross entropy loss.

    Parameters
    ----------
    Y : array-like
        True labels (0 or 1), one per sample.
    y_hat : array-like
        Predicted probabilities, one per sample.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs so
        that a prediction of exactly 0 or 1 yields a large finite loss
        instead of ``nan``/``inf``.

    Returns
    -------
    float
        Mean binary cross entropy over the ``len(Y)`` samples.
    """
    m = len(Y)
    # Flatten both inputs so a 1-D label vector paired with a column of
    # predictions is compared element-wise instead of broadcasting to (m, m).
    labels = np.asarray(Y, dtype=float).ravel()
    probs = np.clip(np.asarray(y_hat, dtype=float).ravel(), eps, 1 - eps)
    loss = (-1/m)*np.sum(labels*np.log(probs) + (1-labels)*np.log(1-probs))
    return loss

In [87]:
def gradient_descent(X, Y, num_iterations, learning_rate):
    """Fit logistic-regression parameters with batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (n_params, m)
        Design matrix with one column per sample. Predictions are computed
        as ``sigmoid(X.T @ theta)``, so a bias term must be supplied by the
        caller as a row of ones.
    Y : array-like with m elements
        Binary labels. Coerced to a column vector of shape (m, 1), so both
        1-D and column inputs are accepted.
    num_iterations : int
        Number of gradient steps to take.
    learning_rate : float
        Step size applied to each gradient update.

    Returns
    -------
    y_hat : ndarray of shape (m, 1)
        Predicted probabilities for the training samples under the final
        parameters.
    theta : ndarray of shape (n_params, 1)
        Fitted parameters.
    cost : float
        Binary cross entropy of ``y_hat`` against ``Y``.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: with a 1-D Y, `y_hat - Y` would broadcast to an
    # (m, m) matrix and silently corrupt theta.
    Y = np.asarray(Y, dtype=float).reshape(-1, 1)
    m = len(Y)

    theta = np.zeros((X.shape[0], 1))  # initializing weights and bias
    y_hat = sigmoid(np.dot(X.T, theta))

    for _ in range(num_iterations):
        gradient = (1/m)*np.dot(X, (y_hat - Y))
        theta = theta - learning_rate*gradient
        y_hat = sigmoid(np.dot(X.T, theta))

    # Only the final loss is returned, so compute it once rather than on
    # every iteration.
    cost = calculate_loss(Y, y_hat)

    return y_hat, theta, cost
    
    
    

## Training and evaluating on the real data

In [130]:
# Labels as an (m, 1) column vector, the shape gradient_descent works with.
Y = train_data['Survived'].values.reshape(-1,1)

# get_dummies only expands 'Sex' here; 'Pclass' passes through as a single
# numeric column (the resulting X has 4 rows: bias, Pclass, Sex dummy, Age).
# dtype=float matters: pandas >= 2.0 emits boolean dummy columns by default,
# and mixing bool with numeric columns makes the extracted array object-dtype,
# which NumPy ufuncs such as np.exp cannot process.
dummies = pd.get_dummies(train_data[['Pclass', 'Sex']], drop_first=True, dtype=float)
x = pd.concat([dummies, train_data['Age']], axis=1).to_numpy(dtype=float)

# Prepend a row of ones for the bias term; X has one column per sample.
X = np.vstack([np.ones(x.shape[0]), x.T])

In [131]:
# Sanity check: Y should be (m, 1) and X should be (n_params, m) with the same m.
Y.shape, X.shape

((891, 1), (4, 891))

In [140]:
# Fit the model: 50,000 gradient steps with a step size of 0.01.
y_hat, params, cost = gradient_descent(
    X,
    Y,
    num_iterations=50000,
    learning_rate=0.01,
)

In [171]:
# Turn probabilities into hard 0/1 labels: strictly above 0.5 becomes 1.
y_pred = (y_hat > 0.5).astype(int)

In [172]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [173]:
# Confusion matrix of the thresholded predictions against the labels Y.
# NOTE: y_pred comes from the same samples the model was fitted on, so this
# is an in-sample (training) result, not a held-out evaluation.
confusion_matrix(Y, y_pred)

array([[470,  79],
       [ 99, 243]])

In [174]:
# Per-class precision / recall / f1 on the training samples; print() is used
# so the multi-line report is shown as a formatted table.
print(classification_report(Y, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       549
           1       0.75      0.71      0.73       342

    accuracy                           0.80       891
   macro avg       0.79      0.78      0.79       891
weighted avg       0.80      0.80      0.80       891



In [175]:
# Overall accuracy on the training samples (in-sample; no held-out split is used here).
accuracy_score(Y, y_pred)

0.8002244668911336