# Logistic Regression on Titanic Dataset

## Import the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Download the Data

In [None]:
# Fetch the train and test CSVs into the working directory.
# -nc (no-clobber) makes wget skip a file that already exists locally.
!wget -nc https://raw.githubusercontent.com/meetnandu05/LogisticRegression/master/titanic_train.csv
!wget -nc https://raw.githubusercontent.com/meetnandu05/LogisticRegression/master/titanic_test.csv

File ‘titanic_train.csv’ already there; not retrieving.

File ‘titanic_test.csv’ already there; not retrieving.



In [None]:
# Read the two downloaded CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('titanic_train.csv', 'titanic_test.csv')
)

In [None]:
# Show the first five rows of the training frame as a quick sanity check.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Exploratory Data Analysis

### Missing Data

In [None]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value based on passenger class.

    Parameters
    ----------
    cols : pandas.Series, list or tuple
        Two values in the order ``(Age, Pclass)``, e.g. one row of
        ``frame[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise a hard-coded fill value:
    37 for class 1, 29 for class 2 and 24 for any other class.
    """
    # Read the two fields strictly by position. On a pandas Series with string
    # labels, `cols[0]` relies on a positional fallback that newer pandas
    # deprecates/removes, so use `.iloc` there; plain sequences index normally.
    row = cols.iloc if hasattr(cols, "iloc") else cols
    age, pclass = row[0], row[1]

    if not pd.isnull(age):
        return age

    # NOTE(review): the fill values are hard-coded; presumably per-class typical
    # ages taken from earlier exploration — confirm against the data.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [None]:
# Fill missing ages row by row in both frames using the class-based rule.
for frame in (train, test):
    frame['Age'] = frame[['Age', 'Pclass']].apply(impute_age, axis=1)

In [None]:
# Remove the Cabin column from both frames, then drop any training rows that
# still contain a missing value.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because 'Cabin' is already gone.
train = train.drop(columns='Cabin', errors='ignore').dropna()
test = test.drop(columns='Cabin', errors='ignore')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


## Converting Categorical Features

In [None]:
# List each column's dtype and non-null count to spot the text columns that
# still need encoding.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [None]:
# One-hot encode Sex and Embarked. drop_first=True omits the first level of
# each variable so the remaining indicator columns are not redundant.
# dtype=int forces 0/1 integer indicators; recent pandas versions default to
# booleans, which would not match the numeric table shown below.
sex = pd.get_dummies(train['Sex'], drop_first=True, dtype=int)
embark = pd.get_dummies(train['Embarked'], drop_first=True, dtype=int)

# Replace the encoded source columns and the free-text Name/Ticket columns with
# the indicator columns. The drop is not in-place, so `train` is only rebound
# once the whole expression has succeeded.
train = pd.concat(
    [train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket']), sex, embark],
    axis=1,
)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


## Building a Logistic Regression model

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors, then hold out 20% of the
# rows for evaluation. The fixed random_state makes the split repeatable.
# NOTE(review): only 'Survived' is removed, so PassengerId remains among the
# features — confirm that is intended.
features = train.drop('Survived', axis=1)
target = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=101
)

## Training and Predicting

In [None]:
class LogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    Features are standardized with the per-column mean and standard deviation
    learned in ``fit``. The same statistics are reused in ``predict`` so that
    training and inference share one scale and a single row can be scored.

    Parameters
    ----------
    lr : float, default 0.01
        Gradient-descent step size.
    epochs : int, default 10000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    params : ndarray of shape (n_features + 1, 1) or None
        Intercept followed by one weight per feature; ``None`` before ``fit``.
    classes_ : ndarray or None
        Sorted unique labels seen in ``fit``; ``None`` before ``fit``.
    mean_, std_ : ndarray or None
        Per-feature mean and standard deviation of the training data.
    """

    def __init__(self, lr=0.01, epochs=10000):
        # Hyper-parameters
        self.lr = lr
        self.epochs = epochs

        # Learned state, populated by fit()
        self.params = None
        self.classes_ = None
        self.mean_ = None
        self.std_ = None

    @staticmethod
    def normalize_data(X, mean=None, std=None):
        """Standardize the columns of ``X``.

        When ``mean`` / ``std`` are omitted they are computed from ``X`` itself
        along axis 0. Columns whose standard deviation is zero are divided by 1,
        so a constant feature maps to 0 instead of NaN.
        """
        if mean is None:
            mean = np.mean(X, 0)
        if std is None:
            std = np.std(X, 0)
        safe_std = np.where(np.asarray(std) == 0, 1.0, std)
        return (X - mean) / safe_std

    @staticmethod
    def sigmoid(x):
        """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

        ``logaddexp(0, -x)`` equals ``log(1 + exp(-x))`` but never forms the
        huge intermediate ``exp(-x)`` for very negative ``x``.
        """
        return np.exp(-np.logaddexp(0, -x))

    def probability(self, theta, x):
        """Return the predicted positive-class probability for rows ``x`` under ``theta``."""
        return self.sigmoid(np.dot(x, theta))

    def gradient(self, theta, X, y):
        """Return the mean log-loss gradient with respect to ``theta``."""
        m = X.shape[0]
        z = np.dot(X, theta)
        y_hat = self.sigmoid(z)
        return (1 / m) * np.dot(X.T, y_hat - y)

    def _design_matrix(self, X):
        """Standardize ``X`` with the training statistics and prepend a column of ones."""
        X = self.normalize_data(X, self.mean_, self.std_)
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def fit(self, X, y):
        """Learn the intercept and weights from ``X`` (n_samples, n_features) and labels ``y``.

        ``y`` may be any 1-D array-like holding at most two distinct labels;
        labels are encoded as 0/1 by their sorted order. Returns ``self``.

        Raises
        ------
        ValueError
            If ``y`` contains more than two distinct labels.
        """
        X = np.asarray(X, dtype=float)

        # Remember the training scale so predict() can apply the very same one.
        self.mean_ = np.mean(X, 0)
        self.std_ = np.std(X, 0)
        X = self._design_matrix(X)

        labels = np.asarray(y).ravel()
        self.classes_, encoded = np.unique(labels, return_inverse=True)
        if self.classes_.size > 2:
            raise ValueError(
                "LogisticRegression supports at most two classes, got %d"
                % self.classes_.size
            )
        # Column vector of 0/1 targets so it broadcasts against the predictions.
        y = encoded.reshape(-1, 1).astype(float)

        self.params = np.zeros((X.shape[1], 1))

        for _ in range(self.epochs):
            grad = self.gradient(self.params, X, y)
            self.params -= self.lr * grad

        return self

    def predict(self, X):
        """Return predicted labels for ``X`` as an array of shape (n_samples, 1).

        A row is assigned the second class when its linear score is positive,
        i.e. when the predicted probability exceeds 0.5.
        """
        X = self._design_matrix(np.asarray(X, dtype=float))

        scores = np.dot(X, self.params)
        indices = (scores > 0).astype(int)
        return self.classes_[indices]

In [None]:
# Train with the default hyper-parameters, then label both splits.
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
pred_train, pred_test = (logmodel.predict(split) for split in (X_train, X_test))

## Evaluation

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

def print_metrics(y_true, y_pred):
    """Print a classification report followed by four summary scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels for the same rows.

    The output is framed by a dashed rule above and below; nothing is returned.
    """
    rule = "--------------------------------------------------------"
    scorers = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1-Score:", f1_score),
    )

    print(rule)
    print(classification_report(y_true, y_pred))
    for caption, scorer in scorers:
        print(caption, scorer(y_true, y_pred))
    print(rule)

# Report the same metrics for both splits, separated by blank lines.
reports = (
    ("Train Dataset:", y_train, pred_train),
    ("Test Dataset:", y_test, pred_test),
)
for position, (heading, actual, predicted) in enumerate(reports):
    if position:
        print("\n\n")
    print(heading)
    print_metrics(actual, predicted)

Train Dataset:
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       442
           1       0.77      0.71      0.74       269

    accuracy                           0.81       711
   macro avg       0.80      0.79      0.80       711
weighted avg       0.81      0.81      0.81       711

Accuracy: 0.8115330520393812
Precision: 0.7732793522267206
Recall: 0.7100371747211895
F1-Score: 0.7403100775193798
--------------------------------------------------------



Test Dataset:
--------------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       107
           1       0.84      0.66      0.74        71

    accuracy                           0.81       178
   macro avg       0.82      0.79      0.80       178
weighted avg       0.82      0.81      0.81       178

Accuracy: 0.8146067415730337