# Logistic Regression on Titanic Dataset

## Import the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Download the Data

In [2]:
# Fetch the Titanic train/test CSV files. `-nc` (no-clobber) makes wget skip a
# file that is already present, so re-running this cell does not download again.
!wget -nc https://raw.githubusercontent.com/meetnandu05/LogisticRegression/master/titanic_train.csv
!wget -nc https://raw.githubusercontent.com/meetnandu05/LogisticRegression/master/titanic_test.csv

File ‘titanic_train.csv’ already there; not retrieving.

File ‘titanic_test.csv’ already there; not retrieving.



In [3]:
# Read the two downloaded CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic_train.csv', 'titanic_test.csv')
)

In [4]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Exploratory Data Analysis

### Missing Data

In [5]:
def impute_age(cols):
    """Return a passenger's age, filling a missing value based on passenger class.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order ``(Age, Pclass)`` -- e.g. one row of
        ``df[['Age', 'Pclass']]`` as handed over by ``DataFrame.apply(axis=1)``,
        or a plain list/tuple.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class value.
    """
    # Read both values by *position*. On a label-indexed Series, ``cols[0]``
    # is a label lookup that only works through pandas' deprecated positional
    # fallback (and fails outright when the index holds other integer labels),
    # so use ``.iloc`` for Series and plain indexing for everything else.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Hard-coded per-class fill values.
    # NOTE(review): presumably typical ages per class in the training data -- confirm.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [6]:
# Fill missing ages row by row in both datasets using impute_age.
for frame in (train, test):
    frame['Age'] = frame[['Age', 'Pclass']].apply(impute_age, axis=1)

In [7]:
# Remove the Cabin column from both frames. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-removed column.
train = train.drop(columns='Cabin', errors='ignore')
test = test.drop(columns='Cabin', errors='ignore')

# Drop the training rows that still contain a missing value.
# NOTE(review): `test` is not passed through dropna() -- confirm that is intended.
train = train.dropna()
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


## Converting Categorical Features

In [8]:
# Show each column's dtype and non-null count.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [9]:
# One-hot encode Sex and Embarked; drop_first=True omits the first category of
# each column. dtype='uint8' pins the indicators to numeric 0/1: newer pandas
# versions default to bool, which turns the mixed feature matrix into an
# object array that torch.tensor() cannot convert.
sex = pd.get_dummies(train['Sex'], drop_first=True, dtype='uint8')
embark = pd.get_dummies(train['Embarked'], drop_first=True, dtype='uint8')

# Replace the original text columns with the indicator columns; Name and
# Ticket are removed as well.
train = pd.concat(
    [train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket']), sex, embark],
    axis=1,
)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


## Building a Logistic Regression model

### Train Test Split

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. The target is kept as a one-column DataFrame.
# NOTE(review): PassengerId is still among the feature columns -- confirm it is
# meant to be a model input.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='Survived'),
    train[['Survived']],
    test_size=0.2,
    random_state=101,
)

In [11]:
# Print the shape of every split, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(711, 9)
(178, 9)
(711, 1)
(178, 1)


## Training and Predicting

In [12]:
import torch
import torch.nn as nn

# Record which device PyTorch reports: "cuda" when a CUDA GPU is available,
# otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [13]:
class ClassificationNN(nn.Module):
    """Small feed-forward classifier with sigmoid outputs.

    Layer stack: Linear(input_dim, 5) -> ReLU -> Linear(5, 3) -> Tanh
    -> Linear(3, output_dim) -> Sigmoid.
    """

    def __init__(self, input_dim, output_dim):
        """Build the layer stack for ``input_dim`` features and ``output_dim`` outputs."""
        super().__init__()
        layers = [
            nn.Linear(input_dim, 5),
            nn.ReLU(),
            nn.Linear(5, 3),
            nn.Tanh(),
            nn.Linear(3, output_dim),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the sigmoid outputs."""
        return self.model(x)

# Seed PyTorch's RNG before building the network so the random weight
# initialisation is the same after every kernel restart. 101 mirrors the
# random_state used for the train/test split.
torch.manual_seed(101)

# One input unit per feature column and a single output unit.
classification_nn_model = ClassificationNN(X_train.shape[1], 1)
print(classification_nn_model)

ClassificationNN(
  (model): Sequential(
    (0): Linear(in_features=9, out_features=5, bias=True)
    (1): ReLU()
    (2): Linear(in_features=5, out_features=3, bias=True)
    (3): Tanh()
    (4): Linear(in_features=3, out_features=1, bias=True)
    (5): Sigmoid()
  )
)


In [14]:
# Display the training feature matrix.
X_train

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S
307,308,1,17.0,1,0,108.9000,0,0,0
229,230,3,24.0,3,1,25.4667,0,0,1
82,83,3,24.0,0,0,7.7875,0,1,0
353,354,3,25.0,1,0,17.8000,1,0,1
707,708,1,42.0,0,0,26.2875,1,0,1
...,...,...,...,...,...,...,...,...,...
576,577,2,34.0,0,0,13.0000,0,0,1
840,841,3,20.0,0,0,7.9250,1,0,1
338,339,3,45.0,0,0,8.0500,1,0,1
524,525,3,24.0,0,0,7.2292,1,0,0


In [15]:
def train_model(X_train, Y_train, model, epochs=50000, verbose=True):
    """Train ``model`` with full-batch Adam (lr=0.01) on a binary cross-entropy loss.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; converted to a single float32 tensor.
    Y_train : pandas.DataFrame or pandas.Series
        Targets. A 1-D Series is reshaped to match the model output.
    model : torch.nn.Module
        Network to optimise; it is updated in place. ``nn.BCELoss`` expects
        its outputs to be probabilities.
    epochs : int
        Number of optimisation steps; each step uses the whole training set.
    verbose : bool
        When true, print the training loss about ten times over the run.

    Returns
    -------
    torch.nn.Module
        The same ``model`` object that was passed in.
    """
    loss_function = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Create the tensors on whatever device the model lives on, so a model
    # moved to the GPU trains without a device-mismatch error.
    device = next(model.parameters()).device
    # to_numpy(dtype=...) also copes with mixed bool/int/float columns, which
    # `.values` would turn into an object array that torch cannot convert.
    X = torch.tensor(X_train.to_numpy(dtype="float32"), dtype=torch.float, device=device)
    y = torch.tensor(Y_train.to_numpy(dtype="float32"), dtype=torch.float, device=device)

    # Integer logging interval (at least 1). The previous float expression
    # `epochs/10` produced an irregular cadence when epochs was not a multiple of 10.
    log_every = max(1, epochs // 10)

    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(X)
        # BCELoss requires identical shapes; align e.g. an (N,) Series target
        # with an (N, 1) model output. After the first step this is a no-op.
        if y.shape != output.shape and y.numel() == output.numel():
            y = y.reshape(output.shape)
        loss = loss_function(output, y)
        loss.backward()
        optimizer.step()
        if verbose and epoch % log_every == 0:
            print(("Epoch :{0:>6} ----> Training Loss: {1}").format(str(epoch), str(loss.item())))
    return model


In [16]:
# Train with train_model's defaults (epochs=50000, verbose=True).
# train_model returns the model it was given, so `trained_model` and
# `classification_nn_model` refer to the same network object.
trained_model = train_model(X_train, y_train, classification_nn_model)

Epoch :     0 ----> Training Loss: 0.7348843812942505
Epoch :  5000 ----> Training Loss: 0.4158627390861511
Epoch : 10000 ----> Training Loss: 0.41242316365242004
Epoch : 15000 ----> Training Loss: 0.4228770434856415
Epoch : 20000 ----> Training Loss: 0.420942485332489
Epoch : 25000 ----> Training Loss: 0.43776190280914307
Epoch : 30000 ----> Training Loss: 0.43002215027809143
Epoch : 35000 ----> Training Loss: 0.4129955768585205
Epoch : 40000 ----> Training Loss: 0.40352317690849304
Epoch : 45000 ----> Training Loss: 0.40009015798568726


## Evaluation

In [17]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

def print_metrics(X, Y, model=None, threshold=0.5):
    """Print classification metrics for a model's predictions on ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix to score.
    Y : pandas.DataFrame or pandas.Series
        True labels for the rows of ``X``.
    model : torch.nn.Module, optional
        Model to evaluate. Defaults to the notebook-level ``trained_model``
        so existing two-argument calls keep working.
    threshold : float, optional
        Scores greater than or equal to this value are labelled 1, all
        others 0. Defaults to 0.5.

    Prints the sklearn classification report followed by accuracy, precision,
    recall and F1-score. Returns nothing.
    """
    if model is None:
        model = trained_model

    # Flatten to 1-D so sklearn receives plain label vectors rather than
    # (N, 1) column vectors.
    y_true = np.ravel(Y)

    # Score on the model's own device and bring the result back to the CPU
    # before converting to NumPy (a CUDA tensor cannot be converted directly).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else "cpu"
    X_tensor = torch.tensor(X.to_numpy(dtype="float32"), dtype=torch.float, device=device)
    with torch.no_grad():  # inference only; no gradient graph is needed
        scores = model(X_tensor).cpu().numpy()

    predictions = (scores >= threshold).astype(int).ravel()

    print(classification_report(y_true, predictions))
    print("Accuracy:", accuracy_score(y_true, predictions))

    print()
    print("Precision:", precision_score(y_true, predictions))
    print("Recall:", recall_score(y_true, predictions))
    print("F1-Score:", f1_score(y_true, predictions))


# Print the same set of metrics for the training split and then the held-out
# split, each under its own banner and followed by a blank line.
for banner, features, labels in (
    ("------------------------------ Train ------------------------------", X_train, y_train),
    ("------------------------------ Test -------------------------------", X_test, y_test),
):
    print(banner)
    print_metrics(features, labels)
    print()


------------------------------ Train ------------------------------
              precision    recall  f1-score   support

           0       0.81      0.95      0.88       442
           1       0.89      0.64      0.75       269

    accuracy                           0.83       711
   macro avg       0.85      0.80      0.81       711
weighted avg       0.84      0.83      0.83       711

Accuracy: 0.8340365682137834

Precision: 0.8871794871794871
Recall: 0.6431226765799256
F1-Score: 0.7456896551724138

------------------------------ Test -------------------------------
              precision    recall  f1-score   support

           0       0.76      0.95      0.85       107
           1       0.89      0.55      0.68        71

    accuracy                           0.79       178
   macro avg       0.82      0.75      0.76       178
weighted avg       0.81      0.79      0.78       178

Accuracy: 0.7921348314606742

Precision: 0.8863636363636364
Recall: 0.5492957746478874
F1-Sco