In [650]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

## Importing data into pandas dataframe

In [651]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [652]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [653]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Splitting the ticket info to two categories and normalizing the name

In [654]:
processed_train_df = pd.DataFrame()
processed_test_df = pd.DataFrame()

def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)                     
    return df
    
processed_train_df = preprocess(train_df)
processed_test_df = preprocess(test_df)

In [655]:
processed_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,NONE


In [656]:
processed_test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,892,3,Kelly Mr James,male,34.5,0,0,330911,7.8292,,Q,330911,NONE
1,893,3,Wilkes Mrs James Ellen Needs,female,47.0,1,0,363272,7.0,,S,363272,NONE
2,894,2,Myles Mr Thomas Francis,male,62.0,0,0,240276,9.6875,,Q,240276,NONE
3,895,3,Wirz Mr Albert,male,27.0,0,0,315154,8.6625,,S,315154,NONE
4,896,3,Hirvonen Mrs Alexander Helga E Lindqvist,female,22.0,1,1,3101298,12.2875,,S,3101298,NONE


now we have 2 dataframes for training and testing the model

## Preparing the train dataset

In [657]:
train_data = processed_train_df.drop(columns=["PassengerId", "Survived", "Ticket"], inplace=False)
train_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,3,Braund Mr Owen Harris,male,22.0,1,0,7.25,,S,21171,A/5
1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,71.2833,C85,C,17599,PC
2,3,Heikkinen Miss Laina,female,26.0,0,0,7.925,,S,3101282,STON/O2.
3,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,53.1,C123,S,113803,NONE
4,3,Allen Mr William Henry,male,35.0,0,0,8.05,,S,373450,NONE


In [658]:
dummies = pd.get_dummies(train_data["Sex"])
train_data = pd.concat([train_data, dummies], axis=1)
train_data.drop(["Sex"], axis=1, inplace=True)
train_data.head()

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_number,Ticket_item,female,male
0,3,Braund Mr Owen Harris,22.0,1,0,7.25,,S,21171,A/5,False,True
1,1,Cumings Mrs John Bradley Florence Briggs Thayer,38.0,1,0,71.2833,C85,C,17599,PC,True,False
2,3,Heikkinen Miss Laina,26.0,0,0,7.925,,S,3101282,STON/O2.,True,False
3,1,Futrelle Mrs Jacques Heath Lily May Peel,35.0,1,0,53.1,C123,S,113803,NONE,True,False
4,3,Allen Mr William Henry,35.0,0,0,8.05,,S,373450,NONE,False,True


In [659]:
dummies = pd.get_dummies(train_data["Embarked"])
train_data = pd.concat([train_data, dummies], axis=1)
train_data.drop("Embarked", axis=1, inplace=True)
train_data.head()

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Fare,Cabin,Ticket_number,Ticket_item,female,male,C,Q,S
0,3,Braund Mr Owen Harris,22.0,1,0,7.25,,21171,A/5,False,True,False,False,True
1,1,Cumings Mrs John Bradley Florence Briggs Thayer,38.0,1,0,71.2833,C85,17599,PC,True,False,True,False,False
2,3,Heikkinen Miss Laina,26.0,0,0,7.925,,3101282,STON/O2.,True,False,False,False,True
3,1,Futrelle Mrs Jacques Heath Lily May Peel,35.0,1,0,53.1,C123,113803,NONE,True,False,False,False,True
4,3,Allen Mr William Henry,35.0,0,0,8.05,,373450,NONE,False,True,False,False,True


In [660]:
train_data["male"] = train_data["male"].astype(int)
train_data["female"] = train_data["female"].astype(int)
train_data["C"] = train_data["C"].astype(int)
train_data["Q"] = train_data["Q"].astype(int)
train_data["S"] = train_data["S"].astype(int)


In [661]:
def count_cabins(cabin):
    if pd.isna(cabin):  
        return 0
    else:
        return cabin.count(' ') + 1  
    
train_data['num_cabins'] = train_data['Cabin'].apply(count_cabins)

In [662]:
def has_ticket_item(item):
    if item == "NONE":
        return 0
    else:
        return 1
    
train_data['has_ticket_item'] = train_data['Ticket_item'].apply(has_ticket_item)

In [663]:
def ticket_number_to_int(x):
    if x == "LINE":
        return 0
    else:
        return int(x)
    
train_data["Ticket_number"] = train_data["Ticket_number"].apply(ticket_number_to_int)
train_data["Ticket_number"] = train_data["Ticket_number"].astype(int)

In [664]:
for column in train_data.columns:
    if train_data[column].dtype == 'int64':
        train_data[column] = train_data[column].astype('float64')

### Dropping the name

In [665]:
train_data.drop(["Name"], inplace= True, axis= 1)

In [666]:
train_data = train_data[["Pclass", "female", "male", "Age", "SibSp", "Parch", "Fare", "num_cabins", "C", "Q", "S", "Ticket_number", "has_ticket_item"]]
train_data.head()

Unnamed: 0,Pclass,female,male,Age,SibSp,Parch,Fare,num_cabins,C,Q,S,Ticket_number,has_ticket_item
0,3.0,0.0,1.0,22.0,1.0,0.0,7.25,0.0,0.0,0.0,1.0,21171.0,1.0
1,1.0,1.0,0.0,38.0,1.0,0.0,71.2833,1.0,1.0,0.0,0.0,17599.0,1.0
2,3.0,1.0,0.0,26.0,0.0,0.0,7.925,0.0,0.0,0.0,1.0,3101282.0,1.0
3,1.0,1.0,0.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,1.0,113803.0,0.0
4,3.0,0.0,1.0,35.0,0.0,0.0,8.05,0.0,0.0,0.0,1.0,373450.0,0.0


# Results

In [667]:
train_data.head()

Unnamed: 0,Pclass,female,male,Age,SibSp,Parch,Fare,num_cabins,C,Q,S,Ticket_number,has_ticket_item
0,3.0,0.0,1.0,22.0,1.0,0.0,7.25,0.0,0.0,0.0,1.0,21171.0,1.0
1,1.0,1.0,0.0,38.0,1.0,0.0,71.2833,1.0,1.0,0.0,0.0,17599.0,1.0
2,3.0,1.0,0.0,26.0,0.0,0.0,7.925,0.0,0.0,0.0,1.0,3101282.0,1.0
3,1.0,1.0,0.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,1.0,113803.0,0.0
4,3.0,0.0,1.0,35.0,0.0,0.0,8.05,0.0,0.0,0.0,1.0,373450.0,0.0


In [668]:
train_data.dtypes

Pclass             float64
female             float64
male               float64
Age                float64
SibSp              float64
Parch              float64
Fare               float64
num_cabins         float64
C                  float64
Q                  float64
S                  float64
Ticket_number      float64
has_ticket_item    float64
dtype: object

In [669]:
train_data_survived = processed_train_df["Survived"]
train_data_survived = train_data_survived.astype("int64")

train_data_survived.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# Now, we can create the dataset and model

In [670]:
class CustomDataset(Dataset):
    def __init__(self, data, labels, train):
        self.data = torch.tensor(data.values)
        self.labels = torch.tensor(labels.values)

        if train:
            self.data = self.data[0:668]
            self.labels = self.labels[0:668]
        else:
            self.data = self.data[668:]
            self.labels = self.labels[668:]

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
    
    

In [671]:
train_dataset =  CustomDataset(train_data, train_data_survived, train= True)
validation_dataset =  CustomDataset(train_data, train_data_survived, train= False)

print(f"length of train dataset: {len(train_dataset)}\n length of val dataset: {len(validation_dataset)}")

length of train dataset: 668
 length of val dataset: 223


In [672]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=1, shuffle=True)

In [673]:
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(13, 4) 
        self.fc2 = nn.Linear(4, 1)  

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))  
        x = torch.sigmoid(self.fc2(x)) 
        return x


model = BinaryClassifier()

In [674]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [686]:
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        model.train()  
        train_loss = 0.0
        for inputs, targets in train_loader:
            inputs = inputs.float()
            targets = targets.float()
            inputs_normalized = (inputs - inputs.min()) / (inputs.max() - inputs.min())  
            targets = targets.view(-1, 1)
            optimizer.zero_grad() 
            outputs = model(inputs_normalized)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * inputs.size(0)
        
        train_loss = train_loss / len(train_loader.dataset)
        
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}')
        
        
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs = inputs.float()
                targets = targets.float()
                inputs_normalized = (inputs - inputs.min()) / (inputs.max() - inputs.min())  
                targets = targets.view(-1, 1)
                outputs = model(inputs_normalized)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)
                
                _, predicted = torch.max(outputs, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()
        
        val_loss = val_loss / len(test_loader.dataset)
        val_accuracy = correct / total
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

In [688]:
train_model(model, train_dataloader, validation_dataloader, criterion, optimizer, num_epochs=25)

RuntimeError: all elements of input should be between 0 and 1

# Testing the accuracy of model with unseen data (test_df)

## Applying same steps for the test data

In [None]:
test_data = processed_test_df.drop(columns=["PassengerId", "Ticket"], inplace= False)
test_data.head()

In [None]:
dummies = pd.get_dummies(test_data["Sex"])
test_data = pd.concat([test_data, dummies], axis=1)
test_data.drop(["Sex"], axis=1, inplace=True)

dummies = pd.get_dummies(test_data["Embarked"])
test_data = pd.concat([test_data, dummies], axis=1)
test_data.drop("Embarked", axis=1, inplace=True)

test_data["male"] = test_data["male"].astype(int)
test_data["female"] = test_data["female"].astype(int)
test_data["C"] = test_data["C"].astype(int)
test_data["Q"] = test_data["Q"].astype(int)
test_data["S"] = test_data["S"].astype(int)

def count_cabins(cabin):
    if pd.isna(cabin):  
        return 0
    else:
        return cabin.count(' ') + 1  
    
test_data['num_cabins'] = test_data['Cabin'].apply(count_cabins)

def has_ticket_item(item):
    if item == "NONE":
        return 0
    else:
        return 1
    
test_data['has_ticket_item'] = test_data['Ticket_item'].apply(has_ticket_item)

def ticket_number_to_int(x):
    if x == "LINE":
        return 0
    else:
        return int(x)
    
test_data["Ticket_number"] = test_data["Ticket_number"].apply(ticket_number_to_int)
test_data["Ticket_number"] = test_data["Ticket_number"].astype(int)

for column in test_data.columns:
    if test_data[column].dtype == 'int64':
        test_data[column] = test_data[column].astype('float64')

test_data.drop(["Name"], inplace= True, axis= 1)




In [None]:
test_data = test_data[["Name", "Pclass", "female", "male", "Age", "SibSp", "Parch", "Fare", "num_cabins", "C", "Q", "S", "Ticket_number", "has_ticket_item"]]
test_data.head()