In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
class TitanicData(Dataset):
    """
    Custom dataset class for handling the Titanic survivor dataset.
    """

    def __init__(self, df_path, current_fold, num_fold=5):
        """
        Load the Titanic dataset and perform preprocessing.

        Args:
            df_path (str): Path to the Titanic dataset CSV file.
            current_fold (int): The current fold of the dataset.
            num_fold (int): The total number of folds to split the dataset into.
        """
        super().__init__()

        self.df = self.preprocess_data(df_path)
        self.num_fold = num_fold
        self.current_fold = current_fold

        # Use KFold to split the dataset into 'num_fold' folds
        self.kf = KFold(n_splits=num_fold, shuffle=True, random_state=42)

    def preprocess_data(self, df_path):
        """
        Reads the Titanic dataset CSV file and performs preprocessing.

        Args:
            df_path (str): Path to the Titanic dataset CSV file.

        Returns:
            DataFrame: A Pandas DataFrame containing the preprocessed Titanic dataset.
        """
        df = pd.read_csv(df_path)

        # Handle missing values
        median_age = df['Age'].median()
        df['Age'].fillna(median_age, inplace=True)

        mode_embarked = df['Embarked'].mode()[0]
        df['Embarked'].fillna(mode_embarked, inplace=True)

        median_fare = df['Fare'].median()
        df['Fare'].fillna(median_fare, inplace=True)

        mode_pclass = df['Pclass'].mode()[0]
        df['Pclass'].fillna(mode_pclass, inplace=True)

        df.drop(columns=['Cabin'], inplace=True)
        df.dropna(subset=['Embarked'], inplace=True)

        # Create LabelEncoder instances for 'Sex' and 'Embarked' columns
        sex_label_encoder = LabelEncoder()
        embarked_label_encoder = LabelEncoder()

        # Label encode the categorical columns
        df['Sex'] = sex_label_encoder.fit_transform(df['Sex'])
        df['Embarked'] = embarked_label_encoder.fit_transform(df['Embarked'])

        # Create a binary 'Survived' column
        df['Survived'] = df['Survived'].astype(int)

        return df

    def __len__(self):
        """
        Returns the length of the dataset.
        """
        return len(self.df)

    def __getitem__(self, idx):
        """
        Retrieves the features and label at the given index.

        Args:
            idx (int): The index of the dataset element to retrieve.

        Returns:
            dict: A dictionary containing the features and label of the dataset at the given index.
        """

        # Extract passenger data
        row = self.df.iloc[idx]
        pclass = row['Pclass'] # Passenger class
        sex = row['Sex']
        age = row['Age']
        sibsp = row['SibSp'] # Number of siblings/spouses aboard
        parch = row['Parch'] # Number of parents/children aboard
        fare = row['Fare']
        embarked = row['Embarked'] # Port of embarkation

        # Create feature tensor
        features = torch.tensor([pclass, sex, age, sibsp, parch, fare, embarked], dtype=torch.float)

        # Create label tensor
        label = torch.tensor(row['Survived'], dtype=torch.long)

        return {
            'features': features,
            'label': label,
        }

    def get_splits(self):
        """
        Splits the dataset into training and validation subsets.

        Returns:
            tuple: A tuple containing the training and validation subsets.
        """

        fold_data = list(self.kf.split(self.df))
        train_indices, val_indices = fold_data[self.current_fold]

        train_data = self._get_subset(train_indices)
        val_data = self._get_subset(val_indices)

        return train_data, val_data

    def _get_subset(self, indices):
        """
        Returns a Subset of the dataset at the given indices.

        Args:
            indices (list): A list of indices specifying the subset of the dataset to return.

        Returns:
            Subset: A Subset of the dataset at the given indices.
        """
        return Subset(self, indices)

In [4]:
import torch.nn.functional as F

class SimpleFeedForwardNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleFeedForwardNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

class Config:
    input_size = 7  # Number of input features
    hidden_size = 32  # Size of the hidden layer
    output_size = 1
    learning_rate = 0.003
    num_epochs = 256
    batch_size = 64
    num_fold = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

criterion = nn.BCEWithLogitsLoss()

fold_results = []  # Store validation results for each fold
fold_models = [] # Store model trained on each fold
for fold in range(Config.num_fold):
    # Create data handler for the current fold
    data_handler = TitanicData(df_path='titanic.csv', current_fold=fold, num_fold=Config.num_fold)

    # Split the dataset into training and validation subsets
    train_data, val_data = data_handler.get_splits()

    # Create data loaders using the batch_size from the Config class
    train_loader = DataLoader(train_data, batch_size=Config.batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=Config.batch_size)

    # Create a new instance of the model for each fold
    model = SimpleFeedForwardNN(Config.input_size, Config.hidden_size, Config.output_size)
    model.to(device)

    # Define a new optimizer for each fold to reset the model parameters
    optimizer = optim.Adam(model.parameters(), lr=Config.learning_rate)

    # Training loop for the current fold
    for epoch in range(Config.num_epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            
            # Retrieve features and labels from the current batch
            features = batch['features'].to(device)
            labels = batch['label'].float().view(-1, 1).to(device)  # Reshape labels to (batch_size, 1)

            # Forward pass
            outputs = model(features)

            # Calculate the loss using BCEWithLogitsLoss
            loss = criterion(outputs, labels)

            # Backpropagation and optimization with gradient clipping
            optimizer.zero_grad()
            loss.backward()

            if torch.isnan(loss):
                print("NaN loss detected. Check your data and training settings.")
                break

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Add gradient clipping
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)

        # Validation for the current fold
        model.eval()
        val_total_loss = 0.0
        all_labels = []
        all_predictions = []
        with torch.no_grad():
            for batch in val_loader:
                # Your validation code here
                features = batch['features'].to(device)
                labels = batch['label'].float().view(-1, 1).to(device)  # Convert labels to float for BCE loss

                # Forward pass
                outputs = model(features)
                #print(outputs)

                val_loss = criterion(outputs, labels)
                val_total_loss += val_loss.item()

                predictions = (outputs>0.5).float()
                #print(predictions)

                all_labels.extend(labels.tolist())
                all_predictions.extend(predictions.tolist())

        average_val_loss = val_total_loss / len(val_loader)

        # Calculate accuracy for the current fold
        accuracy = accuracy_score(all_labels, all_predictions)
        print(f'Fold [{fold + 1}/{Config.num_fold}] - Epoch [{epoch + 1}/{Config.num_epochs}] - Loss: {average_loss:.4f} - Validation Loss: {average_val_loss:.4f} - Validation Accuracy: {accuracy:.4f}')

    # Store validation results for the current fold
    fold_results.append(accuracy)

    # Save the model for the current fold
    fold_models.append(model)

# Calculate and print the average validation accuracy across all folds
average_accuracy = sum(fold_results) / len(fold_results)
print(f'Average Validation Accuracy: {average_accuracy:.4f} across {Config.num_fold} folds')

Fold [1/3] - Epoch [1/256] - Loss: 0.7856 - Validation Loss: 0.7119 - Validation Accuracy: 0.5960
Fold [1/3] - Epoch [2/256] - Loss: 0.7031 - Validation Loss: 0.6949 - Validation Accuracy: 0.5960
Fold [1/3] - Epoch [3/256] - Loss: 0.6942 - Validation Loss: 0.6931 - Validation Accuracy: 0.5960
Fold [1/3] - Epoch [4/256] - Loss: 0.6930 - Validation Loss: 0.6927 - Validation Accuracy: 0.5960
Fold [1/3] - Epoch [5/256] - Loss: 0.6927 - Validation Loss: 0.6924 - Validation Accuracy: 0.5960
Fold [1/3] - Epoch [6/256] - Loss: 0.6920 - Validation Loss: 0.6920 - Validation Accuracy: 0.5960
Fold [1/3] - Epoch [7/256] - Loss: 0.6915 - Validation Loss: 0.6898 - Validation Accuracy: 0.5993
Fold [1/3] - Epoch [8/256] - Loss: 0.6914 - Validation Loss: 0.6840 - Validation Accuracy: 0.6195
Fold [1/3] - Epoch [9/256] - Loss: 0.6844 - Validation Loss: 0.6743 - Validation Accuracy: 0.6566
Fold [1/3] - Epoch [10/256] - Loss: 0.6846 - Validation Loss: 0.6711 - Validation Accuracy: 0.6667
Fold [1/3] - Epoch 

In [5]:
# choose the model that performs that best on the entire dataset
test_data = TitanicData(df_path='titanic.csv', current_fold=0, num_fold=2)
test_loader = DataLoader(test_data, batch_size=Config.batch_size)

losses = []
for model in fold_models:
    all_labels = []
    all_predictions = []
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            features = batch['features'].to(device)
            labels = batch['label'].float().view(-1, 1).to(device)
            predictions = model(features)
            all_predictions.extend(predictions.tolist())
            all_labels.extend(labels.tolist())
    loss = criterion(torch.tensor(all_predictions), torch.tensor(all_labels))
    losses.append(loss.item())

best_model = fold_models[np.argmin(losses)]

# Save the best model
torch.save(best_model.state_dict(), 'best_model.pt')
