In [None]:
import os
# data process
import numpy as np
import pandas as pd
from skimage import io, transform
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# plot
import matplotlib.pyplot as plt

# neural network
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# preprocessing
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

import warnings

In [None]:
# Load the raw training CSV for a quick visual check of its columns.
# The path is absolute; adjust it to wherever train.csv lives in your environment.
data = pd.read_csv('/content/train.csv')
data.head()

NameError: name 'pd' is not defined

In [None]:
# Shared variables
# NOTE(review): batch_size and log_interval are assigned again in the model-setup cell
# further down, and num_epochs / learning_rate / multi_* are not read by any cell
# visible in this notebook.
num_epochs = 250
log_interval = 100

# Batch size: the number of samples used in each training iteration
batch_size = 30

# Learning rates: one plain value and one "multi_" value
# (presumably for a deeper network that is not defined in the visible cells — confirm)
learning_rate = 0.001
multi_learning_rate = 0.001

# Number of hidden layers for the "multi_" configuration
multi_num_layers = 6

# Number of hidden neurons for the "multi_" configuration

multi_neurons = 1024

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

class TitanicDataset(Dataset):
    """Titanic passenger CSV exposed as a PyTorch ``Dataset`` of float tensors.

    The CSV is reduced to the columns ``Sex``, ``Age``, ``Fare`` plus two derived
    columns ``Family`` (``SibSp + Parch + 1``) and ``isAlone``; all numeric
    features are min-max scaled.

    Args:
        root_dir: Path of the CSV file to read (passed straight to ``pd.read_csv``).
        train: ``True`` for a labelled file containing ``Survived``; ``False`` for
            an unlabelled file.
        transform: Stored on the instance for API compatibility; it is not applied
            anywhere in this class.

    Attributes:
        data: Processed frame served by ``__getitem__``. For ``train=True`` this is
            the 80% part of a fixed-seed 80/20 split; for ``train=False`` it is
            every row of the file, in file order.
        val_data: The remaining 20% of the labelled rows when ``train=True``,
            otherwise ``None``.
    """

    def __init__(self, root_dir, train=True, transform=None):
        self.train = train
        self.transform = transform
        self.val_data = None

        minmax_scaler = MinMaxScaler()
        onehot_enc = OneHotEncoder()

        # Columns expected in the raw file; only labelled files carry "Survived".
        raw_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
                       'Ticket', 'Fare', 'Cabin', 'Embarked']
        if train:
            raw_columns.append('Survived')

        # Select, then drop the unused columns. ``drop`` returns a new frame, so the
        # column assignments below never write into a slice of the raw CSV frame.
        titanic = pd.read_csv(root_dir)[raw_columns].drop(
            columns=["Pclass", "Ticket", "Cabin", "Embarked", "Name"]
        )

        # Fill missing "Age" and "Fare" values with the column mean.
        titanic["Age"] = titanic["Age"].fillna(titanic["Age"].mean())
        titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].mean())

        # Family size counts the passenger too; isAlone is 1 when nobody else is aboard.
        titanic["Family"] = (titanic["SibSp"] + titanic["Parch"] + 1).astype(int)
        titanic["isAlone"] = (titanic["Family"] <= 1).astype(int)
        titanic = titanic.drop(columns=["SibSp", "Parch"])

        # Convert "Sex" to binary (0 for female, 1 for male).
        titanic["Sex"] = titanic["Sex"].map({"female": 0, "male": 1}).astype(int)

        titanic = titanic.dropna().reset_index(drop=True)

        has_labels = self.train and "Survived" in titanic.columns
        categorical_features = titanic.select_dtypes(include=['object']).columns.tolist()
        numerical_features = [
            column
            for column in titanic.select_dtypes(include=['number']).columns
            if column != "Survived"
        ]

        # NOTE(review): the scaler is fitted on whichever file is loaded here and
        # before the hold-out split, so separate files are scaled with their own
        # min/max values.
        parts = [
            pd.DataFrame(
                minmax_scaler.fit_transform(titanic[numerical_features]),
                columns=numerical_features,
            )
        ]

        # One-hot encode categorical features (if any remain after the drops above).
        if categorical_features:
            encoded = onehot_enc.fit_transform(titanic[categorical_features])
            # The encoder may hand back a sparse matrix; DataFrame needs a dense array.
            if hasattr(encoded, "toarray"):
                encoded = encoded.toarray()
            parts.append(
                pd.DataFrame(
                    encoded,
                    columns=onehot_enc.get_feature_names_out(categorical_features),
                )
            )

        if has_labels:
            parts.append(titanic["Survived"])

        combined_data = pd.concat(parts, axis=1)

        if self.train:
            # Same fixed-seed split as before; the hold-out part is now kept
            # instead of being thrown away.
            train_data, val_data = train_test_split(
                combined_data, test_size=0.2, random_state=42
            )
            self.data = train_data.reset_index(drop=True)
            self.val_data = val_data.reset_index(drop=True)
        else:
            # An unlabelled file is used for prediction only: keep every row in
            # file order rather than shuffling it and dropping 20% of it.
            self.data = combined_data.reset_index(drop=True)

    def __len__(self):
        """Return the number of rows in ``self.data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as float32 tensors.

        Returns:
            ``(features, label)`` when the dataset is in train mode and has a
            ``Survived`` column, otherwise just ``features``. The label is a
            one-element tensor; the features exclude ``Survived`` regardless of
            where that column sits in the frame.
        """
        sample = self.data.iloc[idx]

        if self.train and "Survived" in self.data.columns:
            label = torch.tensor([sample["Survived"]], dtype=torch.float32)
            # Remove the label by name so the result does not depend on column order.
            features = torch.tensor(sample.drop(labels="Survived").to_numpy(dtype="float32"))
            return features, label

        return torch.tensor(sample.to_numpy(dtype="float32"))

    def getData(self):
        """Return the processed DataFrame backing this dataset."""
        return self.data


In [None]:
# Build one dataset per CSV file. train=False selects the branch of TitanicDataset
# that does not read a "Survived" label column.
titanic_train = TitanicDataset(root_dir='/content/train.csv')
titanic_test = TitanicDataset(root_dir='/content/test.csv', train=False)

In [None]:
# Display the processed frame held by the training dataset (features plus "Survived").
titanic_train.getData()

Unnamed: 0,Sex,Age,Fare,Family,isAlone,Survived
0,1.0,0.566474,0.055628,0.0,1.0,0
1,1.0,0.283740,0.025374,0.0,1.0,0
2,1.0,0.396833,0.015469,0.0,1.0,0
3,1.0,0.321438,0.015330,0.1,0.0,0
4,0.0,0.070118,0.061045,0.6,0.0,0
...,...,...,...,...,...,...
707,0.0,0.258608,0.014932,0.0,1.0,1
708,1.0,0.367921,0.060508,0.0,1.0,0
709,1.0,0.509927,0.027538,0.2,0.0,0
710,0.0,0.170646,0.234224,0.3,0.0,1


In [None]:
# Display the processed frame held by the unlabelled dataset (features only).
titanic_test.getData()

Unnamed: 0,Sex,Age,Fare,Family,isAlone
0,1.0,0.419755,0.025374,0.0,1.0
1,1.0,0.314256,0.061484,0.2,0.0
2,1.0,0.396975,0.020901,0.0,1.0
3,1.0,0.314256,0.160574,0.1,0.0
4,1.0,0.248319,0.020495,0.0,1.0
...,...,...,...,...,...
329,1.0,0.274693,0.015412,0.0,1.0
330,1.0,0.274693,0.015265,0.0,1.0
331,1.0,0.604378,0.146862,0.0,1.0
332,1.0,0.314256,0.026350,0.0,1.0


In [None]:

class TitanicCNN(nn.Module):
    """Fully connected binary classifier with a single hidden layer.

    Despite the name, the network contains no convolutions: it is
    ``Linear -> ReLU -> Linear -> Sigmoid``.

    Args:
        F_in: Number of input features per sample.
        neurons: Width of the hidden layer.
        F_out: Number of output units.
    """

    def __init__(self, F_in, neurons, F_out):
        super().__init__()
        # Layers are created in this order so parameter names and initialisation stay stable.
        self.hidden_layer = nn.Linear(in_features=F_in, out_features=neurons)
        self.output_layer = nn.Linear(in_features=neurons, out_features=F_out)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a feature tensor to values in (0, 1) via the sigmoid output."""
        hidden = self.hidden_layer(x).relu()
        scores = self.output_layer(hidden)
        return self.sigmoid(scores)


In [None]:
F_in = titanic_train[0][0].shape[0]  # Number of input features, taken from the first training sample
F_out = 1  # Single output unit
neurons = 3  # Width of the hidden layer
batch_size = 32  # Replaces the value set in the shared-variables cell
lr = 0.001  # Learning rate handed to Adam below
batch_size_epochs = 50  # Despite the name, this is used as the number of epochs by the cells below
log_interval = 10  # Replaces the value set in the shared-variables cell
model = TitanicCNN(F_in = F_in, neurons = neurons, F_out = F_out)
optimizer = optim.Adam(model.parameters(), lr)
criterion = nn.BCELoss()  # Binary cross-entropy on the model's sigmoid outputs


In [None]:
train_losses = []  # Loss values recorded by train() each time it logs
train_counter = []  # Sample counts recorded by train() alongside each logged loss
test_losses = []   # Reserved for evaluation losses; no visible cell appends to it
test_counter = [i * len(titanic_train) for i in range(batch_size_epochs + 1)]  # Cumulative training-sample count at each epoch boundary

In [None]:
def train(epoch, model, train_loader, criterion, optimizer, n_epochs=50, log_interval=10):
    """Train ``model`` for ONE pass over ``train_loader`` and return its accuracy.

    The caller owns the epoch loop; ``epoch`` is only used for logging and for the
    sample counter. Earlier versions looped ``n_epochs`` times inside every call,
    which multiplied the amount of training and pushed the returned "accuracy"
    above 1.

    Args:
        epoch: 1-based index of the epoch being run.
        model: Network to optimise; its outputs are thresholded at 0.5.
        train_loader: Iterable of ``(features, labels)`` tensors. A ``DataLoader``
            or a bare dataset/list of samples both work.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimiser bound to ``model``'s parameters.
        n_epochs: Kept for backward compatibility; not used.
        log_interval: Log every ``log_interval`` batches. ``0`` or ``None``
            disables logging.

    Returns:
        Fraction of correctly classified samples seen during this pass, in
        ``[0, 1]``; ``0.0`` when the loader is empty.

    Side effects:
        When logging, appends to the module-level ``train_losses`` and
        ``train_counter`` lists.
    """
    model.train()

    # A DataLoader reports batches via len(); the sample count lives on ``.dataset``.
    # A bare dataset or list is its own sample container.
    total_samples = len(getattr(train_loader, "dataset", train_loader))
    correct_pred = 0
    cur_count = 0

    for batch_idx, (features, labels) in enumerate(train_loader):
        # Make sure both tensors are float32, as the loss expects.
        features = features.to(torch.float32)
        labels = labels.to(torch.float32)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Threshold the outputs to get hard 0/1 predictions.
        pred = (outputs >= 0.5).float()
        correct_pred += (pred == labels).sum().item()
        cur_count += len(labels)

        loss.backward()
        optimizer.step()

        # Log progress every ``log_interval`` batches.
        if log_interval and batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)'.format(
                epoch,
                cur_count,
                total_samples,
                100. * cur_count / total_samples,
                loss.item(),
                correct_pred, cur_count,
                100. * correct_pred / cur_count)
            )
            train_losses.append(loss.item())
            # Samples seen so far across all epochs (replaces the hard-coded batch size of 16).
            train_counter.append(cur_count + (epoch - 1) * total_samples)

    # Accuracy over the samples actually processed in this pass.
    return correct_pred / cur_count if cur_count else 0.0

In [None]:
def test(model, test_loader, criterion, has_labels=True):
    """
    Evaluate the model on the test dataset.

    Args:
        model: PyTorch model to evaluate.
        test_loader: DataLoader object for test data.
        criterion: Loss function (e.g., nn.BCELoss).
        has_labels: Boolean indicating if the test data has labels.

    Returns:
        test_loss (float): Average loss (if labels are available).
        accuracy (float): Accuracy (if labels are available).
    """
    model.eval()  # Set model to evaluation mode
    test_loss = 0.0
    correct = 0

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            if has_labels:
                data, target = batch  # Unpack data and target
            else:
                data = batch
                target = None

            # Forward pass
            output = model(data)

            # Compute loss if labels are available
            if target is not None:
                test_loss += criterion(output, target).item()

                # Compute accuracy
                pred = (output >= 0.5).float()
                correct += (pred == target).sum().item()

    if has_labels:
        # Average test loss
        test_loss /= len(test_loader)
        accuracy = correct / len(test_loader.dataset)

        # Print results
        print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * accuracy:.0f}%)\n")
        return test_loss, accuracy
    else:
        print("Test completed. Predictions generated for unlabeled data.")
        return None


In [None]:
# The unlabelled file has no "Survived" column, so accuracy can only be measured on
# labelled rows: hold out a fixed 20% of the labelled dataset for evaluation.
n_val = max(1, int(0.2 * len(titanic_train)))
train_subset, val_subset = torch.utils.data.random_split(
    titanic_train,
    [len(titanic_train) - n_val, n_val],
    generator=torch.Generator().manual_seed(42),  # fixed seed keeps the split reproducible
)

# Batch the data; passing the datasets directly would feed one sample at a time
# and leave batch_size unused.
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=batch_size)
test_loader = DataLoader(titanic_test, batch_size=batch_size)

# Smoke-test the forward pass on the unlabelled data before training.
test(model, test_loader, criterion, False)

train_accuracy_list = []
test_accuracy_list = []
for epoch in range(1, batch_size_epochs + 1):  # epoch is the 1-based index of the current pass
    train_accuracy_list.append(
        train(epoch, model, train_loader, criterion, optimizer, batch_size_epochs, log_interval)
    )
    # Evaluate on the labelled hold-out so a numeric accuracy is recorded
    # (the unlabelled call returns None).
    _, held_out_accuracy = test(model, val_loader, criterion, True)
    test_accuracy_list.append(held_out_accuracy)

Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed. Predictions generated for unlabeled data.
Test completed

In [None]:
import matplotlib.pyplot as plt
# Plot the per-epoch accuracy series collected by the training loop.
# The legend names exactly the two lines drawn; the x-axis is the list index
# (0 = first epoch).
fig, ax = plt.subplots()
ax.plot(train_accuracy_list, color='blue', label='Train Accuracy')
ax.plot(test_accuracy_list, color='red', label='Test Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy per epoch')
ax.legend(loc='lower right');