In [125]:
import os
import sys
import torch
sys.path.append(os.path.abspath("../common"))  # add path to common functions
from evaluate import evaluate_predictions
from preprocess import getdfs


In [126]:

train_df, valid_df = getdfs(data = 'text_seq', train_size = 1)

train_df['input_str'] = train_df['input_str'].apply(lambda x : x[3:])
valid_df['input_str'] = valid_df['input_str'].apply(lambda x : x[3:])

In [127]:
num_feat = 47

def get_columns (df) :
    for i in range(num_feat):
        df[f'c_{i}'] = df['input_str'].apply(lambda x : x[i])
    return df.drop(columns = ['input_str'])

train_df = get_columns(train_df)
valid_df = get_columns(valid_df)

In [128]:
from preprocess import one_hot_encode

train_df, valid_df, y_train, y_valid = one_hot_encode(train_df, valid_df)
X_tensor = torch.tensor(train_df.values).float()
y_tensor = torch.tensor(y_train.values).float()

In [129]:
import torch.nn as nn
import torch.optim as optim
import numpy as np


In [130]:
class OneHotNN2(nn.Module):
    def __init__(self):
        super(OneHotNN2, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(467, 512),
            nn.ReLU(),
            nn.Linear(512, 32),
            nn.ReLU(),
        )
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        x = self.linear_relu_stack(x)
        x = torch.sigmoid(self.fc(x))           # Output layer (logits)
        return x
model = OneHotNN2()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Learning rate # HYPERPARAMETER


In [131]:
# Training loop
num_epochs = 20
batch_size = 64

for epoch in range(num_epochs):
    for i in range(0, len(X_tensor), batch_size):
        # Get the batch data
        batch_X = X_tensor[i:i + batch_size]
        batch_y = y_tensor[i:i + batch_size]

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X).squeeze()
        loss = criterion(outputs, batch_y)  # Calculate loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/20], Loss: 0.6960
Epoch [2/20], Loss: 0.6755
Epoch [3/20], Loss: 0.5709
Epoch [4/20], Loss: 0.2948
Epoch [5/20], Loss: 0.1506
Epoch [6/20], Loss: 0.0808
Epoch [7/20], Loss: 0.2754
Epoch [8/20], Loss: 0.1711
Epoch [9/20], Loss: 0.0107
Epoch [10/20], Loss: 0.0073
Epoch [11/20], Loss: 0.0017
Epoch [12/20], Loss: 0.0007
Epoch [13/20], Loss: 0.0005
Epoch [14/20], Loss: 0.0004
Epoch [15/20], Loss: 0.0002
Epoch [16/20], Loss: 0.0001
Epoch [17/20], Loss: 0.0001
Epoch [18/20], Loss: 0.0001
Epoch [19/20], Loss: 0.0000
Epoch [20/20], Loss: 0.0000


In [132]:
# Evaluation
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    correct = 0
    total = len(X_tensor)

    outputs = model(X_tensor)
    for i in outputs:
        if i > 0.5:
            i = 1
        else:
            i = 0
    predicted = (outputs > 0.5).float()  # Apply thresholdts
    for i in range(len(outputs)):
        if (predicted[i] == y_tensor[i]):
            correct += 1

print(f"{correct*100/total:.2f}%")

100.00%


In [133]:
X_test_tensor = torch.tensor(valid_df.values).float()
y_test_tensor = torch.tensor(y_valid.values).float()

model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    correct = 0
    total = len(X_test_tensor)

    outputs = model(X_test_tensor)
    for i in outputs:
        if i > 0.5:
            i = 1
        else:
            i = 0
    predicted = (outputs > 0.5).float()  # Apply thresholdts
    for i in range(len(outputs)):
        if (predicted[i] == y_test_tensor[i]):
            correct += 1

print(f"{correct*100/total:.2f}%")

66.67%


In [134]:
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim

k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True)

# Number of samples in the dataset
num_samples = X_tensor.shape[0]
# K-fold Cross Validation
results = {}

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_tensor)):
    print(f'Fold {fold + 1}')

    # Split data using indices
    X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
    y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]

    # Define model
    model = OneHotNN2()

    # Define loss and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # Training loop
    num_epochs = 20
    for epoch in range(num_epochs):
        model.train()

        # Forward pass
        optimizer.zero_grad()
        outputs = model(X_train).squeeze()
        loss = criterion(outputs, y_train)
        
        # Backward and optimize
        loss.backward()
        optimizer.step()

    # Validation loop
    # model.eval()
    # with torch.no_grad():
    #     val_outputs = model(X_val)
    #     _, predicted = torch.max(val_outputs, 1)
    #     correct = (predicted == y_val).sum().item()
    #     accuracy = correct / len(y_val) * 100
    #     print(f'Fold {fold + 1} Validation Accuracy: {accuracy:.2f}%')

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        correct = 0
        total = len(X_val)

        outputs = model(X_val)
        for i in outputs:
            if i > 0.5:
                i = 1
            else:
                i = 0
        predicted = (outputs > 0.5).float()  # Apply thresholdts
        for i in range(len(outputs)):
            if (predicted[i] == y_val[i]):
                correct += 1

    print(f"Fold {fold + 1} Validation Accuracy: {correct*100/total:.2f}%")
    # Store result for this fold
    results[fold] = correct*100/total

# Average accuracy across all folds
avg_accuracy = sum(results.values()) / len(results)
print(f'Average Validation Accuracy: {avg_accuracy:.2f}%')


Fold 1
Fold 1 Validation Accuracy: 51.69%
Fold 2
Fold 2 Validation Accuracy: 49.51%
Fold 3
Fold 3 Validation Accuracy: 50.49%
Fold 4
Fold 4 Validation Accuracy: 52.90%
Fold 5
Fold 5 Validation Accuracy: 51.34%
Average Validation Accuracy: 51.19%
