# Imports

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim

import pickle
import random
from datasets import load_dataset
from torch.utils.data import SubsetRandomSampler

# Neural Network

In [20]:
class SimpleNN(nn.Module):
    """Feed-forward network: two linear layers, each followed by a sigmoid.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of output units.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(fc2(sigmoid(fc1(x)))); every output lies in (0, 1)."""
        hidden = self.sigmoid(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))


def convert_to_ground_truth(predictions, threshold=0.5):
    """Binarize scores: 1.0 where a value is strictly above ``threshold``, else 0.0.

    Args:
        predictions: Tensor of scores.
        threshold: Cut-off value; scores equal to it map to 0.0.

    Returns:
        Float tensor of 0.0 / 1.0 values with the same shape as ``predictions``.
    """
    above_threshold = predictions.gt(threshold)
    return above_threshold.float()

# Load dataset and embeddings

### IMDB

In [21]:
# Pre-computed IMDB embeddings, one pickle file per split.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('./../data/embeddings/imdb-train-768.pkl', 'rb') as train_file:
    train_x = pickle.load(train_file)

with open('./../data/embeddings/imdb-test-768.pkl', 'rb') as test_file:
    test_x = pickle.load(test_file)

# The Hugging Face dataset itself; its 'label' column is read when the
# label tensors are built further down.
dataset_name = "stanfordnlp/imdb"
dataset = load_dataset(dataset_name)

train_dataset, test_dataset = dataset["train"], dataset["test"]

### GLUE CoLA

In [None]:
# Pre-computed CoLA embeddings for the train / validation / test splits.
# This cell rebinds train_x, test_x, dataset, train_dataset and test_dataset,
# i.e. the same names the IMDB cell sets -- run only one of the two.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('./../data/embeddings/cola-train-768.pkl', 'rb') as train_file:
    train_x = pickle.load(train_file)

with open('./../data/embeddings/cola-val-768.pkl', 'rb') as val_file:
    val_x = pickle.load(val_file)

with open('./../data/embeddings/cola-test-768.pkl', 'rb') as test_file:
    test_x = pickle.load(test_file)

dataset = load_dataset("glue", "cola")

# NOTE(review): the public GLUE test split may not carry usable gold labels --
# confirm before computing metrics against test_dataset['label'].
train_dataset, val_dataset, test_dataset = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

### Entire Data

In [22]:
# Seed torch's global RNG; the random subset drawn later depends on it.
torch.manual_seed(0)

# Stack the per-example embeddings into a single tensor per split and drop
# every size-1 dimension.
train_x = torch.stack([torch.tensor(embedding) for embedding in train_x]).squeeze()
test_x = torch.stack([torch.tensor(embedding) for embedding in test_x]).squeeze()

# Labels as float32 tensors, squeezed the same way.
train_y = torch.tensor(train_dataset['label'], dtype=torch.float32).squeeze()
test_y = torch.tensor(test_dataset['label'], dtype=torch.float32).squeeze()

### Get Percentage of Data

In [23]:
percentage = 0.05  # fraction of the training set to keep (5%)

# --- Index split backing the DataLoader --------------------------------------
dataset_size = len(train_x)
indices = list(range(dataset_size))
split = int(percentage * dataset_size)

# Shuffle with a dedicated, seeded RNG so the split is identical on every run;
# the global `random` module is never seeded in this notebook.
random.Random(0).shuffle(indices)

# The first `split` shuffled indices are held out; the loader draws from the rest.
# NOTE(review): that is (1 - percentage) of the rows, and the training cell
# below does not use this loader -- confirm this is what was intended.
train_indices = indices[split:]

# Sampler that yields the selected row indices in random order.
train_sampler = SubsetRandomSampler(train_indices)

# The sampler's indices address rows of train_x / train_y, so the loader has to
# wrap those tensors. `dataset` is the object returned by load_dataset (a
# collection of splits), not the rows themselves.
train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(train_x, train_y),
    sampler=train_sampler,
)

# --- Random subset actually used for training --------------------------------
# Take the first `subset_size` entries of a random permutation of all rows.
subset_size = int(len(train_x) * percentage)
chosen_indices = torch.randperm(len(train_x))[:subset_size]

# Select the matching rows of features and labels.
subset_train_x = train_x[chosen_indices]
subset_train_y = train_y[chosen_indices]

In [24]:
# Sanity check: (rows selected, features per row) of the training subset.
subset_train_x.shape

torch.Size([1250, 768])

### Hyperparameters

In [25]:
hidden_size = 4  # number of units in SimpleNN's hidden layer
output_size = 1  # one output per example, compared against the 0/1 label

### Train Code

In [26]:
# Build the classifier; its input width is the number of feature columns.
input_size = subset_train_x.size(1)
model = SimpleNN(input_size, hidden_size, output_size)

# Binary cross-entropy on the sigmoid outputs, minimised with plain SGD.
optimizer = optim.SGD(model.parameters(), lr=0.1)
criterion = nn.BCELoss()

# Full-batch training: each epoch is a single gradient step over the subset.
epochs = 10000
for epoch in range(epochs):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    # Forward pass; squeeze drops size-1 dims so outputs match the label tensor.
    outputs = model(subset_train_x).squeeze()
    loss = criterion(outputs, subset_train_y)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss once every 1000 epochs, starting with epoch 0.
    if not epoch % 1000:
        print(f'Epoch {epoch}: Loss {loss.item()}')

Epoch 0: Loss 0.7260202169418335
Epoch 1000: Loss 0.6848580837249756
Epoch 2000: Loss 0.6094982028007507
Epoch 3000: Loss 0.42271894216537476
Epoch 4000: Loss 0.32491835951805115
Epoch 5000: Loss 0.2789395749568939
Epoch 6000: Loss 0.2503441274166107
Epoch 7000: Loss 0.22949931025505066
Epoch 8000: Loss 0.2131308764219284
Epoch 9000: Loss 0.19974397122859955


In [9]:
# Inference only: run the forward pass under no_grad so autograd does not
# record a graph for the whole test set, then threshold the scores at the
# default cut-off of 0.5 to get 0.0 / 1.0 predictions.
with torch.no_grad():
    predictions = convert_to_ground_truth(model(test_x))

In [10]:
from sklearn.metrics import classification_report

# Convert labels and predictions to NumPy arrays for scikit-learn.
y_true = test_y.detach().numpy()
y_pred = predictions.detach().numpy()

# Text summary of per-class precision / recall / F1 and overall accuracy.
report = classification_report(y_true, y_pred)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.88      0.88     12500
         1.0       0.88      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

