# Imports

In [117]:
import torch
import torch.nn as nn
import torch.optim as optim

import pickle
import random
from datasets import load_dataset
from torch.utils.data import SubsetRandomSampler

# Neural Network

In [118]:
class SimpleNN(nn.Module):
    """Small feed-forward network: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the single hidden layer.
        output_size: Number of output units; each is squashed into (0, 1)
            by the final sigmoid.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Trainable layers, created in the order they are applied.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        # Parameter-free activations.
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the sigmoid activations of the network for input ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))


def convert_to_ground_truth(predictions, threshold=0.5):
    """Turn scores into hard 0.0/1.0 labels.

    An element becomes 1.0 when it is strictly greater than ``threshold``
    and 0.0 otherwise; the result is a float tensor of the same shape.
    """
    above_threshold = predictions.gt(threshold)
    return above_threshold.float()

# Load dataset and embeddings

### IMDB

In [119]:
# IMDB: pickled embeddings for the test and train splits, plus the matching
# Hugging Face dataset (used further down for its 'label' column).
# NOTE(review): pickle.load can execute arbitrary code -- only open embedding
# files that come from a trusted source.
dataset_name = "stanfordnlp/imdb"

with open('./../data/embeddings/imdb-test-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/imdb-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)

dataset = load_dataset(dataset_name)
train_dataset, test_dataset = dataset["train"], dataset["test"]

### GLUE COLA

In [31]:
# GLUE CoLA: pickled embeddings plus the Hugging Face dataset for labels.
# Run only ONE of the dataset-loading cells before continuing: each of them
# rebinds train_x, test_x, train_dataset and test_dataset.
# NOTE(review): pickle.load can execute arbitrary code -- only open embedding
# files that come from a trusted source.
with open('./../data/embeddings/cola-test-768.pkl', 'rb') as file:
    # Kept under its own name so it cannot be paired with validation labels.
    unlabeled_test_x = pickle.load(file)

with open('./../data/embeddings/cola-val-768.pkl', 'rb') as file:
    val_x = pickle.load(file)

with open('./../data/embeddings/cola-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)

dataset = load_dataset("glue", "cola")

train_dataset = dataset["train"]
val_dataset = dataset["validation"]

# Evaluation uses the validation split. Previously test_x held the *test*
# embeddings while test_dataset pointed at the *validation* split, so the
# features and labels used for evaluation came from different splits.
# Assumes cola-val-768.pkl stores the validation embeddings in dataset
# order -- TODO confirm against the script that produced the pickles.
test_x = val_x
test_dataset = val_dataset

### GLUE SST2

In [88]:
# GLUE SST-2: pickled embeddings plus the Hugging Face dataset for labels.
# Run only ONE of the dataset-loading cells before continuing: each of them
# rebinds train_x, test_x, train_dataset and test_dataset.
# NOTE(review): pickle.load can execute arbitrary code -- only open embedding
# files that come from a trusted source.
with open('./../data/embeddings/sst2-test-768.pkl', 'rb') as file:
    # Kept under its own name so it cannot be paired with validation labels.
    unlabeled_test_x = pickle.load(file)

with open('./../data/embeddings/sst2-val-768.pkl', 'rb') as file:
    val_x = pickle.load(file)

with open('./../data/embeddings/sst2-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)

dataset = load_dataset("glue", "sst2")

train_dataset = dataset["train"]
val_dataset = dataset["validation"]

# Evaluation uses the validation split. Previously test_x held the *test*
# embeddings while test_dataset pointed at the *validation* split, so the
# features and labels used for evaluation came from different splits.
# Assumes sst2-val-768.pkl stores the validation embeddings in dataset
# order -- TODO confirm against the script that produced the pickles.
test_x = val_x
test_dataset = val_dataset

### Entire Data

In [120]:
# Seed torch's global RNG before anything random happens in torch.
torch.manual_seed(0)

# Embeddings: one tensor per stored array, stacked along a new leading axis,
# then every size-1 axis is dropped.
# (presumably each array is (1, 768), giving (N, 768) -- TODO confirm)
test_x = torch.stack([torch.tensor(arr) for arr in test_x]).squeeze()
train_x = torch.stack([torch.tensor(arr) for arr in train_x]).squeeze()

# Labels as float32, the dtype the BCE loss is fed during training.
train_y = torch.tensor(train_dataset['label'], dtype=torch.float32).squeeze()
test_y = torch.tensor(test_dataset['label'], dtype=torch.float32).squeeze()

### Get Percentage of Data

In [124]:
# Fraction of the training set to train on, in (0, 1].
# 1 keeps every example; use e.g. 0.1 for 10% of the dataset.
percentage = 1

if not 0 < percentage <= 1:
    raise ValueError(f"percentage must be in (0, 1], got {percentage}")

# Generate indices for splitting the dataset
dataset_size = len(train_x)
indices = list(range(dataset_size))
split = int(percentage * dataset_size)

# Shuffle with a dedicated, seeded generator so the chosen subset is the same
# on every run. The global `random` module is never seeded in this notebook,
# and a private generator also leaves torch's RNG stream untouched.
random.Random(0).shuffle(indices)

train_indices = indices[:split]
subset_train_x = train_x[train_indices]
subset_train_y = train_y[train_indices]

In [125]:
# Sanity check: display the shape of the selected training subset.
subset_train_x.shape

torch.Size([25000, 768])

### Hyperparameters

In [126]:
# Settings consumed by the training cell.
epochs = 5_000       # number of optimisation steps in the training loop
hidden_size = 4      # units in the hidden layer of SimpleNN
output_size = 1      # single output unit

### Train Code

In [127]:
# The network's input width is the second dimension of the training matrix.
input_size = subset_train_x.size(1)
model = SimpleNN(input_size, hidden_size, output_size)

# Binary cross-entropy on the sigmoid outputs, optimised with plain SGD.
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Full-batch training: every step runs the whole selected subset through
# the model once.
for epoch in range(epochs):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    # Forward pass; squeeze so the outputs line up with the label tensor.
    outputs = model(subset_train_x).squeeze()
    loss = criterion(outputs, subset_train_y)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss every 1000 epochs.
    if not epoch % 1000:
        print(f'Epoch {epoch}: Loss {loss.item()}')

Epoch 0: Loss 0.6940603852272034
Epoch 1000: Loss 0.33644992113113403
Epoch 2000: Loss 0.28721025586128235
Epoch 3000: Loss 0.27277350425720215
Epoch 4000: Loss 0.265116423368454


In [128]:
# Inference only: disable autograd so no computation graph is built or kept
# for the forward pass over the whole test set.
with torch.no_grad():
    predictions = convert_to_ground_truth(model(test_x))

In [129]:
from sklearn.metrics import classification_report

# Convert both tensors to NumPy arrays before handing them to scikit-learn.
y_true = test_y.detach().numpy()
y_pred = predictions.detach().numpy()

# Per-class precision / recall / F1 on the evaluation data.
report = classification_report(y_true, y_pred)

print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.89      0.89     12500
         1.0       0.89      0.90      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



In [81]:
# Scratch list of the integers 1..7 used by the slicing examples below.
x = list(range(1, 8))

In [82]:
# Everything from index 3 onwards (drops the first three elements).
x[3:]

[4, 5, 6, 7]

In [83]:
# The first three elements.
x[:3]

[1, 2, 3]

In [84]:
# Everything except the last three elements.
x[:-3]

[1, 2, 3, 4]