# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import pickle
import random
import pandas as pd
from datasets import load_dataset
from torch.utils.data import SubsetRandomSampler
from sklearn.metrics import classification_report

# NeuralNetwork

In [2]:
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x


def convert_to_ground_truth(predictions, threshold=0.5):
    return (predictions > threshold).float()

# Load dataset and embeddings

### Rotten Tomatoes

In [9]:
with open('./../data/embeddings/tomato-test-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/tomato-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)


# NaiveGreedy
# LazyGreedy
# StochasticGreedy
# LazierThanLazyGreedy

with open('./../data/tomato_GC_LazyGreedy.pkl', 'rb') as file:
    submodlib_data = pickle.load(file)

dataset_name = "rotten_tomatoes"
dataset = load_dataset(dataset_name)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [4]:
len(submodlib_data)

6397

### OLID Dataset

In [5]:
with open('./../data/embeddings/OLID-test-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/OLID-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)


# NaiveGreedy
# LazyGreedy
# StochasticGreedy
# LazierThanLazyGreedy

with open('./../data/olid_DM_LazyGreedy.pkl', 'rb') as file:
    submodlib_data = pickle.load(file)

dataset_name = "christophsonntag/OLID"
dataset = load_dataset(dataset_name)

train_dataset_1 = dataset["train"]
test_dataset_1 = dataset["test"]

In [6]:
# test_dataset['text'] = test_dataset['cleaned_tweet']
train_dataset, test_dataset = {}, {}
test_dataset['label'] = [1 if tweet=='OFF' else 0 for tweet in test_dataset_1['subtask_a']]
test_dataset['text'] = test_dataset_1['cleaned_tweet']

train_dataset['label'] = [1 if tweet=='OFF' else 0 for tweet in train_dataset_1['subtask_a']]
train_dataset['text'] = train_dataset_1['cleaned_tweet']

In [28]:
from collections import Counter

words = ['a', 'b', 'c', 'a']

Counter(train_dataset['label']).keys() # equals to list(set(words))
Counter(train_dataset['label']).values()

dict_values([4400, 8840])

### IMDB

In [None]:
with open('./../data/embeddings/imdb-test-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/imdb-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)

dataset_name = "stanfordnlp/imdb"
dataset = load_dataset(dataset_name)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

### GLUE COLA

In [10]:
with open('./../data/embeddings/cola-test-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/cola-val-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/cola-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)

dataset = load_dataset("glue", "cola")

train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

test_dataset = dataset["validation"]

Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'cola' at /data/BADRI/.cache/huggingface/datasets/glue/cola/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Sun Apr 21 18:05:59 2024).


### GLUE SST2

In [3]:
with open('./../data/embeddings/sst2-test-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/sst2-val-768.pkl', 'rb') as file:
    test_x = pickle.load(file)

with open('./../data/embeddings/sst2-train-768.pkl', 'rb') as file:
    train_x = pickle.load(file)

dataset = load_dataset("glue", "sst2")

train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

test_dataset = dataset["validation"]

Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'sst2' at /data/BADRI/.cache/huggingface/datasets/glue/sst2/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Wed Apr 17 20:15:20 2024).


In [4]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

### Entire Data

In [None]:
torch.manual_seed(0)
test_x = torch.squeeze(torch.stack([torch.tensor(arr) for arr in test_x]))
train_x = torch.squeeze(torch.stack([torch.tensor(arr) for arr in train_x]))
train_y = torch.squeeze(torch.tensor(train_dataset['label'], dtype=torch.float32))
test_y = torch.squeeze(torch.tensor(test_dataset['label'], dtype=torch.float32))

In [None]:
# percentages = [0.05, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.8, 1]
percentages = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
reports = []
empty_df = pd.DataFrame()

for percentage in percentages:


    subset_train_x, subset_train_y = [], []
    subset_data_val = int(percentage*len(train_x))
    for index,score in submodlib_data[:subset_data_val]:
        subset_train_x.append(train_x[index])
        subset_train_y.append(train_y[index])

    subset_train_x = torch.squeeze(torch.stack([torch.tensor(arr) for arr in subset_train_x]))
    subset_train_y = torch.squeeze(torch.tensor(subset_train_y, dtype=torch.float32))
    

    hidden_size = 4 
    output_size = 1
    epochs = 5000


    input_size = subset_train_x.shape[1]
    model = SimpleNN(input_size, hidden_size, output_size)
    
    # Define loss function and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    
    # Training loop
    
    for epoch in range(epochs):
        # Forward pass
        outputs = torch.squeeze(model(subset_train_x))
        loss = criterion(outputs, subset_train_y)
    
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        # Print loss every 1000 epochs
        if epoch % 1000 == 0:
            print(f'Epoch {epoch}: Loss {loss.item()}')

    predictions = convert_to_ground_truth(model(test_x))
    report = classification_report(test_y.detach().numpy(), predictions.detach().numpy(), output_dict=True)
    df = pd.DataFrame(report).transpose()
    reports.append(df)

In [4]:
result_df = pd.concat(reports)
result_df = result_df.round(2)
result_df.to_csv("olid_disparitysum.csv")

NameError: name 'reports' is not defined