# Task 4

### Import of necessary libraries

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel


  self.pid = os.fork()
100%|██████████| 391/391 [02:10<00:00,  2.99it/s]


Epoch 1/5, Loss: 7.0269


100%|██████████| 391/391 [02:10<00:00,  3.00it/s]


Epoch 2/5, Loss: 1.2878


100%|██████████| 391/391 [02:11<00:00,  2.98it/s]


Epoch 3/5, Loss: 0.5940


100%|██████████| 391/391 [02:10<00:00,  3.00it/s]


Epoch 4/5, Loss: 0.3884


100%|██████████| 391/391 [02:10<00:00,  3.00it/s]


Epoch 5/5, Loss: 0.2878


100%|██████████| 391/391 [00:44<00:00,  8.76it/s]


Training Loss (MSE): 0.1601


100%|██████████| 32/32 [00:04<00:00,  7.56it/s]


### Data preprocessing (Tokenizer, Loader)

In [None]:
class ReviewDataset(Dataset):
    """Torch dataset that tokenizes one review row per item.

    The text of an item is the row's ``title`` and ``sentence`` columns
    joined by a single space. Missing cells (NaN/None) are treated as empty
    strings so that the literal text "nan" is never fed to the tokenizer.

    Args:
        data_frame: Table with ``title`` and ``sentence`` columns, plus a
            ``score`` column when ``is_test`` is False.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``
            that returns a mapping of tensors.
        max_length: Sequence length forwarded to the tokenizer.
        is_test: When True, items carry no ``score`` entry.
    """

    def __init__(self, data_frame, tokenizer, max_length, is_test=False):
        self.data = data_frame
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing cells to ``""``."""
        # str(float("nan")) is the literal "nan", which would otherwise be
        # tokenized as if it were part of the review.
        return "" if pd.isna(value) else str(value)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors as a dict.

        Returns:
            The tokenizer's outputs with dimension 0 squeezed away, plus a
            float32 ``score`` tensor unless the dataset is in test mode.
        """
        # Fetch the row once instead of re-indexing the frame per column.
        row = self.data.iloc[index]
        text = self._as_text(row['title']) + " " + self._as_text(row['sentence'])
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops the size-1 leading dimension so the DataLoader
        # can stack items into a batch itself.
        item = {key: value.squeeze(0) for key, value in encoded.items()}
        if not self.is_test:
            item['score'] = torch.tensor(float(row['score']), dtype=torch.float)
        return item


import os

# Labelled training reviews and unlabelled test reviews.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test_no_score.csv")

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
max_length = 128

train_dataset = ReviewDataset(train_df, tokenizer, max_length)
test_dataset = ReviewDataset(test_df, tokenizer, max_length, is_test=True)

# Never spawn more worker processes than there are CPUs: oversubscribing
# slows loading down and makes DataLoader emit a warning. On machines with
# at least 16 CPUs this is the same setting as before.
num_workers = min(16, os.cpu_count() or 1)
# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
pin_memory = torch.cuda.is_available()

# Only the training data is shuffled; the test order must stay aligned with
# the rows of test_no_score.csv.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

### DistilBERT Transformer and MLP Regression Model 

In [None]:
# Model Architecture
class MyModule(nn.Module):
    """DistilBERT encoder followed by a three-layer MLP regression head.

    The head maps the 768-wide vector at sequence position 0 down to a
    single value through 768 -> 256 -> 64 -> 1 linear layers with ReLU
    between them.
    """

    def __init__(self):
        super().__init__()
        self.distilbert = AutoModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.

        Returns:
            The output of the final linear layer (last dimension of size 1).
        """
        encoded = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is the [CLS] token.
        cls_vector = encoded.last_hidden_state[:, 0, :]
        hidden = torch.relu(self.fc1(self.dropout(cls_vector)))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

### Model Training (optimizer and criterion setup, etc.)

In [None]:
# Setup: choose the device, build the model, and define loss + optimizer.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyModule().to(DEVICE)

criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop: one full pass over train_loader per epoch, reporting the
# mean of the per-batch losses at the end of each epoch.
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0
    for batch in tqdm(train_loader, total=len(train_loader)):
        ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        # Add a trailing dimension so the targets line up with the
        # single-column model output.
        targets = batch['score'].to(DEVICE).unsqueeze(1)

        optimizer.zero_grad()
        loss = criterion(model(ids, mask), targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {epoch_loss:.4f}')

In [None]:
def evaluate(model, loader, criterion, device):
    """Return the average per-sample loss of ``model`` over ``loader``.

    The model is put in eval mode and run without gradient tracking. Each
    batch loss is weighted by its batch size, so a smaller final batch does
    not skew the result.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable with a length, yielding dicts that hold
            ``input_ids``, ``attention_mask`` and ``score`` tensors.
        criterion: Loss callable ``criterion(outputs, targets)``; assumed to
            return the mean loss over the batch (e.g. ``nn.MSELoss()``).
        device: Device the batch tensors are moved to.

    Returns:
        The sample-weighted mean loss as a float.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['score'].unsqueeze(1).to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)
            # Turn the batch mean back into a batch sum, so that dividing by
            # the sample count below gives the true dataset-level mean.
            batch_size = targets.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size
    return total_loss / total_samples

# Score the trained model on the training set with the model in eval mode.
train_loss = evaluate(
    model=model,
    loader=train_loader,
    criterion=criterion,
    device=DEVICE,
)
print(f'Training Loss (MSE): {train_loss:.4f}')

### Prediction of reviews' scores 

In [None]:
# Prediction
def predict(model, loader, device):
    """Run ``model`` over ``loader`` and return its outputs as a flat list.

    The model is put in eval mode and run without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable with a length, yielding dicts that hold
            ``input_ids`` and ``attention_mask`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A list of Python floats, one per model output value, in loader order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            # reshape(-1) always yields a 1-D tensor. A bare squeeze() turns
            # a single-sample batch into a 0-d tensor whose tolist() is a
            # plain float, which list.extend() rejects.
            predictions.extend(outputs.reshape(-1).cpu().tolist())
    return predictions

test_predictions = predict(model, test_loader, DEVICE)

# Write one prediction per line, rounded to two decimals, to results.txt.
with open("results.txt", "w") as file:
    file.writelines(f"{prediction:.2f}\n" for prediction in test_predictions)
