In [7]:
# %%
import pandas as pd 
# Load preprocessed data
train_df = pd.read_csv("train_data_final_2.csv")
test_df = pd.read_csv("test_data_final_2.csv")

# %%
# Inspect the first few rows
print("Training Data:")
print(train_df.head())

print("\nTest Data:")
print(test_df.head())


Training Data:
                                    tokenized_review  label  \
0  ['bromwell', 'high', 'cartoon', 'comedy', 'ran...      1   
1  ['homelessness', 'houselessness', 'george', 'c...      1   
2  ['brilliant', 'overacting', 'lesley', 'ann', '...      1   
3  ['easily', 'underrated', 'film', 'inn', 'brook...      1   
4  ['typical', 'mel', 'brooks', 'film', 'slapstic...      1   

                                       padded_review  
0  [1, 7323, 2274, 2956, 12444, 15823, 12104, 136...  
1  [1, 1, 6465, 1, 14855, 8270, 17417, 11633, 724...  
2  [1864, 11048, 1, 580, 16961, 1411, 4634, 1, 87...  
3  [4836, 16350, 5810, 7999, 1904, 2164, 15282, 5...  
4  [16264, 9793, 1904, 5810, 14290, 10269, 151, 1...  

Test Data:
                                    tokenized_review  label  \
0  ['went', 'saw', 'movie', 'night', 'coaxed', 'f...      1   
1  ['actor', 'turned', 'director', 'bill', 'paxto...      1   
2  ['recreational', 'golfer', 'knowledge', 'sport...      1   
3  ['saw', '

In [1]:

# %%
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import ast  # For safely evaluating strings containing Python literals
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

vocab_size = 17502  # Size of the vocabulary
vocab_size = min(vocab_size, 10000)  # Reduce vocab size to 15,000


def index_to_one_hot(index, vocab_size):
    """ Convert an index to a one-hot encoded vector """
    one_hot = torch.zeros(vocab_size)
    one_hot[index] = 1
    return one_hot

class SentimentDataset(Dataset):
    def __init__(self, dataframe):
        """
        Args:
            dataframe (pd.DataFrame): Dataframe containing 'padded_review' and 'label'.
        """
        self.reviews = dataframe['padded_review']
        self.labels = dataframe['label']

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        indices = self.reviews.iloc[idx]
        # Ensure indices are within the vocab_size range by clamping
        indices = [min(index, vocab_size - 1) for index in indices]
        one_hot_encoded = torch.stack([index_to_one_hot(index, vocab_size) for index in indices])
        return one_hot_encoded, self.labels.iloc[idx]



train_df = pd.read_csv('train_data_final.csv')
train_df = train_df.sample(n=400, random_state=42)  # Sampling 1000 examples due to dataset size

test_df = pd.read_csv('train_data_final.csv')
test_df = test_df.sample(n=200, random_state=42)  # Sampling 1000 examples due to dataset size


# Splitting data into train, dev, and test sets
# train_data, test_data = train_test_split(test_df, test_size=0.2, random_state=42)
train_data, dev_data = train_test_split(train_df, test_size=0.2, random_state=42)

test_data = test_df

# Truncate the sequences to max_length within the DataFrame
max_length = 105  

# Apply truncation directly to the 'padded_review' column in each DataFrame
train_df['padded_review'] = train_df['padded_review'].apply(lambda review: ast.literal_eval(review)[:max_length])
dev_df = train_df.sample(frac=0.1, random_state=42)  # Extracting dev data from train_df
train_df = train_df.drop(dev_df.index)  # Remaining for training
test_df['padded_review'] = test_df['padded_review'].apply(lambda review: ast.literal_eval(review)[:max_length])

# Ensure labels are numeric (convert if necessary)
train_df['label'] = pd.to_numeric(train_df['label'])
dev_df['label'] = pd.to_numeric(dev_df['label'])
test_df['label'] = pd.to_numeric(test_df['label'])

# Create datasets
train_dataset = SentimentDataset(train_df)
dev_dataset = SentimentDataset(dev_df)
test_dataset = SentimentDataset(test_df)

# Create DataLoaders
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Print data loader counts
print(f"Number of batches in training loader: {len(train_loader)}")
print(f"Number of batches in dev loader: {len(dev_loader)}")
print(f"Number of batches in test loader: {len(test_loader)}")



Number of batches in training loader: 45
Number of batches in dev loader: 5
Number of batches in test loader: 25


In [2]:
# Print a dataloader batch
for X_batch, y_batch in train_loader:
    print("Batch X shape:", X_batch.shape)  # Shape of the batch
    print("Batch Y shape:", y_batch.shape)  # Shape of the labels
    print("Batch X content:\n", X_batch)    # Content of one-hot encoded vectors
    print("Batch Y content:", y_batch)      # Content of labels
    break  # Break after printing the first batch to avoid printing multiple batches

Batch X shape: torch.Size([8, 105, 10000])
Batch Y shape: torch.Size([8])
Batch X content:
 tensor([[[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         ...,
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.]],

        [[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         ...,
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.]],

        [[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.]],

        ...,

        [[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ...

In [19]:
import torch

# Clear GPU memory
torch.cuda.empty_cache()
torch.cuda.empty_cache()


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import copy


class FeedForwardNN(nn.Module):
    def __init__(self, input_size, hidden1_size, hidden2_size, output_size):
        super(FeedForwardNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.relu2 = nn.ReLU()
        self.output = nn.Linear(hidden2_size, output_size)
        self.sigmoid = nn.Sigmoid()  # For binary classification

    def forward(self, x):
        # x shape: [batch_size, 100, 2000]
        # Flatten the input if necessary
        x = x.view(x.size(0), -1)  # [batch_size, 100*2000] = [batch_size, 200000]
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.output(out)
        out = self.sigmoid(out).squeeze()
        return out



# Parameters
input_size = 105 * 10000
hidden1_size = 256
hidden2_size = 128
output_size = 1  # Binary classification
learning_rate = 0.001
num_epochs = 10

# Initialize the model
model = FeedForwardNN(input_size, hidden1_size, hidden2_size, output_size)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [4]:
best_dev_accuracy = 0.0
best_model_state = copy.deepcopy(model.state_dict())

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_Y in train_loader:
        batch_X = batch_X.to(device)
        batch_Y = batch_Y.to(device).float()
        print("Here")
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_Y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch_X.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)

    # Evaluation on development set
    model.eval()
    dev_preds = []
    dev_labels = []
    with torch.no_grad():
        for batch_X, batch_Y in dev_loader:
            batch_X = batch_X.to(device)
            batch_Y = batch_Y.to(device).float()
            outputs = model(batch_X)
            preds = (outputs >= 0.5).long()
            dev_preds.extend(preds.cpu().numpy())
            dev_labels.extend(batch_Y.cpu().numpy())

    dev_accuracy = accuracy_score(dev_labels, dev_preds)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Dev Accuracy: {dev_accuracy:.4f}")

    # Save the model if dev accuracy improves
    if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        best_model_state = copy.deepcopy(model.state_dict())

# Load the best model
model.load_state_dict(best_model_state)


Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Epoch [1/10], Loss: 0.6950, Dev Accuracy: 0.4750
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Epoch [2/10], Loss: 0.6721, Dev Accuracy: 0.5250
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Epoch [3/10], Loss: 0.6108, Dev Accuracy: 0.5250
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here
Here


KeyboardInterrupt: 

In [5]:
model.eval()
test_preds = []
test_labels = []
with torch.no_grad():
    for batch_X, batch_Y in test_loader:
        batch_X = batch_X.to(device)
        batch_Y = batch_Y.to(device).float()
        outputs = model(batch_X)
        preds = (outputs >= 0.5).long()
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(batch_Y.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='binary')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-Score: {f1:.4f}")

# Detailed metrics for each class
precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
    test_labels, test_preds, labels=[0,1]
)

print("\nPer Class Metrics:")
print(f"Class 0 - Precision: {precision_per_class[0]:.4f}, Recall: {recall_per_class[0]:.4f}, F1-Score: {f1_per_class[0]:.4f}")
print(f"Class 1 - Precision: {precision_per_class[1]:.4f}, Recall: {recall_per_class[1]:.4f}, F1-Score: {f1_per_class[1]:.4f}")


Test Accuracy: 0.6500
Test Precision: 0.6923
Test Recall: 0.6000
Test F1-Score: 0.6429

Per Class Metrics:
Class 0 - Precision: 0.6147, Recall: 0.7053, F1-Score: 0.6569
Class 1 - Precision: 0.6923, Recall: 0.6000, F1-Score: 0.6429


In [6]:
# Define the checkpoint path
checkpoint_path = 'best_ffn_model.pth'

# Save the best model
torch.save(best_model_state, checkpoint_path)
print(f"Best model saved to {checkpoint_path}")


Best model saved to best_ffn_model.pth
