In [31]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from data import get_data
from scipy.io import loadmat

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel


In [32]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

# Sample data
data = {
    'sentence': [
        "One of the best silly horror movies of recent memory, with some real shocks in store for unwary viewers.",
        "Jason Patric and Ray Liotta make for one splendidly cast pair.",
        "This is more a case of `Sacre bleu!'",
        "Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.",
        "Beautifully crafted, engaging filmmaking that should attract upscale audiences hungry for quality and a nostalgic, twisty yarn that will keep them guessing.",
        "Bread, My Sweet has so many flaws it would be easy for critics to shred it.",
        "Ultimately feels empty and unsatisfying, like swallowing a Communion wafer without the wine.",
        "Exudes the fizz of a Busby Berkeley musical and the visceral excitement of a sports extravaganza."
    ],
    'label': [1, 4, 2, 3, 4, 2, 3, 1]
}

# Create DataFrame
df = pd.DataFrame(data)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenization and encoding
tokens = tokenizer(df['sentence'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Extract features
with torch.no_grad():
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state[:,0,:].numpy()  # Taking the output of the first token (CLS token)

# Append embeddings to DataFrame
embeddings_df = pd.DataFrame(embeddings)
processed_df = pd.concat([df, embeddings_df], axis=1)

# Output the processed data
print(processed_df.head())


                                            sentence  label         0  \
0  One of the best silly horror movies of recent ...      1 -0.267310   
1  Jason Patric and Ray Liotta make for one splen...      4  0.080934   
2               This is more a case of `Sacre bleu!'      2  0.432747   
3  Presents a good case while failing to provide ...      3 -0.023501   
4  Beautifully crafted, engaging filmmaking that ...      4 -0.178820   

          1         2         3         4         5         6         7  ...  \
0 -0.488523  0.037029  0.158269 -0.156495 -0.319468 -0.075835  0.832508  ...   
1 -0.041667 -0.094109 -0.178179 -0.209157 -0.398069  0.276074  0.265166  ...   
2  0.268934 -0.095764 -0.432659 -0.464035 -0.604674  0.697554  0.361407  ...   
3  0.231426 -0.376311 -0.125749 -0.120390 -0.270657  0.208131  0.349706  ...   
4 -0.415945  0.350926 -0.195871  0.096942 -0.267285  0.101068  0.409448  ...   

        758       759       760       761       762       763       764  \
0 -0.

In [41]:
import csv

def load_and_format_data(file_path):
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header
        data_dict = {'ID': [], 'sentence': [], 'label': []}
        for row in reader:
            if len(row) >= 3:  # Adjust to handle rows with an extra empty element
                # Strip any potential whitespace and ignore empty trailing elements
                cleaned_row = [item.strip() for item in row if item.strip()]
                if len(cleaned_row) == 3:
                    data_dict['ID'].append(cleaned_row[0])
                    data_dict['sentence'].append(cleaned_row[1])
                    data_dict['label'].append(int(cleaned_row[2]))
    return data_dict

# Replace 'file_path' with the path to your CSV file
file_path = './cleaned_data.csv'
data_dictionary = load_and_format_data(file_path)

# Display the dictionary to verify
print(data_dictionary)


{'ID': ['1', '2', '4', '5', '6', '7', '9', '10', '11', '12', '14', '15', '16', '19', '20', '21', '23', '24', '25', '26', '28', '29', '30', '32', '33', '35', '36', '37', '38', '39', '40', '41', '42', '43', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '56', '57', '58', '59', '60', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '75', '76', '77', '78', '79', '80', '81', '82', '83', '85', '86', '87', '88', '89', '90', '92', '93', '95', '96', '97', '98', '99', '100'], 'sentence': ['One of the best silly horror movies of recent memory, with some real shocks in store for unwary viewers.', 'Jason Patric and Ray Liotta make for one splendidly cast pair.', "This is more a case of `Sacre bleu!'", 'Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.', 'Beautifully crafted, engaging filmmaking that should attract upscale audiences hungry for quality and a nostalgic, twisty yarn that will keep th

In [47]:
import torch.nn.functional as F
# Define a simple neural network
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(768, 128)  # Adjust if different number of BERT features
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 4)  # Assuming 4 classes for sentiment

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
    
class BatchNormNN(nn.Module):
    def __init__(self):
        super(BatchNormNN, self).__init__()
        self.fc1 = nn.Linear(768, 256)
        self.bn1 = nn.BatchNorm1d(256)  # Batch normalization for first layer
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)  # Batch normalization for second layer
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)   # Batch normalization for third layer
        self.fc4 = nn.Linear(64, 4)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))  # Activation after batch norm
        x = F.relu(self.bn2(self.fc2(x)))  # Activation after batch norm
        x = F.relu(self.bn3(self.fc3(x)))  # Activation after batch norm
        x = self.fc4(x)
        return x

class DeeperNN(nn.Module):
    def __init__(self):
        super(DeeperNN, self).__init__()
        self.fc1 = nn.Linear(768, 256)  # First fully connected layer
        self.fc2 = nn.Linear(256, 128)  # Second fully connected layer
        self.fc3 = nn.Linear(128, 64)   # Third fully connected layer
        self.dropout = nn.Dropout(0.5)  # Dropout layer
        self.fc4 = nn.Linear(64, 4)     # Output layer

    def forward(self, x):
        x = F.relu(self.fc1(x))  # Activation function for first layer
        x = F.relu(self.fc2(x))  # Activation function for second layer
        x = F.relu(self.fc3(x))  # Activation function for third layer
        x = self.dropout(x)      # Apply dropout
        x = self.fc4(x)          # Final layer to produce logits
        return x

In [44]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# # Sample data
# data = {
#     'sentence': [
#         "One of the best silly horror movies of recent memory, with some real shocks in store for unwary viewers.",
#         "Jason Patric and Ray Liotta make for one splendidly cast pair.",
#         "This is more a case of `Sacre bleu!'",
#         "Presents a good case while failing to provide a reason for us to care beyond the very basic dictums of human decency.",
#         "Beautifully crafted, engaging filmmaking that should attract upscale audiences hungry for quality and a nostalgic, twisty yarn that will keep them guessing.",
#         "Bread, My Sweet has so many flaws it would be easy for critics to shred it.",
#         "Ultimately feels empty and unsatisfying, like swallowing a Communion wafer without the wine.",
#         "Exudes the fizz of a Busby Berkeley musical and the visceral excitement of a sports extravaganza."
#     ],
#     'label': [1, 4, 2, 3, 4, 2, 3, 1]
# }

df = pd.DataFrame(data_dictionary)

# Adjust labels to be zero-indexed if not already
df['label'] = df['label'] - 1

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenization and encoding
tokens = tokenizer(df['sentence'].tolist(), padding=True, truncation=True, return_tensors="pt")

# Extract features
with torch.no_grad():
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state[:,0,:].numpy()  # Taking the output of the first token (CLS token)

# Convert embeddings and labels to DataFrame
embeddings_df = pd.DataFrame(embeddings)
df = pd.concat([df, embeddings_df], axis=1)
df.drop(['sentence', 'ID'], axis=1, inplace=True)  # Drop 'sentence' and 'ID' if not needed for training

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('label', axis=1), df['label'], test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train = torch.tensor(X_train.values).float()
X_test = torch.tensor(X_test.values).float()
y_train = torch.tensor(y_train.values).long()
y_test = torch.tensor(y_test.values).long()


Epoch [10/100], Loss: 0.9733
Epoch [20/100], Loss: 0.6216
Epoch [30/100], Loss: 0.3485
Epoch [40/100], Loss: 0.1729
Epoch [50/100], Loss: 0.0852
Epoch [60/100], Loss: 0.0463
Epoch [70/100], Loss: 0.0288
Epoch [80/100], Loss: 0.0202
Epoch [90/100], Loss: 0.0153
Epoch [100/100], Loss: 0.0123
Accuracy: 76.47%


In [None]:
# Instantiate the model, loss function, and optimizer
model1 = SimpleNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)

# Training loop
for epoch in range(100):  # Adjust number of epochs as needed
    optimizer.zero_grad()
    outputs = model1(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Print loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

# Testing the model
model1.eval()
with torch.no_grad():
    outputs = model1(X_test)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = accuracy_score(y_test.numpy(), predicted.numpy())
    print(f'Accuracy: {accuracy * 100:.2f}%')

In [48]:
model2 = BatchNormNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=0.001)

# Training loop
for epoch in range(100):  # Adjust number of epochs as needed
    optimizer.zero_grad()
    outputs = model2(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Print loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

# Testing the model
model2.eval()
with torch.no_grad():
    outputs = model2(X_test)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = accuracy_score(y_test.numpy(), predicted.numpy())
    print(f'Accuracy: {accuracy * 100:.2f}%')

Epoch [10/100], Loss: 0.2877
Epoch [20/100], Loss: 0.1555
Epoch [30/100], Loss: 0.0939
Epoch [40/100], Loss: 0.0615
Epoch [50/100], Loss: 0.0435
Epoch [60/100], Loss: 0.0326
Epoch [70/100], Loss: 0.0257
Epoch [80/100], Loss: 0.0209
Epoch [90/100], Loss: 0.0175
Epoch [100/100], Loss: 0.0149
Accuracy: 70.59%


In [49]:
model3 = BatchNormNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model3.parameters(), lr=0.001)

# Training loop
for epoch in range(100):  # Adjust number of epochs as needed
    optimizer.zero_grad()
    outputs = model3(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Print loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

# Testing the model
model3.eval()
with torch.no_grad():
    outputs = model3(X_test)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = accuracy_score(y_test.numpy(), predicted.numpy())
    print(f'Accuracy: {accuracy * 100:.2f}%')

Epoch [10/100], Loss: 0.3061
Epoch [20/100], Loss: 0.1670
Epoch [30/100], Loss: 0.1011
Epoch [40/100], Loss: 0.0663
Epoch [50/100], Loss: 0.0467
Epoch [60/100], Loss: 0.0351
Epoch [70/100], Loss: 0.0276
Epoch [80/100], Loss: 0.0226
Epoch [90/100], Loss: 0.0189
Epoch [100/100], Loss: 0.0161
Accuracy: 64.71%
