In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Locations of the labelled CSV splits (relative to the notebook's working directory)
train_csv_path = 'train.csv'
test_csv_path = 'test.csv'

# Read both splits into DataFrames: the training file first, then the testing file
train_df, test_df = [
    pd.read_csv(csv_file) for csv_file in (train_csv_path, test_csv_path)
]

In [None]:
# Sentiment label -> integer class id; ids follow the order the labels are listed in
label_encoding = {
    label: class_id
    for class_id, label in enumerate(('negative', 'neutral', 'positive'))
}

In [None]:
# Define a custom dataset class
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per item and encodes its sentiment label.

    Args:
        texts: Sized, indexable collection of raw texts (e.g. a list or a
            pandas Series).
        sentiments: Collection of sentiment labels aligned with ``texts``.
        tokenizer: Optional tokenizer callable. When omitted, the
            'bert-base-uncased' ``BertTokenizer`` is loaded. Pass a shared
            instance to avoid loading it once per dataset.
        max_length: Length every encoded sequence is padded/truncated to.
        label_map: Optional mapping from label string to class id. When
            omitted, the module-level ``label_encoding`` is looked up at
            item-access time.

    Raises:
        ValueError: If ``texts`` and ``sentiments`` differ in length.
    """

    def __init__(self, texts, sentiments, tokenizer=None, max_length=128, label_map=None):
        if len(texts) != len(sentiments):
            raise ValueError(
                f'texts and sentiments must have the same length, '
                f'got {len(texts)} and {len(sentiments)}'
            )
        self.texts = texts
        self.sentiments = sentiments
        self.max_length = max_length
        self.label_map = label_map
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a pandas object or plain sequence."""
        # DataLoader samplers produce positions 0..len-1. On a pandas Series,
        # ``values[idx]`` would be a *label* lookup and fails (or returns the
        # wrong row) whenever the index is not the default 0..n-1 range.
        positional = getattr(values, 'iloc', None)
        return values[idx] if positional is None else positional[idx]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 1-D 'input_ids', 'attention_mask' and a scalar 'sentiment' tensor.

        Raises:
            KeyError: If the item's sentiment label is not in the label mapping.
        """
        text = str(self._item_at(self.texts, idx))
        sentiment = str(self._item_at(self.sentiments, idx))
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch dimension; flatten it away
        # so the DataLoader can stack items into (batch, max_length).
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        mapping = self.label_map if self.label_map is not None else label_encoding

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'sentiment': torch.tensor(mapping[sentiment])  # Convert sentiment to tensor
        }

In [None]:
# Wrap each split's 'text' and 'sentiment' columns in a SentimentDataset
train_dataset = SentimentDataset(texts=train_df['text'], sentiments=train_df['sentiment'])
test_dataset = SentimentDataset(texts=test_df['text'], sentiments=test_df['sentiment'])

# Batch the datasets: training batches are shuffled, testing batches are not
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [None]:
# Module-level tokenizer for the 'bert-base-uncased' checkpoint
# (this is the instance written to disk when the notebook saves its artifacts)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [None]:
# Define the BERT model for sequence classification.
# The classification head is sized from `label_encoding`, not from the labels
# that happen to occur in train_df: the dataset emits class ids taken from
# `label_encoding`, so the logits must cover every id in that mapping even if
# a class is absent from (or a stray/NaN value is present in) the training CSV.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoding),
)

# NOTE: the training loop passes `labels=` to the model and reads `outputs.loss`,
# so `criterion` is not referenced by the code visible in this notebook.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [None]:
# Training configuration
num_epochs = 2

# Pick the best available accelerator automatically:
# CUDA if present, otherwise Apple's Metal (MPS), otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Move the model's parameters to the selected device
model.to(device)

In [None]:
# Set up DataLoader (batch size 20).
# Shuffle only the training split; the test split is iterated in a fixed order
# so that evaluation batches are the same on every pass.
# NOTE: the training loop in this notebook iterates `train_dataloader` /
# `test_dataloader` (batch size 8), not these two loaders.
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=20, shuffle=False)

In [None]:
# Lists to store training and test metrics for each epoch
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []


def _run_epoch(model, loader, device, desc, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    Args:
        model: Model called as ``model(**inputs, labels=labels)`` whose output
            exposes ``.loss`` and ``.logits``.
        loader: Iterable of batches; each batch is a dict of tensors with a
            'sentiment' entry holding the labels.
        device: Device the batch tensors are moved to.
        desc: Text shown on the tqdm progress bar.
        optimizer: When given, the pass trains the model (train mode, backward
            pass and optimizer step per batch). When ``None``, the pass only
            evaluates (eval mode, gradients disabled).

    Returns:
        Tuple of the loss averaged over batches and the fraction of correctly
        predicted samples.

    Raises:
        ValueError: If ``loader`` yields no samples, since the metrics would
            otherwise fail with an opaque ZeroDivisionError.
    """
    training = optimizer is not None
    if training:
        model.train()
    else:
        model.eval()

    running_loss = 0.0
    n_batches = 0
    n_correct = 0
    n_samples = 0

    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, desc=desc):
            # Everything except the labels is forwarded to the model as keyword inputs.
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'sentiment'}
            labels = batch['sentiment'].to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            running_loss += loss.item()
            n_batches += 1

            predicted = outputs.logits.argmax(dim=1)
            n_correct += (predicted == labels).sum().item()
            n_samples += labels.size(0)

            if training:
                loss.backward()
                optimizer.step()

    if n_batches == 0 or n_samples == 0:
        raise ValueError(f'{desc}: the data loader produced no samples')
    return running_loss / n_batches, n_correct / n_samples


# Training loop
for epoch in range(num_epochs):

    # Train the model
    train_loss, train_accuracy = _run_epoch(
        model,
        train_dataloader,
        device,
        desc=f'Training Epoch {epoch + 1}/{num_epochs}',
        optimizer=optimizer,
    )
    train_accuracies.append(train_accuracy)
    train_losses.append(train_loss)

    # Evaluate on the test dataset
    test_loss, test_accuracy = _run_epoch(
        model,
        test_dataloader,
        device,
        desc=f'Testing Epoch {epoch + 1}/{num_epochs}',
    )
    test_accuracies.append(test_accuracy)
    test_losses.append(test_loss)

    # Print training and test metrics
    print(f'Epoch {epoch + 1}/{num_epochs}:')
    print(f'Training Accuracy: {train_accuracy}, Training Loss: {train_loss}')
    print(f'Test Accuracy: {test_accuracy}, Test Loss: {test_loss}')

# Visualise the per-epoch metric history: loss on the left, accuracy on the right
epochs = range(1, num_epochs + 1)

fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel: axes, the two curves with their legend labels, y-label, title
panels = (
    (loss_ax,
     (train_losses, 'Training Loss'),
     (test_losses, 'Test Loss'),
     'Loss',
     'Training and Test Loss'),
    (accuracy_ax,
     (train_accuracies, 'Training Accuracy'),
     (test_accuracies, 'Test Accuracy'),
     'Accuracy',
     'Training and Test Accuracy'),
)

for ax, train_curve, test_curve, y_label, panel_title in panels:
    for values, curve_label in (train_curve, test_curve):
        ax.plot(epochs, values, label=curve_label)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Persist the fine-tuned model and the tokenizer, each into its own directory
for artifact, target_dir in (
    (model, 'location/model_name'),
    (tokenizer, 'location/tokenizer_name'),
):
    artifact.save_pretrained(target_dir)