In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def load_data(filepath):
    """Load review texts and sentiment labels from a CSV file.

    Only rows that lack a value in the ``Text`` or ``Sentiment`` column are
    discarded. Missing values in any other column do not cause an otherwise
    usable review to be dropped.

    Args:
        filepath: Path (or any source accepted by ``pandas.read_csv``) of a
            CSV file that contains ``Text`` and ``Sentiment`` columns.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists, in file order.
        Every element of ``texts`` is a ``str``.

    Raises:
        KeyError: If the ``Text`` or ``Sentiment`` column is absent.
    """
    data = pd.read_csv(filepath)
    # Restrict the null check to the two columns that are actually used.
    data = data.dropna(subset=['Text', 'Sentiment'])
    # read_csv may infer a non-string dtype (e.g. purely numeric reviews);
    # downstream regex cleaning and tokenization require real strings.
    texts = data['Text'].astype(str).tolist()
    labels = data['Sentiment'].tolist()
    return texts, labels

# Read the raw review texts and their sentiment labels from disk.
texts, labels = load_data(filepath='../data/reviews.csv')

In [None]:
import re

def clean_text(text):
    """Strip URLs, @mentions and #hashtags from ``text`` and tidy whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with every whitespace run collapsed to a single
        space and no leading or trailing whitespace.
    """
    # Removal order matters only for readability: URLs, mentions, hashtags.
    for pattern in (r'http\S+', r'@\w+', r'#\w+'):
        text = re.sub(pattern, '', text)
    # The removals above leave gaps behind; squeeze them out last.
    return re.sub(r'\s+', ' ', text).strip()

# Run every review through clean_text, keeping the original order
texts = list(map(clean_text, texts))

In [None]:
import emoji

def remove_emojis(text):
    """Return ``text`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` is not available in current releases of the
    ``emoji`` package, so the supported ``emoji.replace_emoji`` helper is
    used when present. The regexp-based path is kept as a fallback for older
    installations that do not provide ``replace_emoji``.

    Args:
        text: The string to strip emojis from.

    Returns:
        A copy of ``text`` without emoji characters.
    """
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(text, replace='')
    # Legacy API path for old versions of the emoji package.
    return emoji.get_emoji_regexp().sub('', text)

# texts = [remove_emojis(t) for t in texts]

In [None]:
from spellchecker import SpellChecker

# Shared SpellChecker instance, created once and reused by correct_spelling.
spell = SpellChecker()

def correct_spelling(text):
    """Spell-correct ``text`` word by word using the shared ``spell`` checker.

    The text is split on whitespace, each token is passed to
    ``spell.correction`` and the results are re-joined with single spaces.
    When the checker has no suggestion (it returns ``None``), the original
    token is kept so the join cannot fail and no word is lost.

    Args:
        text: The string to correct.

    Returns:
        The corrected text, with words separated by single spaces.
    """
    return ' '.join(spell.correction(word) or word for word in text.split())

# texts = [correct_spelling(t) for t in texts]

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=42, test_size=0.2)

In [None]:
# Initialize BERT tokenizer and model

# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenizer and classifier must come from the same pretrained checkpoint.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=2)

In [None]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)

# Move the model's parameters onto the selected device.
model.to(device)

In [None]:
# Tokenize both splits with identical settings (truncate/pad, at most 128 tokens)
tokenizer_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(X_train, **tokenizer_options)
test_encodings = tokenizer(X_test, **tokenizer_options)

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values (one entry per example).
        labels: Indexable collection of labels; its length defines the size
            of the dataset.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the example at ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor, and the label is added
        under the ``'labels'`` key as a ``torch.long`` tensor.
        """
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and labels in a SentimentDataset
train_dataset, test_dataset = (
    SentimentDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (test_encodings, y_test),
    )
)

In [None]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation result.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth returned element is not needed here.
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = scores[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [None]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',          # Directory where checkpoints and outputs are written
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=8,   # Batch size per device during training
    per_device_eval_batch_size=8,    # Batch size per device during evaluation
    warmup_steps=500,                # Number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=50,                # Log every 50 steps
    evaluation_strategy='epoch',     # Evaluate at the end of each epoch
    save_strategy='epoch',           # Save a checkpoint at the end of each epoch
    save_total_limit=1,              # Keep at most one checkpoint on disk
    disable_tqdm=True,               # Disable progress bars to reduce log size
    gradient_accumulation_steps=4,   # Accumulate gradients over 4 steps (8 x 4 = 32 samples per update, per device)
)

In [None]:
# Wire the model, its training configuration, both datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the model using the Trainer built from training_args, train_dataset and test_dataset.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (test_dataset) and print what evaluate() returns.
eval_result = trainer.evaluate()
print(eval_result)

In [None]:
# Example prediction
def predict(text):
    """Classify a single piece of text with the fine-tuned model.

    The inputs are moved to whatever device the model currently lives on
    (``model.device``), not to the notebook-level ``device`` variable. That
    variable only distinguishes CUDA from CPU, while the Trainer may have
    placed the model on a different accelerator; relying on it could raise a
    device-mismatch error at inference time.

    Args:
        text: The string to classify (truncated to 128 tokens).

    Returns:
        int: Index of the class with the highest logit.
    """
    encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    # Follow the model rather than a global that may be stale.
    target_device = model.device
    encoding = {key: val.to(target_device) for key, val in encoding.items()}
    model.eval()  # inference mode (e.g. disables dropout)
    with torch.no_grad():  # no gradients are needed for prediction
        logits = model(**encoding).logits
    return logits.argmax(-1).item()

# Smoke-test predict() on one hand-written sentence and print the predicted class index
sample_text = "I will blow these things up, not because they're made by these people. Just because they're weird."
print("Predicted class:", predict(sample_text))