In [6]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Location of the IMDb reviews CSV
file_path = "~/Documents/FA24/usdjourney/IMDB Dataset.csv"  # Adjust the path accordingly

# Read the CSV, derive a binary 'label' column ('positive' -> 1, anything
# else -> 0), and keep a reproducible 10% sample for faster training.
df = (
    pd.read_csv(file_path)
    .assign(label=lambda frame: frame['sentiment'].eq('positive').astype('int64'))
    .sample(frac=0.1, random_state=42)
)

# Hold out 20% of the sampled rows for evaluation (fixed seed for repeatability)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizer matching the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization function
def tokenize_data(data, tokenizer, max_length=128):
    """Tokenize the ``'review'`` column of ``data`` in one tokenizer call.

    Args:
        data: Object whose ``data['review']`` supports ``.tolist()``; the
            callers in this file pass pandas DataFrames.
        tokenizer: Callable that receives the list of review texts followed
            by the keyword options assembled below.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for the batch. PyTorch tensors are
        requested through ``return_tensors='pt'``.
    """
    review_texts = data['review'].tolist()
    encode_options = {
        'add_special_tokens': True,
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(review_texts, **encode_options)

# Tokenize the training and test data with a smaller max_length of 128
train_encodings = tokenize_data(train_df, tokenizer, max_length=128)
test_encodings = tokenize_data(test_df, tokenizer, max_length=128)

# Convert labels to tensors
train_labels = torch.tensor(train_df['label'].values)
test_labels = torch.tensor(test_df['label'].values)

# Create DataLoader for training and testing.
# Each dataset item is an (input_ids, attention_mask, label) triple; the
# training and evaluation loops unpack batches in that order.
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Only the training data is shuffled; evaluation order does not matter.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Load the pre-trained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Set up the optimizer.
# torch.optim.AdamW is used instead of the AdamW class exported by
# transformers, which is deprecated upstream in favour of the PyTorch one.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are meant to mirror the defaults
# of the deprecated class so training behaves comparably -- confirm against
# the transformers documentation.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Move the model to the CPU
device = torch.device("cpu")
model.to(device)

# Training loop
def train_model(model, train_dataloader, optimizer, device, num_epochs=2):
    """Fine-tune ``model`` on ``train_dataloader`` and report the mean loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes a ``.loss`` tensor.
        train_dataloader: Iterable of batches, each unpacking into
            ``(input_ids, attention_mask, labels)`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device every batch tensor is moved to before the forward pass.
        num_epochs: Number of passes over ``train_dataloader``. Defaults to 2
            to keep training time short.

    Returns:
        A list with the mean training loss of each epoch, in order.

    Raises:
        ValueError: If an epoch yields no batches, since a mean loss cannot
            be computed.
    """
    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            num_batches += 1

        # Counting batches (rather than calling len()) lets an empty loader be
        # reported clearly instead of failing with a division by zero.
        if num_batches == 0:
            raise ValueError("train_dataloader yielded no batches; nothing to train on")

        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

    return epoch_losses

# Evaluation function
def evaluate_model(model, test_dataloader, device):
    """Run ``model`` over ``test_dataloader`` and print classification metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``; it is switched to eval mode first.
        test_dataloader: Iterable of batches, each unpacking into
            ``(input_ids, attention_mask, labels)`` tensors.
        device: Device every batch tensor is moved to before the forward pass.

    Prints accuracy, then precision, recall and F1 computed with
    ``average='binary'``. Nothing is returned.
    """
    model.eval()
    predicted, expected = [], []

    # Gradients are not needed for inference.
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            # The predicted class is the index of the largest logit per row.
            batch_predictions = logits.argmax(dim=1)

            predicted.extend(batch_predictions.cpu().numpy())
            expected.extend(labels.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1 = precision_recall_fscore_support(
        expected, predicted, average='binary'
    )[:3]

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}, Recall: {recall}, F1-Score: {f1}")

# Fine-tune the model on the training batches
train_model(
    model=model,
    train_dataloader=train_dataloader,
    optimizer=optimizer,
    device=device,
)

# Report metrics on the held-out test batches
evaluate_model(
    model=model,
    test_dataloader=test_dataloader,
    device=device,
)

# Sample prediction
def predict_sentiment(review, model, tokenizer, device, max_length=128):
    """Classify a single review as ``"Positive"`` or ``"Negative"``.

    Args:
        review: Text handed to ``tokenizer``.
        model: Callable accepting the tokenizer's fields as keyword arguments
            and returning an object with ``.logits``; it is switched to eval
            mode first.
        tokenizer: Callable returning a mapping of tensors for ``review``.
        device: Device the encoded tensors are moved to.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        ``"Positive"`` when the largest logit is at index 1, otherwise
        ``"Negative"``.
    """
    model.eval()
    encoded = tokenizer(review, return_tensors='pt', truncation=True, padding=True, max_length=max_length)

    # Move every encoded field onto the target device.
    model_inputs = {}
    for field_name, tensor in encoded.items():
        model_inputs[field_name] = tensor.to(device)

    with torch.no_grad():
        predicted_class = model(**model_inputs).logits.argmax(dim=1).item()

    if predicted_class == 1:
        return "Positive"
    return "Negative"

# Example prediction: the classifier only has two classes, so any input text
# is reported as either "Positive" or "Negative".
review = "What is the likelihood that Elias retains employment with EDF?"
sentiment = predict_sentiment(
    review=review,
    model=model,
    tokenizer=tokenizer,
    device=device,
)
print(f"Predicted sentiment: {sentiment}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.4473304640054703
Epoch 2, Loss: 0.2520302815437317
Accuracy: 0.837
Precision: 0.7800338409475466, Recall: 0.9331983805668016, F1-Score: 0.8497695852534562
Predicted sentiment: Negative
