In [15]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm


# Download NLTK resources if not already present.
# clean_text() reads the 'stopwords' corpus; when the package is already
# installed NLTK just reports it as up to date.
nltk.download('stopwords')
# NOTE: this repeats the `from nltk.corpus import stopwords` import at the top of the file.
from nltk.corpus import stopwords

# Lazily populated cache of NLTK's English stop words, shared by every
# clean_text() call so the corpus is read and converted to a set only once.
_ENGLISH_STOP_WORDS = None


# Function to clean text
def clean_text(text, stop_words=None):
    """Normalise raw text for modelling.

    Lower-cases the text, replaces every character that is not an ASCII
    letter or digit with a space, collapses whitespace and removes stop words.

    Args:
        text: Text to clean. ``None`` and NaN (the value pandas uses for an
            empty CSV cell) are treated as an empty string; any other
            non-string value is converted with ``str()``.
        stop_words: Optional collection of lower-case words to drop.
            Defaults to NLTK's English stop-word list.

    Returns:
        The cleaned text as a single string of space-separated words.
    """
    global _ENGLISH_STOP_WORDS

    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ""
        text = str(text)

    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # split() already collapses runs of whitespace, so no extra regex is needed.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Load dataset
file_path = 'data/reddit_jokes_slim_processed.csv'  # Update with your file path
df = pd.read_csv(file_path)

# Clean the body and the title separately, then join them into one model input.
# pandas reads empty CSV cells as NaN (a float), which clean_text() cannot
# lower-case and which would turn the concatenated text into NaN as well, so
# missing cells are replaced with '' first.
for column in ('thread_selftext', 'thread_title'):
    df[f'{column}_clean'] = df[column].fillna('').apply(clean_text)
df['cleaned_text'] = df['thread_title_clean'] + ' ' + df['thread_selftext_clean']

# Binary target: 1 when the upvote ratio is strictly above the median, else 0.
median_value = df['thread_upvote_ratio'].median()
df['binary_class'] = (df['thread_upvote_ratio'] > median_value).astype(int)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['binary_class'],
    test_size=0.2,
    random_state=42,
)

# BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class JokesDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str()``.
        labels: Indexable collection of labels aligned with ``texts``; each
            item is converted with ``int()``.
        tokenizer: Object exposing ``encode_plus`` that returns
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, flattened token tensors and label for index ``item``."""
        raw_text = str(self.texts[item])
        raw_label = self.labels[item]

        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'return_attention_mask': True,
            'return_tensors': 'pt',
            'truncation': True,
        }
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        sample = {'text': raw_text}
        # The tokenizer returns a leading batch axis of size 1; flatten it away
        # so the DataLoader can stack samples into (batch, max_len) tensors.
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['labels'] = torch.tensor(int(raw_label), dtype=torch.long)
        return sample

# Create dataset objects
max_len = 128
batch_size = 16
train_data = JokesDataset(X_train.to_numpy(), y_train.to_numpy(), tokenizer, max_len=max_len)
# Reshuffle the training examples every epoch so batches differ between epochs.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
test_data = JokesDataset(X_test.to_numpy(), y_test.to_numpy(), tokenizer, max_len=max_len)
# Evaluation order does not matter, so the test loader keeps the default (no shuffle).
test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

# Setting device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)


# Setting up training parameters
epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
# The scheduler decays the learning rate linearly over this many steps, so it
# must equal the number of scheduler.step() calls: one per batch, per epoch.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training Function
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Every batch is visited exactly once, with one optimizer step and one
    scheduler step per batch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a 0-dim float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        mean of the per-batch losses.
    """
    model.train()
    losses = []
    # Start from a tensor (not a Python int) so .double() below also works
    # when the loader yields no batches.
    correct_predictions = torch.tensor(0, device=device)
    # tqdm takes the total from len(data_loader) when the loader defines it.
    tqdm_data_loader = tqdm(data_loader, desc="Training", leave=False)

    for d in tqdm_data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        tqdm_data_loader.set_postfix(loss=losses[-1])

    return correct_predictions.double() / n_examples, np.mean(losses)

# Evaluation Function
def eval_model(model, data_loader, device, n_examples):
    """Evaluate ``model`` on one pass over ``data_loader`` without gradients.

    Every batch is scored exactly once, so the correct-prediction count can
    never exceed the number of examples in the loader.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        device: Device the batch tensors are moved to.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a 0-dim float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        mean of the per-batch losses.
    """
    model.eval()
    losses = []
    # Start from a tensor (not a Python int) so .double() below also works
    # when the loader yields no batches.
    correct_predictions = torch.tensor(0, device=device)
    tqdm_data_loader = tqdm(data_loader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for d in tqdm_data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(outputs.loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)


# Training and Evaluation
# Each epoch runs one training pass and then scores the model on the held-out
# test split; those test-split metrics are what the output labels "Val".
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(train_data),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model=model,
        data_loader=test_loader,
        device=device,
        n_examples=len(test_data),
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

# Final report: collect the model's predictions over the whole test loader and
# print per-class precision/recall/F1.
y_pred = []
y_true = []

model.eval()
with torch.no_grad():
    for d in test_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        # No labels are passed: only the logits are needed here, not the loss.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        y_pred.extend(preds.detach().cpu().numpy())
        # The labels are only compared on the host, so they are never moved
        # to `device`.
        y_true.extend(d["labels"].detach().cpu().numpy())

# Printing needs no gradient context, so it sits outside the no_grad block.
print(classification_report(y_true, y_pred))





[nltk_data] Downloading package stopwords to /home/ali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------


                                                  

KeyboardInterrupt: 