In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import re

# Load the Reddit jokes CSV. The code below reads the 'thread_title',
# 'thread_selftext' and 'thread_upvote_ratio' columns from this frame.
df = pd.read_csv('data/reddit_jokes_slim_processed.csv')



# Basic preprocessing and cleaning
# One run of anything that is not a lowercase ASCII letter or digit (this
# includes whitespace), compiled once because the function is applied per row.
_NON_ALNUM_RUN = re.compile(r"[^a-z0-9]+")


def preprocess_text(text):
    """Normalise raw text to lowercase alphanumeric words separated by single spaces.

    The text is lowercased, every run of characters other than ``a-z`` / ``0-9``
    (punctuation, whitespace, non-ASCII) is replaced by exactly one space, and
    leading/trailing spaces are removed.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string; ``""`` when the input is missing or contains no
        alphanumeric characters.
    """
    # `text != text` is only true for NaN, so this needs no pandas/numpy helper.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    # Replacing whole runs (instead of single characters) guarantees that no
    # double spaces are left behind where punctuation sat next to whitespace.
    return _NON_ALNUM_RUN.sub(" ", lowered).strip()

# Apply preprocessing to the dataset.
# Missing titles/bodies are NaN (a float) after read_csv; replace them with ''
# first so the cleaning function always receives a str instead of failing on
# the first empty cell.
df['thread_title'] = df['thread_title'].fillna('').apply(preprocess_text)
df['thread_selftext'] = df['thread_selftext'].fillna('').apply(preprocess_text)
# Title and body are concatenated into the single text field fed to the model.
df['text'] = df['thread_title'] + " " + df['thread_selftext']
# Bucket the upvote ratio into quantile bins and use the bin number as the
# class label. duplicates='drop' merges bins whose edges coincide, so the
# column may end up with FEWER than 10 distinct labels.
df['upvote_ratio'] = pd.qcut(df['thread_upvote_ratio'], 10, labels=False, duplicates='drop')

class JokesDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of input texts (list, numpy array, pandas Series, ...).
        labels: Sequence of class labels convertible with ``int()``, aligned
            with ``texts`` by position.
        tokenizer: Object exposing ``encode_plus`` in the Hugging Face
            tokenizer style (called with ``return_tensors='pt'``).
        max_length: Every example is padded or truncated to this many tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise both inputs as plain lists so that __getitem__ indexes by
        # POSITION. `series[idx]` on a pandas Series is a label lookup, which
        # fails (or returns the wrong row) when the index is not 0..n-1, e.g.
        # right after train_test_split without reset_index.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer's tensors flattened to 1-D) and ``'labels'`` (a scalar
            ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # flatten() turns the tokenizer's tensors into 1-D per-example tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Hyperparameters
MAX_LENGTH = 128  # tokens per example; longer texts are truncated, shorter ones padded
BATCH_SIZE = 16
EPOCHS = 3
# Size of the classification head = highest label id + 1. Derived from the data
# rather than hard-coded, because the quantile binning (duplicates='drop') can
# produce fewer than the 10 requested bins.
NUM_LABELS = int(df['upvote_ratio'].max()) + 1

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split the data into training and validation sets (90% / 10%, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(df['text'], df['upvote_ratio'], test_size=0.1, random_state=42)
# Reset index after train-test split: the split keeps the original row labels,
# and the dataset is indexed with 0..n-1.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Create datasets
train_dataset = JokesDataset(X_train, y_train, tokenizer, MAX_LENGTH)
val_dataset = JokesDataset(X_val, y_val, tokenizer, MAX_LENGTH)

# Load pre-trained BERT model with a freshly initialised classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch"  # Evaluate at the end of each epoch
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
# NOTE(review): the output captured below this cell shows loss 0.0 from epoch
# ~0.48 onward and eval_loss nan, i.e. that run diverged. The cause cannot be
# determined from this file -- check the labels and device numerics before
# trusting any metrics.
trainer.train()

# Evaluate the model
# predict() returns (logits, label ids, metrics); the label ids come from the
# same dataset pass as the logits, so they are aligned with the predictions.
predictions, labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=1)
print(classification_report(labels, predictions))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                        
  0%|          | 0/6282 [11:01<?, ?it/s]            

{'loss': 1.521, 'learning_rate': 5e-05, 'epoch': 0.24}


                                        
  0%|          | 0/6282 [18:27<?, ?it/s]             

{'loss': 0.0, 'learning_rate': 4.567623659633345e-05, 'epoch': 0.48}


                                        
  0%|          | 0/6282 [25:59<?, ?it/s]             

{'loss': 0.0, 'learning_rate': 4.13524731926669e-05, 'epoch': 0.72}


                                        
  0%|          | 0/6282 [33:37<?, ?it/s]             

{'loss': 0.0, 'learning_rate': 3.7028709789000346e-05, 'epoch': 0.96}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                        

[A[A                           

{'eval_loss': nan, 'eval_runtime': 36.6881, 'eval_samples_per_second': 101.45, 'eval_steps_per_second': 6.351, 'epoch': 1.0}


                                        
  0%|          | 0/6282 [42:23<?, ?it/s]           

{'loss': 0.0, 'learning_rate': 3.27049463853338e-05, 'epoch': 1.19}


                                        
  0%|          | 0/6282 [50:27<?, ?it/s]           

{'loss': 0.0, 'learning_rate': 2.838118298166724e-05, 'epoch': 1.43}


