In [3]:
# 1. Imports
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import kagglehub

In [4]:
# 2. Load dataset from KaggleHub
# The returned location is used below as the folder that contains the CSV.
path = kagglehub.dataset_download(
    "smagnan/1-million-reddit-comments-from-40-subreddits"
)

# Read the May 2019 comment dump into a DataFrame.
csv_path = f"{path}/kaggle_RC_2019-05.csv"
df = pd.read_csv(csv_path, low_memory=False)

In [5]:
# Sample 30,000 comments for faster training (fixed seed for reproducibility).
# min() keeps the cell working if the source frame has fewer rows than requested.
N_SAMPLES = 30_000
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=42)

# Rows whose comment text was parsed as missing (NaN) cannot be tokenized later,
# so drop them before the index is rebuilt.
df = df.dropna(subset=['body']).reset_index(drop=True)

# Print dataset shape and head
print(df.shape)
print(df.head())

(30000, 4)
        subreddit                                               body  \
0     apexlegends  How ironic that you're being indignant on the ...   
1  ChapoTrapHouse  I started work in 99 when the boomers we're st...   
2        Market76  Any combination of the following:\n\nAAE Pump ...   
3       worldnews  Compare pharma's marketing budget to it's R&am...   
4       worldnews                Wasn't it an illegitimate election?   

   controversiality  score  
0                 0      0  
1                 0      2  
2                 0      1  
3                 0      1  
4                 0      2  


In [6]:
# 3. Convert score into class labels
def convert_score(score):
    """Map a comment's vote score to a 3-way class id.

    Returns 0 ("Negative") for scores below 1, 1 ("Neutral") for a score of
    exactly 1, and 2 ("Positive") for everything else.
    """
    if score < 1:
        return 0
    return 1 if score == 1 else 2


In [7]:
# Derive the class id for every comment from its vote score.
df['label'] = df['score'].map(convert_score)

In [8]:
# 4. Train-validation split
# Stratify on the label so both splits keep the same class proportions;
# the score-derived classes are not guaranteed to be balanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['body'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# 5. Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 has no dedicated padding token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token


In [9]:
# 6. Define custom Dataset
class RedditDataset(Dataset):
    """Torch dataset that pre-tokenizes comment texts and pairs them with labels.

    Args:
        texts: list of comment strings.
        labels: list of integer class ids, one per text.
        tokenizer: callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=max_len)``
            that returns a mapping of field name -> per-example sequences.
        max_len: maximum token length passed to the tokenizer.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # A length mismatch would otherwise only surface as an IndexError
        # somewhere in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        # Tokenize everything once up front; __getitem__ then only slices.
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including its ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Class ids are stored as int64, the dtype the classification loss expects.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)


In [10]:
# 7. Create dataset objects
# Build the training and validation datasets with the same tokenizer settings.
train_dataset, val_dataset = (
    RedditDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)


In [15]:
# 8. Load GPT-2 for Sequence Classification
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained GPT-2 body with a 3-way classification head on top.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)
# Tell the model which token id marks padding (EOS is reused for that).
model.config.pad_token_id = model.config.eos_token_id

# Move to the device and switch to training mode; both calls return the model,
# so the cell still ends by displaying its architecture.
model.to(device).train()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

In [14]:

# 9. Training Arguments
training_args = TrainingArguments(
    output_dir='./results',           # checkpoints are written here
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,               # keep only the newest checkpoints so disk use stays bounded
    seed=42,                          # explicit seed for reproducible runs
    logging_dir='./logs',
    report_to="none"                  # do not report to external experiment trackers
    # evaluation_strategy and save_strategy are intentionally not set, for compatibility
)

In [16]:
# 10. Trainer
def compute_metrics(eval_pred):
    """Report accuracy in addition to the loss during evaluation.

    ``eval_pred.predictions`` is treated as the logits array and
    ``eval_pred.label_ids`` as the true class ids.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # without this, evaluate() only reports loss
)

In [None]:

# 11. Train the model
# Fine-tunes `model` on `train_dataset` using the settings in `training_args`.
trainer.train()

# 12. Evaluate the model
# Runs the trainer's evaluation on its eval_dataset and prints the returned metrics dict.
metrics = trainer.evaluate()
print(metrics)

Step,Training Loss


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Predict on validation set
predictions = trainer.predict(val_dataset)

# Extract logits and true labels
logits = predictions.predictions
y_pred = logits.argmax(axis=1)
y_true = predictions.label_ids

# Class ids in label order, with their display names.
class_ids = [0, 1, 2]
classes = ['Negative', 'Neutral', 'Positive']

# Get the classification report as a dictionary.
# Passing `labels` pins each row to its class id even if a class is absent from
# the validation data and predictions; `zero_division=0` reports 0 instead of
# emitting a warning when a class is never predicted.
report_dict = classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=classes,
    output_dict=True,
    zero_division=0,
)

# Extract relevant metrics. Named `metric_names` so the `metrics` dict returned
# by trainer.evaluate() is not overwritten with a list.
metric_names = ['precision', 'recall', 'f1-score']
data = np.array([[report_dict[c][m] for m in metric_names] for c in classes])

# Build the grouped bar chart: one group per class, one bar per metric.
fig, ax = plt.subplots(figsize=(10, 5))
width = 0.2  # Width of each bar
x = np.arange(len(classes))

# One bar series per metric: shifted left, centred, and shifted right in each group.
bar_specs = [
    (x - width, 0, 'Precision'),
    (x, 1, 'Recall'),
    (x + width, 2, 'F1-score'),
]
rects1, rects2, rects3 = [
    ax.bar(positions, data[:, column], width, label=series_name)
    for positions, column, series_name in bar_specs
]

# Axis labelling, title, and legend
ax.set(ylabel='Scores', title='Classification Report')
ax.set_xticks(x)
ax.set_xticklabels(classes)
ax.legend()

# Add value labels on top of bars
def autolabel(rects):
    """Write each bar's height (two decimals) just above the bar on ``ax``."""
    label_style = dict(
        xytext=(0, 3),  # 3 points vertical offset
        textcoords="offset points",
        ha='center', va='bottom',
    )
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(f'{bar_height:.2f}', xy=(bar_center, bar_height), **label_style)

# Label every bar series in plotting order.
for bar_series in (rects1, rects2, rects3):
    autolabel(bar_series)

# Display the plot
plt.tight_layout()
plt.show()

In [None]:
# Function to predict sentiment from user input
def predict_user_input(text, model, tokenizer, device):
    """Classify a single comment and return its label name.

    Args:
        text: the comment to classify (one string).
        model: sequence-classification model; it is called with the tokenizer
            output and must return an object exposing ``.logits``.
        tokenizer: tokenizer used to encode ``text`` into PyTorch tensors.
        device: device the encoded inputs are moved to before the forward pass.

    Returns:
        "Negative", "Neutral" or "Positive" for predicted class 0, 1 or 2.

    Raises:
        ValueError: if ``text`` encodes to zero tokens (e.g. an empty string).
    """
    model.eval()  # inference mode: disables dropout
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # An empty comment yields a zero-length sequence that the model cannot
    # classify; fail early with a clear message instead.
    if inputs["input_ids"].shape[-1] == 0:
        raise ValueError("Cannot predict on empty text: no tokens were produced.")

    with torch.no_grad():
        logits = model(**inputs).logits
        pred = torch.argmax(logits, dim=1).item()

    label_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_mapping[pred]

# Ask the user for a comment to classify.
user_text = input("\nEnter a comment for sentiment prediction: ")

# Run the model on that comment.
predicted_label = predict_user_input(
    text=user_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Show the result.
print(f"\nPredicted Sentiment: {predicted_label}")
