In [38]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [39]:
#!pip install numpy<2.0

Import and Installations

In [40]:
!pip install transformers datasets scikit-learn pandas nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import re
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

Load Pretrained Model and Tokenizer

In [42]:
model_name = "cardiffnlp/twitter-roberta-base-2021-124m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2021-124m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load the data set

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Load train data
train_path = "/content/drive/MyDrive/Project Data/train.csv"
#train_path = "Project Data-20250507/train.csv"
train_df = pd.read_csv(train_path)

# train_df = pd.read_csv("Project Data/train.csv")
# test_df = pd.read_csv("Project Data/test.csv")

# View shape and features
print("Training data shape:", train_df.shape)
print("\nTraining data columns:", train_df.columns.tolist())

# Display first few rows
print("\nFirst 5 rows of training data:")
train_df.head()

Training data shape: (9543, 2)

Training data columns: ['text', 'label']

First 5 rows of training data:


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [44]:
# Load datasets
#test_df = pd.read_csv("/content/drive/MyDrive/Project Data/test.csv")

Use the preprocessing suggested by Cardiff NLP (which is optimized for tweets):

In [45]:
# Preprocessing tweets
def preprocess_tweet(tweet):
    tweet = re.sub(r'http\S+', '', tweet)  # remove links
    tweet = re.sub(r'@\w+', '@user', tweet)  # anonymize mentions
    tweet = re.sub(r'#', '', tweet)  # remove hashtag symbol
    return tweet.strip()

train_df['text'] = train_df['text'].apply(preprocess_tweet)
test_df['text'] = test_df['text'].apply(preprocess_tweet)

In [46]:
train_df.head()

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral,0
4,$FNKO - Funko slides after Piper Jaffray PT cut,0


# Tokenize and and Model

In [47]:
# Tokenizer and Model
model_name = "cardiffnlp/twitter-roberta-base-2021-124m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-2021-124m and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Prepare dataset for HuggingFace
train_ds = Dataset.from_pandas(train_df)
train_ds = train_ds.train_test_split(test_size=0.1)

Tokenization

In [49]:
# Tokenization

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

train_ds = train_ds.map(tokenize_function, batched=True)
train_ds = train_ds.rename_column("label", "labels")
train_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


# Convert test_df to Hugging Face Dataset
test_ds = Dataset.from_pandas(test_df)

# Tokenize the test dataset
test_ds = test_ds.map(tokenize_function, batched=True)

# Format the test dataset for PyTorch
# The test dataset does not have a 'label' column, so we don't rename it
# and we don't include 'labels' when setting the format for the test set.
test_ds.set_format("torch", columns=["input_ids", "attention_mask"])


Map: 100%|██████████| 8588/8588 [00:00<00:00, 35056.66 examples/s]
Map: 100%|██████████| 955/955 [00:00<00:00, 32088.89 examples/s]
Map: 100%|██████████| 2388/2388 [00:00<00:00, 41743.93 examples/s]


In [50]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [51]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro"),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro"),
    }

Prepare Model and Train

In [None]:
# TrainingArguments
training_args = TrainingArguments(
    output_dir="./results/twitter_roberta",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8, # with batch = 16 I got 93 for neutral f1 score
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs/twitter_roberta",
    logging_steps=10,
    save_strategy="no",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds["train"],
    eval_dataset=train_ds["test"]  # Using the test split from train_ds for evaluation
    #eval_dataset=test_ds["text"]   #this might be wrong wtf
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [55]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.3384,0.42323
2,0.3918,0.466927
3,0.2224,0.588254


TrainOutput(global_step=3222, training_loss=0.34006300511449683, metrics={'train_runtime': 843.7646, 'train_samples_per_second': 30.535, 'train_steps_per_second': 3.819, 'total_flos': 1694713523586048.0, 'train_loss': 0.34006300511449683, 'epoch': 3.0})

Evaluate

In [56]:
# Evaluate
predictions = trainer.predict(train_ds["test"])
preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)
print(classification_report(predictions.label_ids, preds, target_names=["Bearish", "Bullish", "Neutral"]))


              precision    recall  f1-score   support

     Bearish       0.77      0.77      0.77       136
     Bullish       0.83      0.86      0.84       197
     Neutral       0.92      0.91      0.92       622

    accuracy                           0.88       955
   macro avg       0.84      0.85      0.84       955
weighted avg       0.88      0.88      0.88       955



In [57]:
# Evaluate
# predictions = trainer.predict(test_ds)
# preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)
# print(classification_report(predictions.label_ids, preds, target_names=["Bearish", "Bullish", "Neutral"]))

In [58]:

# Final Evaluation on validation set
eval_result = trainer.predict(train_ds["test"])
y_true = eval_result.label_ids
y_pred = np.argmax(eval_result.predictions, axis=1)

print("Validation Set Evaluation Report:")
print(classification_report(y_true, y_pred, target_names=["Bearish", "Bullish", "Neutral"]))




Validation Set Evaluation Report:
              precision    recall  f1-score   support

     Bearish       0.77      0.77      0.77       136
     Bullish       0.83      0.86      0.84       197
     Neutral       0.92      0.91      0.92       622

    accuracy                           0.88       955
   macro avg       0.84      0.85      0.84       955
weighted avg       0.88      0.88      0.88       955



In [59]:

# Predict on test set
tokenized_test = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=128, return_tensors='pt')

# Move tensors to correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
tokenized_test = {k: v.to(device) for k, v in tokenized_test.items()}

# Inference
with torch.no_grad():
    outputs = model(**tokenized_test)
    predictions = torch.argmax(outputs.logits, axis=1)

# Show prediction distribution
unique, counts = np.unique(predictions.cpu().numpy(), return_counts=True)
print("Test Set Prediction Distribution:")
for label, count in zip(unique, counts):
    print(f"Label {label}: {count} samples")


Test Set Prediction Distribution:
Label 0: 371 samples
Label 1: 498 samples
Label 2: 1519 samples
