In [1]:
#!pip install transformers datasets evaluate scikit-learn
#!pip install -U transformers
#!pip install "numpy<2.0" --quiet
#!pip install torch
#!pip uninstall keras -y
#!pip install tf-keras
#!pip install "accelerate>=0.26.0"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.utils.class_weight import(
   compute_class_weight,
)
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoConfig,
    DataCollatorWithPadding,
    RobertaForSequenceClassification
)
import torch

  from .autonotebook import tqdm as notebook_tqdm
2025-05-31 12:09:30.909072: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Colab alternative (kept for reference): mount Drive and read the Drive copy.
#from google.colab import drive
#drive.mount('/content/drive')
#train_path = "/content/drive/MyDrive/Text Mining/textmining/Project Data-20250507/train.csv"

# Read the labelled training set from a path relative to the notebook.
train_path = "Project Data-20250507/train.csv"
train_df = pd.read_csv(filepath_or_buffer=train_path)

# Report the frame's dimensions and its column names.
print("Training data shape:", train_df.shape)
print("\nTraining data columns:", list(train_df.columns))

# Preview the first rows (rendered as the cell's rich output).
print("\nFirst 5 rows of training data:")
train_df.head(5)

Training data shape: (9543, 2)

Training data columns: ['text', 'label']

First 5 rows of training data:


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [4]:
# Clean text
def preprocess_text(text):
    """Normalise one raw text string.

    The string is lower-cased, then leading/trailing whitespace is removed,
    then every remaining newline is replaced by a single space.
    """
    lowered = text.lower()
    trimmed = lowered.strip()
    return trimmed.replace('\n', ' ')

# Run the cleaner over every row of the text column.
train_df['text'] = train_df['text'].map(preprocess_text)

# Display first few rows after cleaning
print("\nFirst 5 rows of clean training data:")
train_df.head(5)


First 5 rows of clean training data:


Unnamed: 0,text,label
0,$bynd - jpmorgan reels in expectations on beyo...,0
1,$ccl $rcl - nomura points to bookings weakness...,0
2,"$cx - cemex cut at credit suisse, j.p. morgan ...",0
3,$ess: btig research cuts to neutral https://t....,0
4,$fnko - funko slides after piper jaffray pt cu...,0


In [5]:
# Hold out 20% of the rows for validation, stratified on the label column
# and seeded so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['label'],
    test_size=0.2,
    stratify=train_df['label'],
    random_state=42,
)

# Wrap both splits as Hugging Face datasets under the names the training
# code looks up ('train' / 'validation').
dataset = DatasetDict({
    split_name: Dataset.from_dict({'text': texts, 'label': labels})
    for split_name, texts, labels in (
        ('train', X_train, y_train),
        ('validation', X_val, y_val),
    )
})

In [6]:
# Define training function
def run_model_trial(model_name, dataset, num_labels=3):
    """Fine-tune one pretrained checkpoint and report its validation metrics.

    Args:
        model_name: Hugging Face checkpoint identifier passed to
            ``from_pretrained`` for both the tokenizer and the model.
        dataset: ``DatasetDict`` with ``"train"`` and ``"validation"`` splits,
            each providing ``"text"`` and ``"label"`` columns.
        num_labels: Size of the classification head. Defaults to 3, the value
            this notebook uses everywhere.

    Returns:
        The dict produced by ``classification_report(..., output_dict=True)``
        for the validation split. The plain-text report is printed as well.
    """
    print(f"\n===== Running model: {model_name} =====")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(example):
        # Truncate only. Padding is left to the data collator so each batch is
        # padded to its own longest sequence instead of the model maximum.
        return tokenizer(example["text"], truncation=True)

    tokenized = dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )

    # Checkpoint names may contain '/', which is not wanted in a directory name.
    run_tag = model_name.replace('/', '_')

    args = TrainingArguments(
        output_dir=f"./results/{run_tag}",
        do_train=True,
        do_eval=True,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir=f"./logs/{run_tag}",
        logging_steps=10,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        # Dynamic per-batch padding for the un-padded tokenized examples.
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )

    trainer.train()

    # Predict
    preds = trainer.predict(tokenized["validation"])
    y_pred = np.argmax(preds.predictions, axis=1)
    # Use the labels returned with the predictions so truth and prediction
    # come from the same pass over the data.
    y_true = preds.label_ids

    # Report
    report = classification_report(y_true, y_pred, output_dict=True)
    print(classification_report(y_true, y_pred))

    return report


In [7]:
# Checkpoints to compare, in the order they are trained.
model_names = [
    "bert-base-uncased",
    "distilbert-base-uncased",
    "roberta-base",
    "albert-base-v2",
]

# Reports are stored one checkpoint at a time, keyed by checkpoint name, so
# trials that already finished stay in `results` if a later one is interrupted.
results = {}
for model_name in model_names:
    trial_report = run_model_trial(model_name, dataset)
    results[model_name] = trial_report



===== Running model: bert-base-uncased =====


Map: 100%|██████████| 7634/7634 [00:06<00:00, 1195.71 examples/s]
Map: 100%|██████████| 1909/1909 [00:01<00:00, 1276.56 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Compare model performances: one row per checkpoint, using the overall
# accuracy and the support-weighted precision / recall / F1 from each report.
summary_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1']
summary = [
    {
        'Model': model_name,
        'Accuracy': report['accuracy'],
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1': report['weighted avg']['f1-score'],
    }
    for model_name, report in results.items()
]

# Naming the columns explicitly keeps 'F1' present even when `results` is
# empty (e.g. the trial loop was interrupted before any model finished), so
# the sort below yields an empty table instead of raising KeyError.
comparison_df = pd.DataFrame(summary, columns=summary_columns)
comparison_df.sort_values(by="F1", ascending=False).reset_index(drop=True)


In [None]:
# Recombine X_train and y_train (training split only, so the validation
# split is never oversampled).
train_split_df = pd.DataFrame({'text': X_train, 'label': y_train})

# Number of rows per class, and the size of the largest class.
class_counts = train_split_df['label'].value_counts()
max_size = int(class_counts.max())

# Oversample each minority class up to the majority size.
# Every original row is kept; only the shortfall of each class is drawn with
# replacement. Bootstrapping whole classes (the majority included) would drop
# some unique rows by chance. pandas' own sampler is used, so no extra import
# is needed.
balanced_parts = [train_split_df]
for class_id, class_count in class_counts.items():
    deficit = max_size - int(class_count)
    if deficit > 0:
        class_rows = train_split_df[train_split_df['label'] == class_id]
        balanced_parts.append(
            class_rows.sample(n=deficit, replace=True, random_state=42)
        )
train_df_balanced = pd.concat(balanced_parts)

# Shuffle and reset
train_df_balanced = train_df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Every class should now have exactly `max_size` rows.
assert (train_df_balanced['label'].value_counts() == max_size).all()

# Rebuild X/y
X_train_bal = train_df_balanced['text']
y_train_bal = train_df_balanced['label']


In [None]:
# Create datasets: oversampled training split, untouched validation split.
# Note this rebinds `dataset`; cells above that read `dataset` will see the
# oversampled version if they are re-run after this cell.
dataset = DatasetDict({
    split_name: Dataset.from_dict({'text': texts, 'label': labels})
    for split_name, texts, labels in (
        ('train', X_train_bal, y_train_bal),
        ('validation', X_val, y_val),
    )
})


In [None]:
# Tokenize
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(example):
    """Tokenize a batch of examples from its "text" column.

    Sequences are truncated to the model's maximum length but not padded:
    padding is done per batch by the `DataCollatorWithPadding` that is handed
    to the Trainer, so short texts are not stretched to the model maximum.
    """
    return tokenizer(example["text"], truncation=True)

tokenized = dataset.map(tokenize_function, batched=True)

# Define metrics
def compute_metrics(eval_pred):
    """Turn evaluation outputs into a dict of scalar metrics.

    `eval_pred` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits. Returns accuracy plus
    support-weighted precision, recall and F1 under the keys "accuracy",
    "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    metrics = {"accuracy": accuracy_score(labels, preds)}
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(labels, preds, average="weighted")
    return metrics

# Load model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

# Data collator: pads each batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate at the end of every epoch so `eval_loss` / `eval_f1` are written to
# `trainer.state.log_history`; `do_eval=True` alone does not schedule any
# evaluation during training. The keyword was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases, so
# use whichever name the installed version defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results/roberta_oversampled",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    **{eval_strategy_key: "epoch"},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train!
# Runs fine-tuning with the `trainer` configured above and keeps the call's
# return value in `history`.
# NOTE(review): the return value is presumably a run summary rather than a
# per-step history. The plotting cell reads per-step records from
# `trainer.state.log_history`; confirm before reading attributes off `history`.
history = trainer.train()

In [None]:
# Extract training loss and evaluation metrics.
# Everything the Trainer logged lives in `trainer.state.log_history`, a list
# of dicts; the value returned by `trainer.train()` has no `metrics_history`.
log_history = trainer.state.log_history

# One record per evaluation pass (these carry the "eval_*" keys).
eval_logs = [log for log in log_history if "eval_loss" in log]
eval_loss = [log["eval_loss"] for log in eval_logs]
# "eval_f1" exists only when the Trainer was given a compute_metrics that
# reports "f1"; fall back to NaN instead of raising KeyError.
eval_f1 = [log.get("eval_f1", float("nan")) for log in eval_logs]

# Training loss is logged every `logging_steps` steps, far more often than
# evaluation runs. To get one comparable point per evaluation, average the
# training-loss records logged since the previous evaluation.
training_loss = []
previous_step = 0
for eval_log in eval_logs:
    current_step = eval_log["step"]
    window = [
        log["loss"]
        for log in log_history
        if "loss" in log and previous_step < log["step"] <= current_step
    ]
    training_loss.append(sum(window) / len(window) if window else float("nan"))
    previous_step = current_step

if not eval_logs:
    # Nothing to plot: no evaluation ran during training.
    print("No evaluation records in trainer.state.log_history; "
          "enable per-epoch evaluation in TrainingArguments to plot these curves.")
else:
    # Plot
    epochs = range(1, len(eval_loss) + 1)
    fig, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Loss
    loss_ax.plot(epochs, training_loss, label='Train Loss')
    loss_ax.plot(epochs, eval_loss, label='Val Loss')
    loss_ax.set_title("Loss Over Epochs")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()

    # F1 Score
    f1_ax.plot(epochs, eval_f1, marker='o')
    f1_ax.set_title("Validation F1 Score")
    f1_ax.set_xlabel("Epoch")
    f1_ax.set_ylabel("F1 Score")

    fig.tight_layout()
    plt.show()

In [None]:
# Score the validation split once; keep the reference labels and the
# arg-max class of each row of logits.
predictions = trainer.predict(tokenized["validation"])
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=-1)

# Build the confusion matrix and draw it with readable class names
# (display order follows the label ids 0, 1, 2).
class_names = ["Bearish", "Bullish", "Neutral"]
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues", values_format='d')
plt.title("Confusion Matrix")
plt.show()


In [None]:
!pip install optuna --quiet

In [None]:
import optuna
from transformers import TrainingArguments, Trainer, RobertaForSequenceClassification

# Model factory for tuning
def model_init():
    """Return a newly loaded "roberta-base" classifier with a 3-way head.

    Passed to the Trainer as `model_init`, so each call yields a separate
    model instance loaded from the pretrained checkpoint.
    """
    fresh_model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=3,
    )
    return fresh_model

def optuna_hp_space(trial):
    """Ask `trial` for one value of each tuned hyperparameter.

    Returns a dict keyed by hyperparameter name:
      * "learning_rate": float in [1e-5, 5e-5], log-scaled
      * "weight_decay": float in [0.0, 0.3]
      * "per_device_train_batch_size": one of 8 or 16
      * "num_train_epochs": integer in [2, 4]
    """
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.3)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [8, 16]
    )
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 4)
    return space

# Create base trainer
# Hyperparameters sampled for each trial are applied on top of these base
# arguments. Checkpointing is switched off with a plain `save_strategy="no"`.
# The previous conditional fell back to `save_strategy=None`, which is not a
# valid strategy value, so its "avoid error" branch would itself have failed.
search_args = TrainingArguments(
    output_dir="./optuna_run",
    do_train=True,
    do_eval=True,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",
    disable_tqdm=True,  # reduce clutter
    save_strategy="no",
)

# `model_init` (not `model`) is passed so the Trainer can build a new model
# for every trial.
trainer = Trainer(
    model_init=model_init,
    args=search_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Run hyperparameter search
# `compute_objective` pins the optimised quantity to the validation F1
# ("f1" from compute_metrics, reported by the Trainer as "eval_f1").
# Without it the Trainer's default objective combines all reported metrics
# (accuracy, precision, recall and F1) instead of F1 alone.
best_run = trainer.hyperparameter_search(
    direction="maximize",  # maximize F1
    hp_space=optuna_hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    n_trials=10,           # or increase to 20–50 for more thorough tuning
    backend="optuna"
)

# View best run
print("Best trial:")
print(best_run)
