In [None]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN") # Replace HF_TOKEN with your secret name
login(token=hf_token)

In [None]:
%pip install --upgrade transformers

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import os
import wandb
%wandb login 46c9dd6a8552e990c5901e2d4c0425f5b9e11232
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
MODEL_NAME = "yiyanghkust/finbert-tone"
DATA_PATH = "/kaggle/working/stock-market-predictive-analysis/data/news_sentiment/labeled_news.csv"  # path to your CSV
OUTPUT_DIR = "./finbert_finetuned"
NUM_EPOCHS = 10
BATCH_SIZE = 8
LR = 1e-5
MAX_LEN = 128
PROJECT_NAME = "finbert-finetune"  # wandb project name

In [None]:
wandb.init(project=PROJECT_NAME, name="finbert-local-kaggle", config={
    "epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LR,
    "max_length": MAX_LEN
})

In [None]:
wandb.init(project=PROJECT_NAME, name="finbert-local-kaggle", config={
    "epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LR,
    "max_length": MAX_LEN
})

In [None]:
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["News", "PrevClose_Label"])

# Label mapping
label_map = {label: i for i, label in enumerate(df["PrevClose_Label"].unique())}
num_labels = len(label_map)
df["label"] = df["PrevClose_Label"].map(label_map)

# Train/test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)


def tokenize(batch):
    return tokenizer(batch["News"], padding="max_length", truncation=True, max_length=MAX_LEN)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# Remove unused columns
remove_cols = ["Company", "Date", "News", "Intraday_Label", "Intraday_Change(%)"]
train_ds = train_ds.remove_columns([col for col in remove_cols if col in train_ds.column_names])
test_ds = test_ds.remove_columns([col for col in remove_cols if col in test_ds.column_names])

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=1,
    seed=42,
    report_to="none",
    run_name="finbert-local-kaggle",  # appears in wandb dashboard
    load_best_model_at_end=True,  # keeps best checkpoint by eval metric
    metric_for_best_model="f1",   # use F1 to pick the best
    greater_is_better=True,
)

In [None]:
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids

    # Identify the label ID for "neutral" (case-insensitive)
    neutral_label_id = None
    for key in label_map.keys():
        if key.lower() == "neutral":
            neutral_label_id = label_map[key]
            break
    
    if neutral_label_id is not None:
        # Ignore neutral predictions and ground truths
        mask = (labels != neutral_label_id) & (preds != neutral_label_id)
        # mask = labels != neutral_label_id

        if np.sum(mask) == 0:
            return {"accuracy": 1.0, "f1": 1.0}
        preds = preds[mask]
        labels = labels[mask]

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [6]:
# -------------------- FINBERT FINE-TUNING (KAGGLE LOCAL + WANDB) --------------------
# ---------------- CONFIG ----------------
# ----------------------------------------
# ---------------- INIT WANDB ----------------
# ---------------- LOAD DATA ----------------
# ---------------- TOKENIZER ----------------
# ---------------- TRAINING SETUP ----------------
# ---------------- METRICS ----------------
# ---------------- TRAINER ----------------
# ---------------- TRAIN ----------------
trainer.train()
# ---------------- SAVE MODEL ----------------

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

wandb.finish()

print("Training complete. Model saved to:", OUTPUT_DIR)
print("Label mapping:", label_map)

trainer.evaluate()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.033388,0.717105,0.616281
2,1.194500,0.992197,0.739437,0.721963
3,0.884300,1.014713,0.726027,0.701394
4,0.741800,1.113563,0.744828,0.721099
5,0.633000,1.153524,0.715278,0.709407
6,0.463200,1.189672,0.708333,0.701603
7,0.415900,1.265786,0.696552,0.683073
8,0.319100,1.284147,0.734375,0.719016
9,0.270400,1.319678,0.710145,0.701538
10,0.258700,1.333564,0.707143,0.697611




Training complete. Model saved to: ./finbert_finetuned
Label mapping: {'Negative': 0, 'Positive': 1, 'Neutral': 2}




{'eval_loss': 0.9921973943710327,
 'eval_accuracy': 0.7394366197183099,
 'eval_f1': 0.7219629767047157,
 'eval_runtime': 1.1332,
 'eval_samples_per_second': 165.907,
 'eval_steps_per_second': 10.59,
 'epoch': 10.0}