# Evaluating model performance

## Load Data

We recreate the same train/test split as before (20% held out for testing, `seed=42`) and inspect the model's predictions on the test set to understand where it is getting confused.

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:


import random
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from evaluate import load

# Reproducibility: a single named seed shared by every random number generator
# seeded here (Python's `random`, NumPy, PyTorch, and PyTorch's CUDA generators).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [None]:
from google.colab import drive

# Directory on Google Drive from which the tokenizer and model are loaded below.
model_path = "/content/drive/MyDrive/finbert_5class_model_v4"

# Attach Google Drive to the Colab filesystem so `model_path` becomes readable.
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the tokenizer and the sequence-classification model saved under `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move the weights to `device` and switch to evaluation mode in one chain
# (both calls return the module itself).
model = (
    AutoModelForSequenceClassification
    .from_pretrained(model_path)
    .to(device)
    .eval()
)

print("Model loaded. id2label mapping:", model.config.id2label)


Model loaded. id2label mapping: {0: 'moderate_negative', 1: 'moderate_positive', 2: 'negative', 3: 'neutral', 4: 'positive'}


In [None]:
# Load the "train" split of the FinGPT sentiment dataset.
ds = load_dataset(path="FinGPT/fingpt-sentiment-train", split="train")

# Raw dataset labels grouped by the 5-class label each one collapses into.
_RAW_LABELS_BY_CLASS = {
    "negative": ("strong negative", "negative"),
    "moderate_negative": ("moderately negative", "mildly negative"),
    "neutral": ("neutral",),
    "moderate_positive": ("mildly positive", "moderately positive"),
    "positive": ("positive", "strong positive"),
}

# Flat lookup: raw label -> 5-class label.
mapping_5 = {
    raw_label: class_name
    for class_name, raw_labels in _RAW_LABELS_BY_CLASS.items()
    for raw_label in raw_labels
}


def map_to_5(example):
    """Attach the 5-class label to a single dataset example.

    Reads the raw label from ``example["output"]``, looks it up in
    ``mapping_5`` and stores the result under ``example["label_5"]``.

    Args:
        example: Mapping with an ``"output"`` key holding the raw label.

    Returns:
        The same ``example`` object, updated in place.

    Raises:
        KeyError: If the raw label is not a key of ``mapping_5``.
    """
    raw_label = example["output"]
    example["label_5"] = mapping_5[raw_label]
    return example

# Add the 5-class string label (`label_5`) to every example.
ds = ds.map(map_to_5)

# Reuse the label tables stored in the loaded model's config so the integer ids
# assigned here are the ones the model itself uses.
label2id = model.config.label2id
id2label = model.config.id2label


def encode_label(example):
    """Store the integer class id for ``example["label_5"]`` under ``example["label"]``.

    The id comes from ``label2id``. The example is updated in place and
    returned; a label missing from ``label2id`` raises ``KeyError``.
    """
    class_name = example["label_5"]
    example["label"] = label2id[class_name]
    return example


# Add the integer `label` column to every example.
ds = ds.map(encode_label)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/529 [00:00<?, ?B/s]

data/train-00000-of-00001-dabab110260ac9(…):   0%|          | 0.00/6.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76772 [00:00<?, ? examples/s]

Map:   0%|          | 0/76772 [00:00<?, ? examples/s]

Map:   0%|          | 0/76772 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% of the examples as the test set, with a fixed seed so the split
# is reproducible.
# The first run rebinds `ds` to the dict of splits, which no longer offers
# `train_test_split`; the guard lets this cell be re-run without raising.
if hasattr(ds, "train_test_split"):
    ds = ds.train_test_split(test_size=0.2, seed=42)
test_ds = ds["test"]


In [None]:
def tokenize(batch):
    """Tokenize the ``"input"`` texts of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.
    Returns whatever the tokenizer call returns for the batch.
    """
    texts = batch["input"]
    return tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )


# Tokenize the test split in batches.
test_ds = test_ds.map(tokenize, batched=True)

# Expose only these columns, in torch format; "input" is kept so the original
# sentence can be shown next to each prediction.
test_ds.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label", "input"],
)


Map:   0%|          | 0/15355 [00:00<?, ? examples/s]

In [None]:
import inspect

# Dummy TrainingArguments just for prediction
training_args = TrainingArguments(
    output_dir="./tmp_trainer",
    per_device_eval_batch_size=32,
    seed=42,
    report_to="none"
)

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the old `tokenizer=` keyword. Pick whichever keyword the
# installed Trainer accepts, so this works on old and new versions without
# a deprecation warning.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    **{_tokenizer_kwarg: tokenizer},
)


  trainer = Trainer(


In [None]:
# Run the model over the test split; the result carries the model outputs
# (`predictions`) and the reference label ids (`label_ids`).
preds = trainer.predict(test_ds)

true_labels = preds.label_ids
# Predicted class = index of the largest score in each row.
pred_labels = np.argmax(preds.predictions, axis=-1)

# Convert to label names
true_labels_names = [id2label[class_id] for class_id in true_labels]
pred_labels_names = [id2label[class_id] for class_id in pred_labels]


In [None]:
import torch.nn.functional as F

# `preds.predictions` holds raw logits, one row per sample and one column per label.
logits = torch.tensor(preds.predictions)

# Normalise each row into a probability distribution over the labels and
# return to NumPy for use with pandas.
probs = F.softmax(logits, dim=-1).numpy()


In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# Class names in id order. Passing them as `labels=` pins each report row to a
# specific class. `target_names=` alone is matched by position against
# sklearn's own sorted label order, which would mislabel rows if that order
# ever differed from `id2label`.
label_order = [id2label[i] for i in range(len(id2label))]
print(classification_report(true_labels_names, pred_labels_names, labels=label_order))

# Build a DataFrame to inspect mistakes
df_results = pd.DataFrame({
    "sentence": test_ds["input"],
    "true_label": true_labels_names,
    "pred_label": pred_labels_names
})
df_results["correct"] = df_results["true_label"] == df_results["pred_label"]

# Keep only the rows where prediction and reference disagree.
mistakes = df_results[~df_results["correct"]]
print("Number of mistakes:", len(mistakes))
mistakes.head(10)


                   precision    recall  f1-score   support

moderate_negative       0.82      0.96      0.88       962
moderate_positive       0.84      0.92      0.88      1729
         negative       0.98      0.97      0.97      2432
          neutral       0.99      0.93      0.95      5880
         positive       0.98      0.98      0.98      4352

         accuracy                           0.95     15355
        macro avg       0.92      0.95      0.93     15355
     weighted avg       0.95      0.95      0.95     15355

Number of mistakes: 749


Unnamed: 0,sentence,true_label,pred_label,correct
44,"Candle Media founder and co-CEO Kevin Mayer, a...",neutral,moderate_positive,False
118,"Buyers Want Cleaner, Lighter Cars That Also Ma...",neutral,moderate_positive,False
150,"Compared with the FTSE 100 index , which rose ...",neutral,negative,False
179,We have well and truly arrived at the December...,neutral,moderate_positive,False
181,Shell to continue search in renewables after l...,neutral,positive,False
190,A look into Cathie Woods's portfolio shows som...,neutral,moderate_positive,False
193,German regulators are reportedly considering w...,neutral,moderate_negative,False
247,The stock has had a rough few months. Investor...,moderate_positive,moderate_negative,False
253,Brixmor 2020 FFO guidance comes in on the ligh...,negative,neutral,False
284,The software company says the deal “is fundame...,neutral,moderate_positive,False


In [None]:
# One probability column per class, named after the class and ordered by class id.
prob_columns = [f"prob_{id2label[class_id]}" for class_id in range(len(id2label))]

# Sentence, reference label and predicted label for every test sample.
df_results = pd.DataFrame({
    "sentence": test_ds["input"],
    "true_label": true_labels_names,
    "pred_label": pred_labels_names,
})
df_results["correct"] = df_results["true_label"] == df_results["pred_label"]

# Append the per-class probabilities; column k of `probs` belongs to class id k.
df_results = df_results.assign(
    **{name: probs[:, class_id] for class_id, name in enumerate(prob_columns)}
)

# Rows where the prediction differs from the reference label.
mistakes = df_results[~df_results["correct"]]

print("Total test samples:", len(df_results))
print("Number of mistakes:", len(mistakes))
print("\nSample misclassifications with probabilities:")
mistakes.head(10)


Total test samples: 15355
Number of mistakes: 749

Sample misclassifications with probabilities:


Unnamed: 0,sentence,true_label,pred_label,correct,prob_moderate_negative,prob_moderate_positive,prob_negative,prob_neutral,prob_positive
44,"Candle Media founder and co-CEO Kevin Mayer, a...",neutral,moderate_positive,False,0.015049,0.766645,0.002867,0.213238,0.002201
118,"Buyers Want Cleaner, Lighter Cars That Also Ma...",neutral,moderate_positive,False,0.009417,0.556857,0.011322,0.391937,0.030467
150,"Compared with the FTSE 100 index , which rose ...",neutral,negative,False,0.007076,0.000898,0.977269,0.012671,0.002086
179,We have well and truly arrived at the December...,neutral,moderate_positive,False,0.010845,0.785546,0.003435,0.197411,0.002763
181,Shell to continue search in renewables after l...,neutral,positive,False,0.000309,0.000958,0.101741,0.39165,0.505343
190,A look into Cathie Woods's portfolio shows som...,neutral,moderate_positive,False,0.006556,0.898555,0.003585,0.08758,0.003723
193,German regulators are reportedly considering w...,neutral,moderate_negative,False,0.7669,0.081769,0.009449,0.141188,0.000694
247,The stock has had a rough few months. Investor...,moderate_positive,moderate_negative,False,0.664836,0.301821,0.015439,0.016265,0.00164
253,Brixmor 2020 FFO guidance comes in on the ligh...,negative,neutral,False,0.001143,0.00337,0.011875,0.820372,0.163242
284,The software company says the deal “is fundame...,neutral,moderate_positive,False,0.012922,0.916081,0.003445,0.017693,0.04986


In [None]:
# Sentiment classes ordered from most negative to most positive; a class's
# rank on the ordinal scale is its position in this list.
sentiment_scale = [
    "negative",
    "moderate_negative",
    "neutral",
    "moderate_positive",
    "positive",
]
ordinal_map = {label: rank for rank, label in enumerate(sentiment_scale)}

# Ranks of the reference and predicted labels.
df_results["true_ord"] = df_results["true_label"].map(ordinal_map)
df_results["pred_ord"] = df_results["pred_label"].map(ordinal_map)

# Ordinal distance: how many steps apart the two labels are on the scale.
df_results["error_distance"] = df_results["true_ord"].sub(df_results["pred_ord"]).abs()


In [None]:

# Columns to display: the text, both labels, the distance, then all class probabilities.
cols_to_show = ["sentence", "true_label", "pred_label", "error_distance", *prob_columns]

# A distance of 4 is the largest possible on the 0-4 ordinal scale, i.e.
# "negative" and "positive" confused with each other.
extreme_errors = df_results.loc[df_results["error_distance"].eq(4)]

extreme_errors[cols_to_show].head(15)


Unnamed: 0,sentence,true_label,pred_label,error_distance,prob_moderate_negative,prob_moderate_positive,prob_negative,prob_neutral,prob_positive
462,Consumer Credit Growth Rebounds In October #ec...,positive,negative,4,0.001214,0.001517,0.764264,0.027833,0.205174
828,BREAKING: Mortgage forbearance requests jump n...,negative,positive,4,0.000947,0.001273,0.120688,0.096178,0.780914
2372,$ECONX: November Unemployment Rate 3.5% vs 3.6...,positive,negative,4,0.000942,0.001487,0.566866,0.087163,0.343543
2564,U.S. stocks are 📈 https://t.co/2cVbN4AIF4 http...,positive,negative,4,0.001256,0.001455,0.74177,0.014568,0.24095
2802,Estee Lauder Q2 EPS $1.52 vs. $1.55 a year go,negative,positive,4,0.006026,0.002943,0.450742,0.036591,0.503697
3528,Gold Mine Output Falls For First Time Since 20...,positive,negative,4,0.000442,0.000901,0.756781,0.200593,0.041283
4270,Hedge Funds Aren’t Crazy About Hutchison China...,negative,positive,4,0.005467,0.009523,0.026894,0.028481,0.929634
4896,"At the end of October, housing inventory natio...",positive,negative,4,0.000668,0.000933,0.804074,0.104222,0.090103
5274,From the new quarterly forecast: The estimate ...,positive,negative,4,0.000633,0.000649,0.907347,0.068599,0.022772
5774,Ray Dalio says the global economy is heading f...,negative,positive,4,0.000949,0.001644,0.211227,0.16772,0.618459


In [None]:
# Print the full sentence of the first ten extreme errors together with its
# reference and predicted label.
preview = extreme_errors.head(10)[["sentence", "true_label", "pred_label"]]
for sentence, true_label, pred_label in preview.itertuples(index=False, name=None):
    print(f"\nSentence: {sentence}")
    print(f"True: {true_label}    Predicted: {pred_label}")
    print("-" * 60)



Sentence: Consumer Credit Growth Rebounds In October #economy #MarketScreener https://t.co/Q24jOfYhMs https://t.co/QUq0lgYCaq
True: positive    Predicted: negative
------------------------------------------------------------

Sentence: BREAKING: Mortgage forbearance requests jump nearly 2,000% as borrowers seek relief during coronavirus outbreak https://t.co/5vbo5C2VC9
True: negative    Predicted: positive
------------------------------------------------------------

Sentence: $ECONX: November Unemployment Rate 3.5% vs 3.6% https://t.co/M9gpUZQoF7 consensus https://t.co/p5wq6wH9wr
True: positive    Predicted: negative
------------------------------------------------------------

Sentence: U.S. stocks are 📈 https://t.co/2cVbN4AIF4 https://t.co/iAQJgLcbR6
True: positive    Predicted: negative
------------------------------------------------------------

Sentence: Estee Lauder Q2 EPS $1.52 vs. $1.55 a year go
True: negative    Predicted: positive
-----------------------------------------