# Financial PhraseBank — Dataset Exploration

Quick look at the dataset before we start fine-tuning anything. We use the `sentences_allagree` subset (configuration), since it contains only the sentences where all annotators agreed on the label.

In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset

In [None]:
# Fetch the "sentences_allagree" configuration of Financial PhraseBank.
# trust_remote_code=True is passed through to load_dataset unchanged.
ds = load_dataset(
    "financial_phrasebank",
    "sentences_allagree",
    trust_remote_code=True,
)

# only has a train split
data = ds["train"]

# Basic sanity check: row count and column schema.
n_samples = len(data)
print(f"Number of samples: {n_samples}")
print(f"Features: {data.features}")

OK, so ~2264 samples total. That's pretty small for fine-tuning, but it should be enough for QLoRA since we're not updating that many parameters.

In [None]:
# Integer label id -> human-readable name (0=negative, 1=neutral, 2=positive).
label_map = {
    0: "negative",
    1: "neutral",
    2: "positive",
}

# Tally how many samples carry each label id.
label_counts = Counter(data["label"])

# Print one aligned row per label (ascending id): name, count, share of dataset.
for label_id in sorted(label_counts):
    count = label_counts[label_id]
    pct = count / len(data) * 100
    print(f"{label_map[label_id]:>10}: {count:>5}  ({pct:.1f}%)")

In [None]:
# Bar chart of the label distribution, bars ordered by ascending label id.
ordered_ids = sorted(label_counts.keys())
names = [label_map[label_id] for label_id in ordered_ids]
counts = [label_counts[label_id] for label_id in ordered_ids]

# Colours are assigned by position (red, grey, green).
# NOTE(review): this lines up with negative/neutral/positive only if all
# three labels are present in label_counts — confirm.
colors = ["#e74c3c", "#95a5a6", "#2ecc71"]

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(names, counts, color=colors, edgecolor="black", linewidth=0.5)
ax.set_title("Label Distribution (sentences_allagree)")
ax.set_ylabel("Count")

# Write each bar's count just above its top.
for position, value in enumerate(counts):
    ax.text(position, value + 15, str(value), ha="center", fontsize=10)

fig.tight_layout()
plt.show()

Super imbalanced — neutral dominates and negative is tiny. We'll need to account for this during training (e.g. class weights or oversampling).

In [None]:
# Show the first three sentences of every class, each cut to 120 characters.
all_sentences = data["sentence"]
all_labels = data["label"]

for label_id, label_name in sorted(label_map.items()):
    print(f"\n--- {label_name.upper()} ---")

    # Keep dataset order; take the first three sentences with this label.
    matching = [
        text for text, lab in zip(all_sentences, all_labels) if lab == label_id
    ]
    examples = matching[:3]

    for ex in examples:
        print(f"  • {ex[:120]}")

In [None]:
# Sentence length statistics. "Words" are whitespace-separated chunks,
# "chars" is the raw string length — no real tokenisation here.
all_sentences = data["sentence"]
word_counts = [len(text.split()) for text in all_sentences]
char_counts = list(map(len, all_sentences))

word_summary = (
    f"Word counts — mean: {np.mean(word_counts):.1f}, median: {np.median(word_counts):.0f}, "
    f"min: {min(word_counts)}, max: {max(word_counts)}"
)
char_summary = f"Char counts — mean: {np.mean(char_counts):.1f}, median: {np.median(char_counts):.0f}"

print(word_summary)
print(char_summary)

In [None]:
# Side-by-side histograms of sentence length in words (left) and characters (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One row per panel: (values, bar colour, title, x-axis label).
panels = [
    (word_counts, "steelblue", "Word Count Distribution", "# words"),
    (char_counts, "coral", "Character Count Distribution", "# characters"),
]

for ax, (values, bar_color, title, x_label) in zip(axes, panels):
    ax.hist(values, bins=30, edgecolor="black", alpha=0.7, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel(x_label)

# Only the left panel carries the y-axis label.
axes[0].set_ylabel("Frequency")

plt.tight_layout()
plt.show()

Most sentences are pretty short, roughly 15-30 words. Good — we won't need huge context windows for this.

In [None]:
# Most frequent words per class, after dropping stopwords and very short tokens.
import re
from collections import defaultdict

# Small hand-rolled stopword list (common function words plus "said", "s", ...).
stopwords = {
    "the", "a", "an", "in", "of", "to", "and", "for", "is", "was",
    "its", "it", "on", "by", "with", "from", "at", "as", "has", "had",
    "that", "this", "are", "were", "be", "been", "will", "or", "which",
    "also", "than", "have", "not", "but", "s", "said", "would", "their",
    "about",
}

# Tokens are runs of lowercase ASCII letters; digits and punctuation are dropped.
word_pattern = re.compile(r"\b[a-z]+\b")

# label id -> flat list of every kept token seen in that class.
class_words = defaultdict(list)
for sentence, label in zip(data["sentence"], data["label"]):
    bucket = class_words[label]
    for token in word_pattern.findall(sentence.lower()):
        # Keep tokens longer than two characters that are not stopwords.
        if len(token) > 2 and token not in stopwords:
            bucket.append(token)

# Report the 15 most common tokens for each class, in ascending label id order.
for label_id, label_name in sorted(label_map.items()):
    top = Counter(class_words[label_id]).most_common(15)
    print(f"\n{label_name.upper()} — top 15 words:")
    formatted = [f"{w} ({c})" for w, c in top]
    print(", ".join(formatted))

In [None]:
# quick keyword pattern check
#
# For every sentiment class, report the share of that class's sentences that
# contain each hand-picked keyword. Only the keyword's own class is checked;
# this cell does not compare a keyword's frequency across classes.
keywords = {
    "positive": ["profit", "growth", "increased", "rose", "improved", "gains"],
    "negative": ["loss", "declined", "fell", "dropped", "decreased", "lower"],
    "neutral":  ["reported", "announced", "according", "expects", "company", "shares"]
}

# Invert label_map once (name -> id) instead of rebuilding it per sentiment.
name_to_id = {name: label_id for label_id, name in label_map.items()}

print("Keyword hit rates per class:\n")
for sentiment, kws in keywords.items():
    label_id = name_to_id[sentiment]
    sents = [s.lower() for s, l in zip(data["sentence"], data["label"]) if l == label_id]
    total = len(sents)
    for kw in kws:
        # Match whole words only. A plain substring test counts "gains" inside
        # "against", "lower" inside "followers", "company" inside "accompany".
        # Trade-off: inflected forms (e.g. "profits") are not counted unless
        # they are listed as keywords themselves.
        kw_pattern = re.compile(rf"\b{re.escape(kw)}\b")
        hits = sum(1 for s in sents if kw_pattern.search(s))
        # Guard against a class with no sentences (avoids ZeroDivisionError).
        rate = hits / total * 100 if total else 0.0
        print(f"  {sentiment:>8} | '{kw}': {hits}/{total} ({rate:.1f}%)")
    print()

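Makes sense that words like "profit" and "growth" show up in the positive class and "loss"/"declined" in the negative class, while the neutral class is more about reporting language ("announced", "reported"). Note that the check above only measures each keyword within its own class, so it shows these words are common there, not that they are exclusive to it. These patterns are actually pretty strong, which helps explain why even bag-of-words models can get decent accuracy on this.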

## Takeaways

So the dataset is small (~2264 samples) but seems well-curated — the `sentences_allagree` subset means all annotators agreed, so the labels should be clean. The class imbalance is real though, especially the tiny negative set. The keyword patterns are pretty clear, which is why even simple models do OK on this. Let's see if QLoRA can push it further by understanding the actual financial context beyond just keywords.