# Financial PhraseBank - Exploratory Data Analysis


In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Fetch the 'sentences_allagree' configuration of Financial PhraseBank and keep
# a handle on its 'train' split, which the analysis cells below read from.
ds = load_dataset(
    'financial_phrasebank',
    'sentences_allagree',
    trust_remote_code=True,
)
train = ds['train']
print(f'Total samples: {len(train)}')

In [None]:
# Label distribution
labels = train['label']

# Take the id -> name mapping from the dataset's own ClassLabel feature rather
# than hard-coding it. For financial_phrasebank the order is
# negative (0), neutral (1), positive (2); the previous hand-written mapping
# (0 -> positive, 1 -> negative, 2 -> neutral) put the wrong name on every bar.
class_names = train.features['label'].names
label_names = dict(enumerate(class_names))
names = [label_names[i] for i in range(len(class_names))]

# Count every class in a single pass; minlength keeps a zero-height bar for a
# class that happens to be absent instead of silently shortening the list.
counts = np.bincount(list(labels), minlength=len(names)).tolist()

# Key the colours by class name so they stay attached to the right bar
# regardless of the label id order (grey fallback for any unexpected class).
palette = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#95a5a6'}
bar_colors = [palette.get(name, '#95a5a6') for name in names]

plt.figure(figsize=(8, 5))
plt.bar(names, counts, color=bar_colors)
plt.title('Label Distribution')
plt.ylabel('Count')
# Save before show(): show() may release the figure, leaving nothing to write.
plt.savefig('label_dist.png', dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Text length distribution
# A sentence's length is its number of whitespace-separated tokens.
lengths = [len(tokens) for tokens in map(str.split, train['sentence'])]

# Summary statistics, one per line.
print(
    f'Avg words: {np.mean(lengths):.1f}',
    f'Max words: {max(lengths)}',
    f'Min words: {min(lengths)}',
    sep='\n',
)

# Histogram of the word counts.
plt.figure(figsize=(8, 5))
plt.hist(lengths, bins=30, alpha=0.7, edgecolor='black')
plt.xlabel('Number of words')
plt.ylabel('Frequency')
plt.title('Text Length Distribution (words)')
plt.show()

## Takeaways
- 2,264 samples in total (the `sentences_allagree` subset)
- Class imbalance: neutral sentences far outnumber negative ones
- Most sentences are short (roughly 10–30 words), so they fit comfortably within an LLM context window
- Consider class weights or oversampling to compensate for the under-represented negative class
