In [1]:
%pip install transformers datasets torch scikit-learn pandas

Collecting transformers
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Using cached datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached pyyaml-6.0.3-cp313-cp313-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting httpx<1.0.0 (from datasets)
  Using cached httpx-0.28.1-py3-none-any.whl.metad

In [4]:
import pandas as pd

# Load the analyst-ratings headline dataset.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../archive (3)/analyst_ratings_processed.csv')

# Quick overview: row count, a small preview, and the available columns.
print(f"✅ Loaded {len(df)} rows.")
print("\nFirst 3 rows:")
print(df.head(3))

print("\nColumn Names:")
print(list(df.columns))

# Optional label overview: only runs when a 'stock_rating' column exists, so the
# cell does not fail on files that lack it.
# (Update 'stock_rating' below if the column list above shows a different name.)
if 'stock_rating' in df.columns:
    print("\nUnique Ratings found:")
    print(df['stock_rating'].value_counts().head(10))

✅ Loaded 1400469 rows.

First 3 rows:
   Unnamed: 0                                       title  \
0         0.0     Stocks That Hit 52-Week Highs On Friday   
1         1.0  Stocks That Hit 52-Week Highs On Wednesday   
2         2.0               71 Biggest Movers From Friday   

                        date stock  
0  2020-06-05 10:30:00-04:00     A  
1  2020-06-03 10:45:00-04:00     A  
2  2020-05-26 04:30:00-04:00     A  

Column Names:
['Unnamed: 0', 'title', 'date', 'stock']


In [5]:
import torch
import pandas as pd
import plotly.express as px
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm

# 1. Setup device: use the GPU when one is available (much faster), else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 2. Load the pre-trained FinBERT tokenizer and classifier, and switch the
#    model to evaluation mode for inference.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert').to(device)
model.eval()

# Resolve the class indices from the checkpoint's own config instead of
# hard-coding them, so the score stays correct even if the label order differs
# from the expected positive/negative/neutral layout. A missing label raises a
# KeyError here rather than silently producing a wrong signal.
label_to_index = {
    str(label).lower(): int(index)
    for index, label in model.config.id2label.items()
}
positive_idx = label_to_index['positive']
negative_idx = label_to_index['negative']

# 3. Filter data for a specific stock (the dataset is keyed by ticker).
#    Rows without a title are dropped first because the tokenizer cannot
#    process missing values; the sample is then capped at 100 for a fast demo.
target_stock = 'AAPL'
df_stock = (
    df[df['stock'] == target_stock]
    .dropna(subset=['title'])
    .head(100)
    .copy()
)

print(f"Analyzing {len(df_stock)} headlines for {target_stock}...")

# 4. The inference loop: score each headline individually.
predictions = []

for title in tqdm(df_stock['title']):
    # Tokenize a single headline (truncated to at most 64 tokens).
    inputs = tokenizer(title, return_tensors="pt", truncation=True, padding=True, max_length=64).to(device)

    # Predict without tracking gradients; softmax turns logits into class probabilities.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Sentiment score = P(positive) - P(negative).
    # Score > 0 = bullish, score < 0 = bearish.
    score = probs[0][positive_idx].item() - probs[0][negative_idx].item()
    predictions.append(score)

df_stock['sentiment_score'] = predictions

# 5. Visualize the "Alpha Signal": one bar per headline, coloured by its score.
fig = px.bar(df_stock, x='date', y='sentiment_score',
             color='sentiment_score',
             color_continuous_scale=['red', 'gray', 'green'],
             hover_data=['title'],
             title=f'AI-Generated Sentiment Signal for {target_stock} (FinBERT)')
fig.show()

Using device: cpu


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Analyzing 100 headlines for AAPL...


 75%|███████▌  | 75/100 [00:01<00:00, 52.50it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
100%|██████████| 100/100 [00:01<00:00, 51.08it/s]


In [6]:
# Quick sanity check: print the score and a coarse label for the first few headlines.
print(f"Checking first 5 headlines for {target_stock}...\n")

sample = df_stock.head(5)
for title, score in zip(sample['title'], sample['sentiment_score']):
    # Interpret the score: anything within ±0.1 of zero is treated as neutral.
    if score > 0.1:
        sentiment = "🟢 BULLISH"
    elif score < -0.1:
        sentiment = "🔴 BEARISH"
    else:
        sentiment = "⚪ NEUTRAL"

    print(f"📰 {title}")
    print(f"   ↳ {sentiment} (Score: {score:.4f})\n")

Checking first 5 headlines for AAPL...

📰 Tech Stocks And FAANGS Strong Again To Start Day As Market Awaits Fed
   ↳ 🟢 BULLISH (Score: 0.8182)

📰 10 Biggest Price Target Changes For Wednesday
   ↳ 🔴 BEARISH (Score: -0.1148)

📰 Benzinga Pro's Top 5 Stocks To Watch For Wed., Jun. 10, 2020: AAPL, BAC, NIO, SONO, GLW
   ↳ ⚪ NEUTRAL (Score: 0.0361)

📰 Deutsche Bank Maintains Buy on Apple, Raises Price Target to $350
   ↳ 🟢 BULLISH (Score: 0.7173)

📰 Apple To Let Users Trade In Their Mac Computers For Credit At US, Canada Stores: Report
   ↳ ⚪ NEUTRAL (Score: 0.0229)



In [7]:
# Save the analyzed data. The file name is derived from target_stock so it
# always matches the ticker that was actually scored
# (for 'AAPL' this is 'aapl_sentiment_analysis.csv').
output_path = f"{target_stock.lower()}_sentiment_analysis.csv"
df_stock.to_csv(output_path, index=False)
print(f"✅ Saved analysis to {output_path}")

✅ Saved analysis to aapl_sentiment_analysis.csv
