In [1]:
!pip install transformers torch yfinance -q

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

mega_df = pd.read_csv('/content/drive/MyDrive/mega_articles.csv')
print("Shape:", mega_df.shape)
mega_df.head()

Shape: (272, 2)


Unnamed: 0,SQLDATE,mega_article
0,2025-01-01,dead tesla cybertruck explodes outside trump h...
1,2025-01-02,cybertruck explosion suspect matthew livelsber...
2,2025-01-03,las vegas soldier’s bomb tesla outside trump h...
3,2025-01-04,lvpd identify driver cybertruck explosion hot ...
4,2025-01-05,tesla sales dropped first annual decline dozen...


In [4]:
import yfinance as yf

# Convert date column
mega_df['SQLDATE'] = pd.to_datetime(mega_df['SQLDATE'])

# Download Tesla stock data
start_date = mega_df['SQLDATE'].min()
end_date = mega_df['SQLDATE'].max() + pd.Timedelta(days=1)

tesla = yf.download('TSLA', start=start_date, end=end_date)
tesla = tesla.reset_index()
tesla.columns = tesla.columns.get_level_values(0)

# Calculate daily % change
tesla['pct_change'] = tesla['Close'].pct_change() * 100

# Create labels: negative (<-2%), neutral (-2% to +2%), positive (>+2%)
tesla['label'] = 'neutral'
tesla.loc[tesla['pct_change'] > 2, 'label'] = 'positive'
tesla.loc[tesla['pct_change'] < -2, 'label'] = 'negative'

# Merge with mega articles
tesla = tesla.rename(columns={'Date': 'SQLDATE'})
data = pd.merge(mega_df, tesla[['SQLDATE', 'label']], on='SQLDATE', how='inner')

print("Shape after merge:", data.shape)
print(data['label'].value_counts())

  tesla = yf.download('TSLA', start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed

Shape after merge: (188, 3)
label
neutral     81
positive    58
negative    49
Name: count, dtype: int64





In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load FinBERT
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Device:", device)
print("FinBERT loaded")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device: cuda
FinBERT loaded


In [6]:
from tqdm import tqdm

def get_finbert_sentiment_chunked(text, max_length=512):
    """Process text in chunks and average sentiment"""
    if not isinstance(text, str) or len(text.strip()) == 0:
        return 0.0

    # Split into chunks of ~400 words
    words = text.split()
    chunk_size = 400
    chunks = [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

    scores = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
        score = probs[0] - probs[1]
        scores.append(score)

    return np.mean(scores)

# Run with chunking
finbert_scores_chunked = []
for text in tqdm(data['mega_article']):
    score = get_finbert_sentiment_chunked(text)
    finbert_scores_chunked.append(score)

data['finbert_sentiment'] = finbert_scores_chunked
print("Done!")

100%|██████████| 188/188 [15:39<00:00,  5.00s/it]

Done!





In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
X = data[['finbert_sentiment']]
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

    negative       0.33      0.50      0.40        10
     neutral       0.59      0.81      0.68        16
    positive       0.00      0.00      0.00        12

    accuracy                           0.47        38
   macro avg       0.31      0.44      0.36        38
weighted avg       0.34      0.47      0.39        38

Confusion Matrix:
[[ 5  4  1]
 [ 3 13  0]
 [ 7  5  0]]


In [9]:
print(data['finbert_sentiment'].describe())
print("\nBy label:")
print(data.groupby('label')['finbert_sentiment'].mean())

count    188.000000
mean      -0.060936
std        0.040554
min       -0.162072
25%       -0.089797
50%       -0.058342
75%       -0.033889
max        0.030266
Name: finbert_sentiment, dtype: float64

By label:
label
negative   -0.071557
neutral    -0.052171
positive   -0.064204
Name: finbert_sentiment, dtype: float64


In [10]:
test_texts = [
    "Tesla stock surged 10% after record profits",
    "Tesla crashed and lost billions",
    "Tesla held steady today"
]

for text in test_texts:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
    print(f"Text: {text}")
    print(f"Probs: {probs}")
    print(f"Labels: {model.config.id2label}")
    print()

Text: Tesla stock surged 10% after record profits
Probs: [0.94773555 0.0201435  0.03212105]
Labels: {0: 'positive', 1: 'negative', 2: 'neutral'}

Text: Tesla crashed and lost billions
Probs: [0.02868038 0.5299014  0.44141823]
Labels: {0: 'positive', 1: 'negative', 2: 'neutral'}

Text: Tesla held steady today
Probs: [0.56496143 0.1477102  0.28732833]
Labels: {0: 'positive', 1: 'negative', 2: 'neutral'}

