In [1]:
# Install dependencies (uncomment if needed)
# !pip install transformers datasets torch sklearn pandas numpy matplotlib

import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Read the exported sentiment table and keep just the two columns used below:
# the tweet text and the percent-change value that serves as the regression
# target. Rows missing either value are dropped.
df = (
    pd.read_csv("tableau_ready_sentiment_data.csv")
    .loc[:, ['tweet_body', 'pct_change']]
    .dropna()
)
df.head()


Unnamed: 0,tweet_body,pct_change
0,Yup,0.118376
1,Massive public manipulation,0.118376
2,🤣🤣,-5.3476
3,Prescient,-1.342455
4,Congratulations Tesla team on a great year!!,-0.663789


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(texts, max_len=64):
    """Encode ``texts`` with the module-level BERT ``tokenizer``.

    Args:
        texts: Text input, forwarded unchanged to the tokenizer.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer's output, with PyTorch tensors requested through
        ``return_tensors='pt'``.
    """
    encode_kwargs = {
        'padding': 'max_length',   # pad every row up to ``max_len``
        'truncation': True,        # and cut anything longer
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_kwargs)


In [6]:
class TweetDataset(Dataset):
    """Torch dataset pairing pre-tokenized texts with float regression targets.

    Every text is tokenized once, at construction time, through the
    module-level ``tokenize_function``; ``__getitem__`` only slices the
    cached tensors.
    """

    def __init__(self, texts, targets):
        """Tokenize ``texts`` and store ``targets`` as a float32 tensor.

        Args:
            texts: A single string or any iterable of strings (list, tuple,
                pandas Series, ...). Iterables are materialised into a plain
                list before being handed to ``tokenize_function``.
            targets: One numeric value per text. Anything ``np.asarray``
                understands is accepted (pandas Series, NumPy array, list).

        Raises:
            ValueError: If the number of texts and targets differ.
        """
        # A bare string must stay one sample; list("abc") would split it
        # into characters.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # np.asarray handles Series/ndarray/list alike, so callers are no
        # longer required to pass an object exposing ``.values``.
        target_array = np.atleast_1d(np.asarray(targets))

        # Fail early: a mismatch would otherwise surface later as an
        # IndexError or as silently misaligned text/target pairs.
        if len(texts) != len(target_array):
            raise ValueError(
                f"texts and targets must have the same length, "
                f"got {len(texts)} and {len(target_array)}"
            )

        self.encodings = tokenize_function(texts)
        self.targets = torch.tensor(target_array, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (one per target)."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return the encoded fields for sample ``idx`` plus its ``'target'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['target'] = self.targets[idx]
        return item


In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split itself repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['tweet_body'].tolist(),
    df['pct_change'],
    test_size=0.2,
    random_state=42,
)

# Each dataset tokenizes its texts once, when it is constructed.
train_dataset = TweetDataset(texts=train_texts, targets=train_labels)
test_dataset = TweetDataset(texts=test_texts, targets=test_labels)

# Only the training batches are shuffled; evaluation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


In [8]:
class BERTRegressor(nn.Module):
    """BERT encoder followed by dropout and a single-output linear head."""

    def __init__(self, model_name='bert-base-uncased', dropout=0.3):
        """Load the pretrained encoder and build the regression head.

        Args:
            model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            dropout: Dropout probability applied before the linear head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than a literal
        # 768, so other checkpoints work without editing this class.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one scalar prediction per row of the batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            A 1-D tensor with one entry per batch row.
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = output.pooler_output
        x = self.dropout(cls_output)
        # Drop only the trailing size-1 feature dim. A bare .squeeze() would
        # also remove the batch dim when the batch holds a single sample,
        # yielding a 0-dim tensor that cannot be iterated or extended.
        return self.linear(x).squeeze(-1)


In [9]:
# Seed torch's global RNG before anything random happens so the regression
# head's initial weights, the dropout masks and the DataLoader shuffle order
# are repeatable from a fresh kernel (GPU kernels may still introduce small
# nondeterminism). 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTRegressor().to(device)
# All parameters (encoder and head) are optimised with a small learning rate.
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.MSELoss()

def train_epoch(model, data_loader):
    """Run one optimisation pass over ``data_loader`` and return the mean loss.

    Relies on the module-level ``device``, ``optimizer`` and ``criterion``.

    Args:
        model: Network invoked as ``model(input_ids, attention_mask)``.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'target'`` tensors.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        # Move the three tensors this step needs onto the training device.
        ids, mask, expected = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'target')
        )

        optimizer.zero_grad()
        batch_loss = criterion(model(ids, mask), expected)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(data_loader)


In [12]:
from tqdm import tqdm  # fallback version

# Fine-tune for a fixed number of passes over the training set, showing a
# progress bar with the running mean loss for the current epoch.
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    epoch_iterator = tqdm(train_loader, desc="Training")
    model.train()
    total_loss = 0.0
    num_batches = 0  # stays 0 if the loader yields nothing
    # Count batches ourselves: tqdm only advances its public ``n`` counter
    # when it redraws the bar, so on fast iterations ``n`` lags behind the
    # real step count and a running mean based on it is inflated.
    for num_batches, batch in enumerate(epoch_iterator, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        epoch_iterator.set_postfix(loss=total_loss / num_batches)

    # Report NaN rather than dividing by zero on an empty loader.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Epoch {epoch+1} completed. Avg Loss: {avg_loss:.4f}")


Epoch 1/3


Training: 100%|███████████████████████████████████████████████████████████| 236/236 [07:59<00:00,  2.03s/it, loss=8.94]


Epoch 1 completed. Avg Loss: 8.9390
Epoch 2/3


Training: 100%|███████████████████████████████████████████████████████████| 236/236 [07:54<00:00,  2.01s/it, loss=8.28]


Epoch 2 completed. Avg Loss: 8.2758
Epoch 3/3


Training: 100%|███████████████████████████████████████████████████████████| 236/236 [07:57<00:00,  2.03s/it, loss=6.35]

Epoch 3 completed. Avg Loss: 6.3511





In [13]:
# Score the fine-tuned model on the held-out split.
model.eval()
predictions, true_vals = [], []
with torch.no_grad():  # inference only, no gradients needed
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Targets only feed the CPU-side metrics below, so they are not
        # moved to the device.
        targets = batch['target']

        outputs = model(input_ids, attention_mask)
        # Flatten before extending: if the final batch holds a single sample
        # the model may return a 0-dim tensor, and extend() on a 0-dim array
        # raises "iteration over a 0-d array".
        predictions.extend(outputs.reshape(-1).cpu().numpy())
        true_vals.extend(targets.reshape(-1).cpu().numpy())

mse = mean_squared_error(true_vals, predictions)
r2 = r2_score(true_vals, predictions)
print(f"MSE: {mse:.4f}, R²: {r2:.4f}")


MSE: 11.1412, R²: -0.2614


R²: -0.2614 → the model is actually doing worse than just predicting the average `pct_change` every time (a constant mean predictor scores R² = 0).