In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from collections import Counter

In [2]:
df = pd.read_csv("tableau_ready_sentiment_data.csv")
df = df[['tweet_body', 'pct_change']].dropna()
df['tokens'] = df['tweet_body'].apply(lambda x: x.lower().split())
df.head()

Unnamed: 0,tweet_body,pct_change,tokens
0,Yup,0.118376,[yup]
1,Massive public manipulation,0.118376,"[massive, public, manipulation]"
2,🤣🤣,-5.3476,[🤣🤣]
3,Prescient,-1.342455,[prescient]
4,Congratulations Tesla team on a great year!!,-0.663789,"[congratulations, tesla, team, on, a, great, y..."


In [3]:
# Flatten all tokens and build vocab
all_tokens = [token for row in df['tokens'] for token in row]
token_counts = Counter(all_tokens)

vocab = {token: idx + 2 for idx, (token, _) in enumerate(token_counts.items())}  # start from 2
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

In [4]:
MAX_LEN = 32

def encode(tokens):
    encoded = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    padded = encoded[:MAX_LEN] + [vocab["<PAD>"]] * (MAX_LEN - len(encoded))
    return padded

df['encoded'] = df['tokens'].apply(encode)

In [5]:
class TweetDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = torch.tensor(inputs.tolist(), dtype=torch.long)
        self.targets = torch.tensor(targets.values, dtype=torch.float32)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

X_train, X_test, y_train, y_test = train_test_split(df['encoded'], df['pct_change'], test_size=0.2, random_state=42)

train_dataset = TweetDataset(X_train, y_train)
test_dataset = TweetDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [6]:
class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128):
        super(SimpleLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        embeds = self.embedding(x)
        _, (hidden, _) = self.lstm(embeds)
        return self.fc(hidden[-1]).squeeze()

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleLSTM(vocab_size=len(vocab)).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

def train_epoch(model, data_loader):
    model.train()
    total_loss = 0
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

In [8]:
for epoch in range(5):
    loss = train_epoch(model, train_loader)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

Epoch 1, Loss: 9.0411
Epoch 2, Loss: 8.9479
Epoch 3, Loss: 8.8027
Epoch 4, Loss: 8.5823
Epoch 5, Loss: 8.3762


In [9]:
model.eval()
preds, actuals = [], []
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        preds.extend(outputs.cpu().numpy())
        actuals.extend(targets.cpu().numpy())

mse = mean_squared_error(actuals, preds)
r2 = r2_score(actuals, preds)
print(f"MSE: {mse:.4f}, R²: {r2:.4f}")

MSE: 8.8625, R²: -0.0034


R²: -0.0034 → the model is performing very slightly worse than a simple mean-based baseline (i.e., always predicting the average value).