In [1]:
# Install dependencies (uncomment if needed)
# !pip install transformers datasets torch sklearn pandas numpy matplotlib

import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("tableau_ready_sentiment_data.csv")
df = df[['tweet_body', 'pct_change']].dropna()
df.head()


Unnamed: 0,tweet_body,pct_change
0,Yup,0.118376
1,Massive public manipulation,0.118376
2,🤣🤣,-5.3476
3,Prescient,-1.342455
4,Congratulations Tesla team on a great year!!,-0.663789


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4709 entries, 0 to 4708
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   tweet_body  4709 non-null   object 
 1   pct_change  4709 non-null   float64
dtypes: float64(1), object(1)
memory usage: 73.7+ KB


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(texts, max_len=64):
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )


In [4]:
class TweetDataset(Dataset):
    def __init__(self, texts, targets):
        self.encodings = tokenize_function(texts)
        self.targets = torch.tensor(targets.values, dtype=torch.float32)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['target'] = self.targets[idx]
        return item


In [7]:
# Make sure you're passing list of strings, not a Series object
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['tweet_body'].tolist(),  # Convert to list of strings
    df['pct_change'],
    test_size=0.2,
    random_state=42
)

train_dataset = TweetDataset(train_texts, train_labels)
test_dataset = TweetDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [8]:
class LSTMRegressor(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128):
        super(LSTMRegressor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return out.squeeze()


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMRegressor(len(vocab)).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

def train_lstm_epoch(model, data_loader):
    model.train()
    total_loss = 0
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)


NameError: name 'vocab' is not defined