In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv("tableau_ready_sentiment_data.csv")  # Update the path if needed

# Show basic info and preview
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4709 entries, 0 to 4708
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tweet_id            4709 non-null   int64  
 1   date                4709 non-null   object 
 2   tweet_body          4709 non-null   object 
 3   roberta_sentiment   4709 non-null   object 
 4   roberta_pos_score   4709 non-null   float64
 5   roberta_neg_score   4709 non-null   float64
 6   roberta_neu_score   4709 non-null   float64
 7   sentiment_polarity  4709 non-null   float64
 8   like_count          4709 non-null   int64  
 9   retweet_count       4709 non-null   int64  
 10  engagement_score    4709 non-null   int64  
 11  Open                4709 non-null   float64
 12  Close               4709 non-null   float64
 13  pct_change          4709 non-null   float64
dtypes: float64(7), int64(4), object(3)
memory usage: 515.2+ KB
None


Unnamed: 0,tweet_id,date,tweet_body,roberta_sentiment,roberta_pos_score,roberta_neg_score,roberta_neu_score,sentiment_polarity,like_count,retweet_count,engagement_score,Open,Close,pct_change
0,1655978502187778073,2023-05-09,Yup,neutral,0.290412,0.215288,0.4943,0.075124,39533,3255,42788,168.949997,169.149994,0.118376
1,1655968899903418373,2023-05-09,Massive public manipulation,negative,0.009549,0.72385,0.266601,-0.714301,49528,9811,59339,168.949997,169.149994,0.118376
2,1646228474628280326,2023-04-12,🤣🤣,neutral,0.310404,0.235308,0.454288,0.075095,108462,10198,118660,190.740005,180.539993,-5.3476
3,1640171198091866114,2023-03-27,Prescient,neutral,0.204723,0.126566,0.668711,0.078156,56272,9193,65465,194.419998,191.809998,-1.342455
4,1742235895166652609,2024-01-02,Congratulations Tesla team on a great year!!,positive,0.991541,0.001338,0.007122,0.990203,67751,5222,72973,250.080002,248.419998,-0.663789


In [2]:
# Cell 2: Preprocess text and combine with numerical features

# Drop rows with missing tweet text (just in case)
df = df.dropna(subset=["tweet_body"])

# Convert tweet text into TF-IDF features (max 300 terms)
vectorizer = TfidfVectorizer(max_features=300)
X_text = vectorizer.fit_transform(df["tweet_body"]).toarray()

# Select numerical columns to include as features
numerical_cols = [
    "roberta_pos_score", 
    "roberta_neg_score", 
    "roberta_neu_score", 
    "sentiment_polarity", 
    "engagement_score"
]
X_num = df[numerical_cols].values

# Normalize the numerical features using StandardScaler
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

# Combine text features and numerical features into one array
X = np.hstack((X_text, X_num_scaled))


In [3]:
# Cell 3: Define target variables and split the dataset

# Define target for classification: sentiment labels
y_class = df["roberta_sentiment"]

# Define target for regression: percentage change in stock price
y_reg = df["pct_change"]

# Split data for classification (stratify to preserve class balance)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Split data for regression (no stratify needed)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)


In [4]:
# Cell 4: Install XGBoost 
!pip install xgboost



In [5]:
# Cell 5: Train and evaluate regression with XGBoost

from xgboost import XGBRegressor

# Initialize the XGBoost model with 100 trees and a learning rate of 0.1
xgb_reg = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model on the regression training data
xgb_reg.fit(X_train_reg, y_train_reg)

# Predict on the test set
y_pred_xgb = xgb_reg.predict(X_test_reg)

# Calculate evaluation metrics
mae_xgb = mean_absolute_error(y_test_reg, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test_reg, y_pred_xgb))
r2_xgb = r2_score(y_test_reg, y_pred_xgb)

# Display results
print(f"XGBoost - MAE: {mae_xgb:.4f}")
print(f"XGBoost - RMSE: {rmse_xgb:.4f}")
print(f"XGBoost - R² Score: {r2_xgb:.4f}")


XGBoost - MAE: 2.4119
XGBoost - RMSE: 3.0341
XGBoost - R² Score: -0.0423


In [6]:
# Cell 6: Feature engineering to improve regression performance

# Feature 1: Tweet length (number of characters)
df["tweet_length"] = df["tweet_body"].apply(len)

# Feature 2: Day of the week the tweet was posted (0 = Monday, 6 = Sunday)
df["day_of_week"] = pd.to_datetime(df["date"]).dt.dayofweek  # 0 = Monday, 6 = Sunday

# Feature 3: One-hot encode the categorical sentiment labels
sentiment_dummies = pd.get_dummies(df["roberta_sentiment"], prefix="sent")

# Feature 4: Interaction features (engagement * sentiment score)
df["eng_pos"] = df["engagement_score"] * df["roberta_pos_score"]
df["eng_neg"] = df["engagement_score"] * df["roberta_neg_score"]
df["eng_neu"] = df["engagement_score"] * df["roberta_neu_score"]

# Combine all numeric features into one DataFrame
feature_cols = [
    "roberta_pos_score", "roberta_neg_score", "roberta_neu_score",
    "sentiment_polarity", "engagement_score",
    "tweet_length", "day_of_week",
    "eng_pos", "eng_neg", "eng_neu"
]

X_new = pd.concat([df[feature_cols], sentiment_dummies], axis=1)

# Standardize the features to improve model performance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

# Define regression target again
y_reg = df["pct_change"]


In [7]:
# Cell 7: Re-train XGBoost regression with new features

# Split engineered feature matrix and target into training and test sets
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_scaled, y_reg, test_size=0.2, random_state=42
)

# Initialize and train a new XGBoost regression model
xgb_reg2 = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_reg2.fit(X_train_new, y_train_new)

# Predict stock price change on the test set
y_pred_new = xgb_reg2.predict(X_test_new)

# Evaluate model performance using common regression metrics
mae_new = mean_absolute_error(y_test_new, y_pred_new)
rmse_new = np.sqrt(mean_squared_error(y_test_new, y_pred_new))
r2_new = r2_score(y_test_new, y_pred_new)

# Show results
print(f"Improved XGBoost - MAE: {mae_new:.4f}")
print(f"Improved XGBoost - RMSE: {rmse_new:.4f}")
print(f"Improved XGBoost - R² Score: {r2_new:.4f}")


Improved XGBoost - MAE: 2.4245
Improved XGBoost - RMSE: 3.0376
Improved XGBoost - R² Score: -0.0447


Conclusion (so far)
Even with better features:

The model still isn't capturing a meaningful signal from the tweets to predict % stock change.

Tweets alone may not be strong predictors without market context.

In [8]:
# Cell 8: Generate BERT embeddings for tweet text

!pip install transformers torch --quiet

# Import BERT model and tokenizer from HuggingFace Transformers
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm # To show progress bar

# Load pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert.eval()

# Define function to generate mean-pooled BERT embeddings for a single tweet
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        outputs = bert(**inputs)
    # Return the average of all token embeddings for this tweet
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Generate embeddings for every tweet in the dataset (can take time)
tweet_texts = df["tweet_body"].tolist()
bert_embeddings = [get_bert_embedding(t) for t in tqdm(tweet_texts)]


100%|██████████████████████████████████████████████████████████████████████████████| 4709/4709 [03:20<00:00, 23.53it/s]


In [9]:
# Cell 9: Check embedding shape
# Convert list of embeddings to a NumPy array for easier processing
bert_embeddings = np.array(bert_embeddings)
bert_embeddings.shape


(4709, 768)

In [10]:
# Cell 10: Combine BERT embeddings with engineered numeric features

# Use the same engineered numeric features from earlier
feature_cols = [
    "roberta_pos_score", "roberta_neg_score", "roberta_neu_score",
    "sentiment_polarity", "engagement_score",
    "tweet_length", "day_of_week",
    "eng_pos", "eng_neg", "eng_neu"
]

# One-hot encode sentiment again
sentiment_dummies = pd.get_dummies(df["roberta_sentiment"], prefix="sent")

# Concatenate all numeric features
X_numeric = pd.concat([df[feature_cols], sentiment_dummies], axis=1)

# Standardize numeric features for consistency
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Concatenate BERT embeddings with scaled numeric features
X_final = np.hstack((bert_embeddings, X_numeric_scaled))


In [11]:
# Cell 11: Train XGBoost regression model using BERT + numeric features

# Target variable
y_final = df["pct_change"]

# Train/test split
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42
)

# Initialize and train XGBoost model
xgb_bert = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_bert.fit(X_train_final, y_train_final)

# Predict and evaluate
y_pred_final = xgb_bert.predict(X_test_final)

mae_bert = mean_absolute_error(y_test_final, y_pred_final)
rmse_bert = np.sqrt(mean_squared_error(y_test_final, y_pred_final))
r2_bert = r2_score(y_test_final, y_pred_final)

# Results
print(f"XGBoost (BERT + Features) - MAE: {mae_bert:.4f}")
print(f"XGBoost (BERT + Features) - RMSE: {rmse_bert:.4f}")
print(f"XGBoost (BERT + Features) - R² Score: {r2_bert:.4f}")


XGBoost (BERT + Features) - MAE: 2.3975
XGBoost (BERT + Features) - RMSE: 3.0500
XGBoost (BERT + Features) - R² Score: -0.0532


In [12]:
# Cell 12: Install TensorFlow (run once)
!pip install tensorflow




In [13]:
# Cell 13: Tokenize and pad tweet text for LSTM

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize tokenizer with a vocabulary size of 10,000 and an out-of-vocab token
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")

# Fit the tokenizer on tweet text
tokenizer.fit_on_texts(df["tweet_body"])

# Convert the tweets into sequences of integers
sequences = tokenizer.texts_to_sequences(df["tweet_body"])

# Pad/truncate all sequences to the same length for input into LSTM
max_length = 50
X_seq = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Define the regression target: percent change in Tesla stock price
y_seq = df["pct_change"]


In [19]:
# Cell 14: Define and train LSTM model for regression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Split the padded sequences and target for training/testing
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_seq, y_seq, test_size=0.2, random_state=42
)

# Define the LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64),  # Embedding layer to learn word vectors
    LSTM(64, return_sequences=False),           # LSTM layer for sequence processing
    Dropout(0.3),                               # Dropout to reduce overfitting
    Dense(32, activation='relu'),               # Fully connected hidden layer
    Dropout(0.2),                               # Another dropout layer
    Dense(1)                                    # Output layer for regression
])

# Compile the model using mean squared error loss and Adam optimizer
model.compile(optimizer='adam', loss='mse')

# Train the model for 10 epochs, using 10% of training data for validation
history = model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32,
                    validation_split=0.1, verbose=1)


Epoch 1/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - loss: 9.2240 - val_loss: 8.1065
Epoch 2/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 9.1766 - val_loss: 8.1093
Epoch 3/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 9.1391 - val_loss: 8.1158
Epoch 4/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 8.9491 - val_loss: 8.1345
Epoch 5/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 9.0219 - val_loss: 8.1042
Epoch 6/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 8.9912 - val_loss: 8.1064
Epoch 7/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 9.3680 - val_loss: 8.1993
Epoch 8/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 8.8217 - val_loss: 8.1172
Epoch 9/10
[1m106/106[0m [32m

In [20]:
# Cell 15: Evaluate LSTM model performance

# Make predictions on the test set using the trained LSTM model
y_pred_lstm = model.predict(X_test_lstm).flatten()

# Compute regression metrics for performance evaluation
mae_lstm = mean_absolute_error(y_test_lstm, y_pred_lstm)
rmse_lstm = np.sqrt(mean_squared_error(y_test_lstm, y_pred_lstm))
r2_lstm = r2_score(y_test_lstm, y_pred_lstm)

# Show results
print(f"LSTM - MAE: {mae_lstm:.4f}")
print(f"LSTM - RMSE: {rmse_lstm:.4f}")
print(f"LSTM - R² Score: {r2_lstm:.4f}")


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
LSTM - MAE: 2.3459
LSTM - RMSE: 2.9728
LSTM - R² Score: -0.0006


In [16]:
import pickle as pkl

pkl.dump(model, open('model.pkl','wb'))