# Elon Musk Tweet Preprocessing

Elon Musk Tweet Preprocessing for Tesla Stock Prediction
Extracts three key features:
1. Tweet sentiment (FinBERT)
2. Whether the tweet was posted before market close (4:00 PM ET)
3. Whether the tweet mentions Tesla, production, or delivery


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, time
import pytz
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax

In [None]:
# Load the raw export of posts. low_memory=False asks pandas to parse the file
# in one pass rather than in chunks when inferring column dtypes.
musk_posts = pd.read_csv('all_musk_posts.csv', low_memory=False)
# Analysis window as date strings; reused by the date filter and by
# create_complete_time_series further down.
start_date = "2020-01-01"
end_date = "2024-12-31"

In [None]:
!pip install transformers torch pytz



In [None]:
# Tesla-related keywords.
# All entries are lowercase because check_tesla_mention lowercases the tweet and
# then tests each entry with a plain substring check (`keyword in text_lower`).
# NOTE(review): because matching is by substring, short entries such as 'semi',
# 'giga' or 'models' also match inside longer, unrelated words
# (e.g. "semiconductor") - confirm this is acceptable for the feature.
tesla_keywords = [
    'tesla', 'tsla', '$tsla', '#tsla',
    'model 3', 'model y', 'model s', 'model x', 'model3', 'modely', 'models', 'modelx',
    'cybertruck', 'roadster', 'semi',
    'production', 'delivery', 'deliveries', 'delivered', 'producing',
    'gigafactory', 'giga', 'fremont', 'shanghai', 'berlin', 'texas', 'austin',
    'fsd', 'autopilot', 'full self driving', 'full self-driving',
    'earnings', 'quarterly', 'profit', 'revenue',
    'supercharger', 'battery', 'powerwall', 'solar',
    'elon tesla', 'my company', 'our company'
]

In [None]:
# Market close time (4:00 PM Eastern Time).
# This is a naive wall-clock time; extract_timing_features compares it against
# tweet times that have already been converted to US/Eastern.
market_close_time = time(16, 0)

## Load and filter data


In [None]:
# Baseline row count before any filtering is applied
print(f"Original rows: {len(musk_posts)}")

Original rows: 55099


In [None]:

# Convert createdAt to timezone-aware UTC timestamps
musk_posts['createdAt'] = pd.to_datetime(musk_posts['createdAt'], utc=True)

# Filter by date range (bounds are interpreted in UTC).
# end_date is a bare date, which parses as midnight at the START of that day, so
# a "<= end_date" comparison would drop almost every post made on the final day.
# Use a half-open interval that ends at the start of the following day instead.
range_start = pd.Timestamp(start_date, tz='UTC')
range_end = pd.Timestamp(end_date, tz='UTC') + pd.Timedelta(days=1)
in_range = (musk_posts['createdAt'] >= range_start) & (musk_posts['createdAt'] < range_end)
musk_posts = musk_posts[in_range]
print(f"After date filtering: {len(musk_posts)}")


After date filtering: 39428


In [None]:
# Keep original content only: retain rows whose isRetweet flag equals False
is_original = musk_posts['isRetweet'].eq(False)
musk_posts = musk_posts[is_original]
print(f"After removing retweets: {len(musk_posts)}")

After removing retweets: 38739


In [None]:
# Drop rows whose text is missing or contains only whitespace
has_text = musk_posts['fullText'].notna()
not_blank = musk_posts['fullText'].str.strip() != ''
musk_posts = musk_posts[has_text & not_blank]
print(f"After removing empty text: {len(musk_posts)}")

After removing empty text: 38739


## Load FinBERT for sentiment analysis

In [None]:
# Download (or load from the local cache) the FinBERT tokenizer and classifier
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Switch to inference mode. The trailing ';' stops the notebook from echoing the
# full module repr that eval() returns, which otherwise floods the cell output.
model.eval();

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def get_sentiment_finbert(text, tokenizer, model, max_length=512):
    """
    Get sentiment scores for a single text using a FinBERT-style classifier.

    Args:
        text: The text to score.
        tokenizer: Callable that turns `text` into a mapping of model inputs
            (called with return_tensors="pt", truncation, max_length, padding).
        model: Callable that accepts those inputs as keyword arguments and
            returns an object with a `logits` tensor of shape (1, num_classes).
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        dict with keys 'sentiment_positive', 'sentiment_negative',
        'sentiment_neutral' (softmax probabilities as floats) and
        'sentiment_polarity' (positive minus negative).
    """
    # Tokenize; truncation caps over-long texts at max_length tokens
    inputs = tokenizer(text, return_tensors="pt", truncation=True,
                      max_length=max_length, padding=True)

    # Inference only, so gradients are not tracked
    with torch.no_grad():
        outputs = model(**inputs)
        probs = softmax(outputs.logits, dim=1)[0]

    # Default class order: [positive, negative, neutral]
    label_index = {'positive': 0, 'negative': 1, 'neutral': 2}

    # Prefer the order the model itself declares, so that a checkpoint with a
    # different label layout is not silently mis-read. Fall back to the default
    # when the mapping is missing or does not name all three classes.
    id2label = getattr(getattr(model, 'config', None), 'id2label', None)
    if id2label:
        declared = {str(label).lower(): int(idx) for idx, label in id2label.items()}
        if all(name in declared for name in label_index):
            label_index = {name: declared[name] for name in label_index}

    positive = probs[label_index['positive']].item()
    negative = probs[label_index['negative']].item()
    neutral = probs[label_index['neutral']].item()

    # Calculate polarity: positive - negative
    polarity = positive - negative

    return {
        'sentiment_positive': positive,
        'sentiment_negative': negative,
        'sentiment_neutral': neutral,
        'sentiment_polarity': polarity
    }

## Features

In [None]:
# Extract sentiment features for each tweet
def extract_sentiment_features(df, tokenizer, model):
    """
    Score every tweet in df['fullText'] and append the sentiment columns.

    Tweets are scored one at a time with get_sentiment_finbert; a progress line
    is printed every 100 tweets. Returns a new frame whose index is reset to
    0..n-1 and which carries the original columns followed by the sentiment
    columns.
    """
    total = len(df)
    scores = []
    for position, tweet_text in enumerate(df['fullText']):
        if position % 100 == 0:
            print(f"Processing tweet {position}/{total}")
        scores.append(get_sentiment_finbert(tweet_text, tokenizer, model))

    # The index is reset so the positional score rows line up with the tweets
    scored = pd.DataFrame(scores)
    return pd.concat([df.reset_index(drop=True), scored], axis=1)

In [None]:
# Extract timing features: whether tweet was before market close
def extract_timing_features(df):
    """
    Add Eastern-time timing columns to df (modified in place) and return it.

    Columns added: createdAt_ET, tweet_time, before_market_close, hour_ET,
    day_of_week (0=Monday) and is_weekend.
    """
    # Express the timezone-aware createdAt timestamps in US Eastern time
    eastern_tz = pytz.timezone('US/Eastern')
    df['createdAt_ET'] = df['createdAt'].dt.tz_convert(eastern_tz)

    # Wall-clock time of day of each tweet
    df['tweet_time'] = df['createdAt_ET'].dt.time

    # 1 when the tweet's time of day is earlier than the market close time
    df['before_market_close'] = df['tweet_time'].apply(
        lambda clock: int(clock < market_close_time)
    )

    # Calendar-derived features
    df['hour_ET'] = df['createdAt_ET'].dt.hour
    df['day_of_week'] = df['createdAt_ET'].dt.dayofweek  # 0=Monday, 4=Friday
    df['is_weekend'] = df['day_of_week'].apply(lambda day: int(day >= 5))

    return df

In [None]:
# Check if tweet mentions Tesla or related keywords
def check_tesla_mention(text, keywords=None):
    """
    Return 1 if `text` contains any Tesla-related keyword, else 0.

    Matching is a case-insensitive substring test.

    Args:
        text: Tweet text. Non-string values (None, NaN) count as no mention.
        keywords: Optional iterable of keywords to look for. Defaults to the
            module-level `tesla_keywords` list.
    """
    # Missing text cannot mention anything; avoids AttributeError on .lower()
    if not isinstance(text, str):
        return 0
    if keywords is None:
        keywords = tesla_keywords

    text_lower = text.lower()
    return int(any(keyword.lower() in text_lower for keyword in keywords))

In [None]:
# Count number of Tesla-related keywords in tweet
def count_tesla_keywords(text, keywords=None):
    """
    Count non-overlapping Tesla-related keyword occurrences in `text`.

    Matching is case-insensitive. When keywords overlap (e.g. 'tsla' inside
    '$tsla', or 'giga' inside 'gigafactory'), a given stretch of text is counted
    once, with the longest keyword at that position winning, rather than once
    per keyword that happens to contain it.

    Args:
        text: Tweet text. Non-string values (None, NaN) yield 0.
        keywords: Optional iterable of keywords. Defaults to the module-level
            `tesla_keywords` list.
    """
    if not isinstance(text, str):
        return 0
    if keywords is None:
        keywords = tesla_keywords

    # Longest first so the regex alternation prefers 'gigafactory' over 'giga'
    ordered = sorted({kw.lower() for kw in keywords if kw}, key=len, reverse=True)
    if not ordered:
        return 0

    pattern = re.compile('|'.join(re.escape(kw) for kw in ordered))
    return len(pattern.findall(text.lower()))

In [None]:
# Extract Tesla-related mention features
def extract_tesla_features(df):
    """
    Add Tesla-mention columns to df (modified in place) and return it.

    Columns added:
        mentions_tesla: result of check_tesla_mention for each tweet.
        tesla_keyword_count: result of count_tesla_keywords for each tweet.

    Also prints how many tweets mention Tesla and their share of all tweets.
    """
    df['mentions_tesla'] = df['fullText'].apply(check_tesla_mention)
    df['tesla_keyword_count'] = df['fullText'].apply(count_tesla_keywords)

    mention_total = df['mentions_tesla'].sum()
    # Guard the share against an empty frame (would otherwise divide by zero)
    mention_pct = mention_total / len(df) * 100 if len(df) else 0.0
    print(f"Tweets mentioning Tesla: {mention_total} ({mention_pct:.1f}%)")

    return df

In [None]:
# Aggregate tweet-level features to daily level
def aggregate_daily_features(df):
    """
    Collapse tweet-level features into one row per calendar date.

    The date is taken from the createdAt_ET column. As a side effect, `date`
    and `weighted_sentiment` columns are added to the input frame.

    Returns a frame with, per date: tweet count, mean positive/negative/neutral
    scores, mean/max/min/std of polarity, pre-close and Tesla tweet counts,
    keyword total, engagement totals, the two percentage columns and the mean
    weighted sentiment.
    """
    # Calendar date of each tweet, taken from the Eastern-time timestamp
    df['date'] = df['createdAt_ET'].dt.date

    # Named aggregation yields the final column names directly
    daily_agg = (
        df.groupby('date')
        .agg(
            # Tweet volume
            total_tweet_count=('id', 'count'),
            # Sentiment
            avg_sentiment_positive=('sentiment_positive', 'mean'),
            avg_sentiment_negative=('sentiment_negative', 'mean'),
            avg_sentiment_neutral=('sentiment_neutral', 'mean'),
            avg_sentiment_polarity=('sentiment_polarity', 'mean'),
            max_sentiment_polarity=('sentiment_polarity', 'max'),
            min_sentiment_polarity=('sentiment_polarity', 'min'),
            std_sentiment_polarity=('sentiment_polarity', 'std'),
            # Timing
            pre_close_tweet_count=('before_market_close', 'sum'),
            # Tesla mentions
            tesla_tweet_count=('mentions_tesla', 'sum'),
            total_tesla_keywords=('tesla_keyword_count', 'sum'),
            # Engagement
            total_likes=('likeCount', 'sum'),
            total_retweets=('retweetCount', 'sum'),
            total_replies=('replyCount', 'sum'),
            total_views=('viewCount', 'sum'),
        )
        .reset_index()
    )

    # Shares of the day's tweets that are Tesla-related / posted before close
    tweets_per_day = daily_agg['total_tweet_count']
    daily_agg['pct_tesla_tweets'] = (
        daily_agg['tesla_tweet_count'] / tweets_per_day
    ).fillna(0)
    daily_agg['pct_pre_close_tweets'] = (
        daily_agg['pre_close_tweet_count'] / tweets_per_day
    ).fillna(0)

    # Weighted sentiment: polarity scaled by (1 + mentions_tesla), so tweets that
    # mention Tesla carry double weight before the daily mean is taken
    df['weighted_sentiment'] = df['sentiment_polarity'] * (1 + df['mentions_tesla'])
    weighted_by_day = df.groupby('date')['weighted_sentiment'].mean().reset_index()
    daily_agg = daily_agg.merge(weighted_by_day, on='date', how='left')

    print(f"Total trading days with tweets: {len(daily_agg)}")

    return daily_agg


In [None]:
# Score every tweet with FinBERT. Tweets are processed one at a time, so this is
# the slow step of the notebook; progress is printed every 100 tweets.
musk_posts = extract_sentiment_features(musk_posts, tokenizer, model)

Processing tweet 0/38739
Processing tweet 100/38739
Processing tweet 200/38739
Processing tweet 300/38739
Processing tweet 400/38739
Processing tweet 500/38739
Processing tweet 600/38739
Processing tweet 700/38739
Processing tweet 800/38739
Processing tweet 900/38739
Processing tweet 1000/38739
Processing tweet 1100/38739
Processing tweet 1200/38739
Processing tweet 1300/38739
Processing tweet 1400/38739
Processing tweet 1500/38739
Processing tweet 1600/38739
Processing tweet 1700/38739
Processing tweet 1800/38739
Processing tweet 1900/38739
Processing tweet 2000/38739
Processing tweet 2100/38739
Processing tweet 2200/38739
Processing tweet 2300/38739
Processing tweet 2400/38739
Processing tweet 2500/38739
Processing tweet 2600/38739
Processing tweet 2700/38739
Processing tweet 2800/38739
Processing tweet 2900/38739
Processing tweet 3000/38739
Processing tweet 3100/38739
Processing tweet 3200/38739
Processing tweet 3300/38739
Processing tweet 3400/38739
Processing tweet 3500/38739
Proc

In [None]:
# Add Eastern-time timing columns (before_market_close, hour_ET, day_of_week, is_weekend)
musk_posts = extract_timing_features(musk_posts)

In [None]:
# Add the mentions_tesla flag and tesla_keyword_count column for each tweet
musk_posts = extract_tesla_features(musk_posts)

Tweets mentioning Tesla: 3822 (9.9%)


In [None]:
# Persist the tweet-level feature table before it is aggregated by day
tweet_level_output = "musk_tweets_processed_tweet_level.csv"
musk_posts.to_csv(tweet_level_output, index=False)

In [None]:
# Collapse tweet-level rows into one row per Eastern-time calendar date
daily_musk_posts = aggregate_daily_features(musk_posts)

Total trading days with tweets: 1748


In [None]:
# Create complete date range and fill missing days
def create_complete_time_series(daily_agg, start_date, end_date):
    """
    Expand the daily aggregates to one row per calendar day in the window.

    Args:
        daily_agg: Daily feature frame with a `date` column of datetime.date
            values and a `total_tweet_count` column.
        start_date: First day of the series (anything pd.date_range accepts).
        end_date: Last day of the series, inclusive.

    Returns:
        A frame with one row per day from start_date to end_date. Days that are
        absent from daily_agg get 0 for count, percentage, keyword, sentiment
        and engagement columns.
    """
    # Create complete date range
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    complete_df = pd.DataFrame({'date': date_range.date})

    # Left merge keeps every calendar day, leaving NaN where there were no tweets
    merged_df = complete_df.merge(daily_agg, on='date', how='left')

    # Counts, percentages and keyword totals: 0 means "no tweets that day".
    # 'keyword' is included so total_tesla_keywords is not left as NaN.
    count_tags = ('count', 'pct', 'keyword')
    count_columns = [
        col for col in merged_df.columns
        if any(tag in col.lower() for tag in count_tags)
    ]
    if count_columns:
        merged_df[count_columns] = merged_df[count_columns].fillna(0)

    # Sentiment: filled with 0 on days without tweets. This also zeroes
    # std_sentiment_polarity on single-tweet days, where it is undefined.
    sentiment_columns = [col for col in merged_df.columns if 'sentiment' in col.lower()]
    if sentiment_columns:
        merged_df[sentiment_columns] = merged_df[sentiment_columns].fillna(0)

    # Engagement totals: fill with 0. Both the raw column names and the
    # aggregated total_* names are listed, because the daily aggregation
    # renames the raw columns and only the renamed ones reach this function.
    engagement_columns = [
        'likeCount', 'retweetCount', 'replyCount', 'viewCount',
        'total_likes', 'total_retweets', 'total_replies', 'total_views',
    ]
    existing_engagement = [col for col in engagement_columns if col in merged_df.columns]
    if existing_engagement:
        merged_df[existing_engagement] = merged_df[existing_engagement].fillna(0)

    print(f"Complete time series: {len(merged_df)} days")
    print(f"Days with tweets: {(merged_df['total_tweet_count'] > 0).sum()}")
    print(f"Days without tweets: {(merged_df['total_tweet_count'] == 0).sum()}")

    return merged_df

In [None]:
# Expand to one row per calendar day between start_date and end_date
final_musk_posts = create_complete_time_series(daily_musk_posts, start_date, end_date)

Complete time series: 1827 days
Days with tweets: 1748
Days without tweets: 79


In [None]:
# Write the final daily feature table (one row per calendar day) to CSV
output_file = "musk_posts_output.csv"
final_musk_posts.to_csv(output_file, index=False)


In [None]:
# Headline statistics over the completed daily series.
# NOTE(review): the means below are taken over every calendar day, including the
# zero-filled days without tweets, so "Average sentiment polarity" is pulled
# toward 0 by those days - confirm this is the intended definition.
print("SUMMARY STATISTICS")
print("+"*70)
print(f"\nDate range: {final_musk_posts['date'].min()} to {final_musk_posts['date'].max()}")
print(f"Total days: {len(final_musk_posts)}")
print(f"Days with tweets: {(final_musk_posts['total_tweet_count'] > 0).sum()}")
print(f"\nAverage tweets per day: {final_musk_posts['total_tweet_count'].mean():.2f}")
print(f"Average sentiment polarity: {final_musk_posts['avg_sentiment_polarity'].mean():.4f}")
print(f"Average Tesla tweets per day: {final_musk_posts['tesla_tweet_count'].mean():.2f}")
print(f"Average pre-close tweets per day: {final_musk_posts['pre_close_tweet_count'].mean():.2f}")

SUMMARY STATISTICS
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Date range: 2020-01-01 to 2024-12-31
Total days: 1827
Days with tweets: 1748

Average tweets per day: 21.20
Average sentiment polarity: 0.0095
Average Tesla tweets per day: 2.09
Average pre-close tweets per day: 13.22


In [None]:
# Preview the first rows. A bare last expression lets the notebook render the
# frame as a table instead of the wrapped plain-text dump that print() produces.
final_musk_posts.head(5)

         date  total_tweet_count  avg_sentiment_positive  \
0  2020-01-01                3.0                0.235565   
1  2020-01-02                0.0                0.000000   
2  2020-01-03                3.0                0.108945   
3  2020-01-04                1.0                0.021464   
4  2020-01-05                3.0                0.037397   

   avg_sentiment_negative  avg_sentiment_neutral  avg_sentiment_polarity  \
0                0.096583               0.667852                0.138982   
1                0.000000               0.000000                0.000000   
2                0.022973               0.868082                0.085972   
3                0.038798               0.939738               -0.017335   
4                0.033621               0.928983                0.003776   

   max_sentiment_polarity  min_sentiment_polarity  std_sentiment_polarity  \
0                0.601621               -0.207319                0.416830   
1                0.000000   