In [2]:
# Data handling
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Date & time handling
from datetime import datetime, timedelta

# Natural Language Processing
import re
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Optional: Display settings
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\seven\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
stocks_df = pd.read_csv("resources/tesla_stock_data_2000_2025.csv")
stocks_df.head()

Unnamed: 0,Price,Close,High,Low,Open,Volume
0,Ticker,TSLA,TSLA,TSLA,TSLA,TSLA
1,Date,,,,,
2,2010-06-29,1.5926669836044312,1.6666669845581055,1.1693329811096191,1.2666670083999634,281494500
3,2010-06-30,1.5886670351028442,2.0280001163482666,1.553333044052124,1.7193330526351929,257806500
4,2010-07-01,1.4639999866485596,1.7280000448226929,1.3513330221176147,1.6666669845581055,123282000


In [10]:
stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3694 entries, 0 to 3693
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Price   3694 non-null   object
 1   Close   3693 non-null   object
 2   High    3693 non-null   object
 3   Low     3693 non-null   object
 4   Open    3693 non-null   object
 5   Volume  3693 non-null   object
dtypes: object(6)
memory usage: 173.3+ KB


In [11]:
tweets_df = pd.read_csv("resources/musk_quote_tweets.csv")
tweets_df.head()

Unnamed: 0,orig_tweet_id,orig_tweet_created_at,orig_tweet_text,orig_tweet_url,orig_tweet_twitter_url,orig_tweet_username,orig_tweet_retweet_count,orig_tweet_reply_count,orig_tweet_like_count,orig_tweet_quote_count,orig_tweet_view_count,orig_tweet_bookmark_count,musk_tweet_id,musk_quote_tweet,musk_quote_retweet_count,musk_quote_reply_count,musk_quote_like_count,musk_quote_quote_count,musk_quote_view_count,musk_quote_bookmark_count,musk_quote_created_at
0,1655977349530243074,2023-05-09 16:45:41+00:00,Hot take:\n\n@Twitter can become the world's n...,https://x.com/GuyDealership/status/16559773495...,https://twitter.com/GuyDealership/status/16559...,GuyDealership,632,497,7730,123,12218484.0,387,1655978502187778073,Yup,3255,3747,39533,225,11392206.0,281,2023-05-09 16:50:16+00:00
1,1655968201422012418,2023-05-09 16:09:20+00:00,If Legacy Media is going to shove bigotry porn...,https://x.com/TheRabbitHole84/status/165596820...,https://twitter.com/TheRabbitHole84/status/165...,TheRabbitHole84,1409,412,5328,264,14775379.0,633,1655968899903418373,Massive public manipulation,9811,2694,49528,534,14404853.0,1241,2023-05-09 16:12:06+00:00
2,1647327385342320640,2023-04-15 19:53:49+00:00,Starlink provides internet to the most remote ...,https://x.com/teslaownersSV/status/16473273853...,https://twitter.com/teslaownersSV/status/16473...,teslaownersSV,1253,800,8472,175,20877634.0,173,1647339741610926080,💯,8257,4793,106036,376,18520248.0,366,2023-04-15 20:42:55+00:00
3,1646226917387796491,2023-04-12 19:00:57+00:00,National Public Radio Denies Being National Or...,https://x.com/TheBabylonBee/status/16462269173...,https://twitter.com/TheBabylonBee/status/16462...,TheBabylonBee,4781,701,37752,442,18478251.0,167,1646228474628280326,🤣🤣,10198,5076,108462,430,16690340.0,340,2023-04-12 19:07:08+00:00
4,1640016339011076097,2023-03-26 15:42:19+00:00,Arthur C. Clarke about the future of AI. \n—21...,https://x.com/Rainmaker1973/status/16400163390...,https://twitter.com/Rainmaker1973/status/16400...,Rainmaker1973,5409,737,20773,814,27514757.0,3333,1640171198091866114,Prescient,9193,5118,56272,572,25169601.0,3792,2023-03-27 01:57:41+00:00


In [12]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7147 entries, 0 to 7146
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   orig_tweet_id              7147 non-null   int64  
 1   orig_tweet_created_at      7147 non-null   object 
 2   orig_tweet_text            7147 non-null   object 
 3   orig_tweet_url             7147 non-null   object 
 4   orig_tweet_twitter_url     7147 non-null   object 
 5   orig_tweet_username        7147 non-null   object 
 6   orig_tweet_retweet_count   7147 non-null   int64  
 7   orig_tweet_reply_count     7147 non-null   int64  
 8   orig_tweet_like_count      7147 non-null   int64  
 9   orig_tweet_quote_count     7147 non-null   int64  
 10  orig_tweet_view_count      6634 non-null   float64
 11  orig_tweet_bookmark_count  7147 non-null   int64  
 12  musk_tweet_id              7147 non-null   int64  
 13  musk_quote_tweet           7147 non-null   objec

In [13]:
posts_df = pd.read_csv("resources/all_musk_posts.csv")
posts_df.head()

  posts_df = pd.read_csv("resources/all_musk_posts.csv")


Unnamed: 0,id,url,twitterUrl,fullText,retweetCount,replyCount,likeCount,quoteCount,viewCount,createdAt,bookmarkCount,isReply,inReplyToId,conversationId,inReplyToUserId,inReplyToUsername,isPinned,isRetweet,isQuote,isConversationControlled,possiblySensitive,quoteId,quote,retweet
0,1655159652990976000,https://x.com/elonmusk/status/1655159652990976000,https://twitter.com/elonmusk/status/1655159652...,RT @einarvollset: I read @paulg’s “How to Mak...,,,,,,2023-05-07 10:36:27+00:00,,,,,,,,,,,,,,
1,1657261624867299339,https://x.com/elonmusk/status/1657261624867299339,https://twitter.com/elonmusk/status/1657261624...,https://t.co/Zjn6r15lrR,,,,,,2023-05-13 05:48:56+00:00,,,,,,,,,,,,,,
2,1623774484795920384,https://x.com/elonmusk/status/1623774484795920384,https://twitter.com/elonmusk/status/1623774484...,RT @BillyM2k: dude bookmarks are an awesome tw...,,,,,,2023-02-09 20:03:00+00:00,,,,,,,,,,,,,,
3,1656900119202254854,https://x.com/elonmusk/status/1656900119202254854,https://twitter.com/elonmusk/status/1656900119...,Event Horizon Balance Beam,,,,,,2023-05-12 05:52:26+00:00,,,,,,,,,,,,,,
4,1616531874763116544,https://x.com/elonmusk/status/1616531874763116544,https://twitter.com/elonmusk/status/1616531874...,RT @BillyM2k: @elonmusk oh that’s actually pre...,,,,,,2023-01-20 20:23:27+00:00,,,,,,,,,,,,,,


In [14]:
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54461 entries, 0 to 54460
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        54461 non-null  int64  
 1   url                       54461 non-null  object 
 2   twitterUrl                54461 non-null  object 
 3   fullText                  54461 non-null  object 
 4   retweetCount              54371 non-null  float64
 5   replyCount                53659 non-null  float64
 6   likeCount                 54371 non-null  float64
 7   quoteCount                53633 non-null  float64
 8   viewCount                 33817 non-null  float64
 9   createdAt                 54461 non-null  object 
 10  bookmarkCount             53633 non-null  float64
 11  isReply                   53633 non-null  object 
 12  inReplyToId               39495 non-null  float64
 13  conversationId            53659 non-null  float64
 14  inRepl

In [18]:
# --- STEP 1: Clean stock data and limit by date range ---

# Remove bad rows and manually assign proper column names
stocks_df = stocks_df.iloc[2:].copy()
stocks_df.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']

# Convert 'Date' column to datetime
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'])

# --- STEP 2: Clean tweet dates and remove timezone info ---
tweets_df['musk_quote_created_at'] = pd.to_datetime(tweets_df['musk_quote_created_at'])
tweets_df['musk_quote_created_at'] = tweets_df['musk_quote_created_at'].dt.tz_localize(None)

# --- STEP 3: Get tweet date range ---
tweet_start = tweets_df['musk_quote_created_at'].min()
tweet_end = tweets_df['musk_quote_created_at'].max()

# --- STEP 4: Filter stock data to match tweet timeline ---
stocks_df = stocks_df[(stocks_df['Date'] >= tweet_start) & (stocks_df['Date'] <= tweet_end)]

# Check
print(f"Tweet date range: {tweet_start.date()} to {tweet_end.date()}")
stocks_df.head()


Tweet date range: 2015-06-27 to 2025-03-31


Unnamed: 0,Date,Close,High,Low,Open,Volume
1260,2015-06-29,17.468000411987305,17.729999542236328,17.3799991607666,17.463333129882812,52183500
1261,2015-06-30,17.884000778198242,18.06133270263672,17.600000381469727,17.65333366394043,46303500
1262,2015-07-01,17.94333267211914,18.17466735839844,17.856666564941406,18.073999404907227,31518000
1263,2015-07-02,18.667999267578125,18.82999992370605,18.220666885375977,18.68000030517578,107458500
1264,2015-07-06,18.64800071716309,18.779333114624023,18.420000076293945,18.59199905395508,61828500


In [25]:
# Add Volume to the dataset
stocks_df = stocks_df[['Date', 'Open', 'Close', 'Volume']].copy()

# Convert numeric columns
stocks_df[['Open', 'Close', 'Volume']] = stocks_df[['Open', 'Close', 'Volume']].apply(pd.to_numeric, errors='coerce')

# Calculate percent change
stocks_df['pct_change'] = ((stocks_df['Close'] - stocks_df['Open']) / stocks_df['Open']) * 100

# Preview
stocks_df.head()

Unnamed: 0,Date,Open,Close,Volume,pct_change
1260,2015-06-29,17.463333,17.468,52183500,0.026726
1261,2015-06-30,17.653334,17.884001,46303500,1.306649
1262,2015-07-01,18.073999,17.943333,31518000,-0.722954
1263,2015-07-02,18.68,18.667999,107458500,-0.064245
1264,2015-07-06,18.591999,18.648001,61828500,0.301214


In [26]:
stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2434 entries, 1260 to 3693
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        2434 non-null   datetime64[ns]
 1   Open        2434 non-null   float64       
 2   Close       2434 non-null   float64       
 3   Volume      2434 non-null   int64         
 4   pct_change  2434 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 114.1 KB


In [19]:
import re

# Define cleaning function
def clean_tweet(text):
    if pd.isnull(text):
        return ""
    text = text.lower()  # lowercase
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)     # remove @mentions
    text = re.sub(r'#\w+', '', text)     # remove hashtags
    text = re.sub(r'rt[\s]+', '', text)  # remove retweet "RT"
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

# Apply to the tweet text column
tweets_df['clean_text'] = tweets_df['musk_quote_tweet'].apply(clean_tweet)

# Preview
tweets_df[['musk_quote_tweet', 'clean_text']].head()

Unnamed: 0,musk_quote_tweet,clean_text
0,Yup,yup
1,Massive public manipulation,massive public manipulation
2,💯,
3,🤣🤣,
4,Prescient,prescient


In [20]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7147 entries, 0 to 7146
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   orig_tweet_id              7147 non-null   int64         
 1   orig_tweet_created_at      7147 non-null   object        
 2   orig_tweet_text            7147 non-null   object        
 3   orig_tweet_url             7147 non-null   object        
 4   orig_tweet_twitter_url     7147 non-null   object        
 5   orig_tweet_username        7147 non-null   object        
 6   orig_tweet_retweet_count   7147 non-null   int64         
 7   orig_tweet_reply_count     7147 non-null   int64         
 8   orig_tweet_like_count      7147 non-null   int64         
 9   orig_tweet_quote_count     7147 non-null   int64         
 10  orig_tweet_view_count      6634 non-null   float64       
 11  orig_tweet_bookmark_count  7147 non-null   int64         
 12  musk_t

In [22]:
# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Apply sentiment scoring
tweets_df['sentiment'] = tweets_df['clean_text'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Optional: Add breakdown if you want more granularity
# tweets_df['neg'] = tweets_df['clean_text'].apply(lambda x: sia.polarity_scores(x)['neg'])
# tweets_df['neu'] = tweets_df['clean_text'].apply(lambda x: sia.polarity_scores(x)['neu'])
# tweets_df['pos'] = tweets_df['clean_text'].apply(lambda x: sia.polarity_scores(x)['pos'])

# Preview result
tweets_df[['clean_text', 'sentiment']].head()

Unnamed: 0,clean_text,sentiment
0,yup,0.0
1,massive public manipulation,-0.296
2,,0.0
3,,0.0
4,prescient,0.0


In [24]:
!pip install transformers torch --quiet

In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch

# Load model + tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create sentiment pipeline
hf_sentiment = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Run on sample of tweets (for speed/testing)
tweets_df['hf_sentiment'] = tweets_df['clean_text'].apply(lambda x: hf_sentiment(x)[0]['label'] if x.strip() != '' else 'NEUTRAL')

# Preview results
tweets_df[['clean_text', 'hf_sentiment']].head()


ImportError: cannot import name 'TypeIs' from 'typing_extensions' (C:\Users\seven\anaconda3\envs\dev\lib\site-packages\typing_extensions.py)