In [55]:
# Import dependencies 
import pandas as pd
import numpy as np
import random
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Set the column width
# pd.set_option('max_colwidth', 200)

In [56]:
# Load the stock price data and the stock tweets from their CSV files into DataFrames.
stock_data_df = pd.read_csv('Resources/stock_yfinance_data.csv')
tweet_data_df = pd.read_csv('Resources/stock_tweets.csv')


In [57]:
# Preview the first rows of the stock price data (head() shows 5 rows by default)
stock_data_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA
1,2021-10-01,259.466675,260.26001,254.529999,258.406677,258.406677,51094200,TSLA
2,2021-10-04,265.5,268.98999,258.706665,260.51001,260.51001,91449900,TSLA
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,TSLA
4,2021-10-06,258.733337,262.220001,257.73999,260.916656,260.916656,43898400,TSLA


In [58]:
# Summarize the stock data: column dtypes, non-null counts and memory usage
stock_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6300 entries, 0 to 6299
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        6300 non-null   object 
 1   Open        6300 non-null   float64
 2   High        6300 non-null   float64
 3   Low         6300 non-null   float64
 4   Close       6300 non-null   float64
 5   Adj Close   6300 non-null   float64
 6   Volume      6300 non-null   int64  
 7   Stock Name  6300 non-null   object 
dtypes: float64(5), int64(1), object(2)
memory usage: 393.9+ KB


In [59]:
# Preview the first rows of the tweet data (head() shows 5 rows by default)
tweet_data_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,"Mainstream media has done an amazing job at brainwashing people. Today at work, we were asked what companies we believe in &amp; I said @Tesla because they make the safest cars &amp; EVERYONE disa...",TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k from the analysts. $tsla,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,"3/ Even if I include 63.0M unvested RSUs as of 6/30, additional equity needed for the RSUs is 63.0M x $54.20 = $3.4B. If the deal closed tomorrow at $54.20, Elon would need $2.0B for existing shar...",TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,"@RealDanODowd @WholeMarsBlog @Tesla Hahaha why are you still trying to stop Tesla FSD bro! Get your shit together and make something better? Thats how companies work, they competed. Crying big old...",TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids, you sad deranged old man",TSLA,"Tesla, Inc."


In [60]:
# Summarize the tweet data: column dtypes, non-null counts and memory usage
tweet_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80793 entries, 0 to 80792
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          80793 non-null  object
 1   Tweet         80793 non-null  object
 2   Stock Name    80793 non-null  object
 3   Company Name  80793 non-null  object
dtypes: object(4)
memory usage: 2.5+ MB


In [61]:
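# Optional cleaning step (currently disabled): strip digits and every other character
# that is not a letter or whitespace from each tweet.
# NOTE(review): presumably left off because VADER uses punctuation, capitalization and
# emoji as sentiment cues - confirm before re-enabling.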
# tweet_data_df['Tweet'] = tweet_data_df['Tweet'].apply(lambda x: re.sub(r'[^a-zA-Z\s ]', '', str(x)))


In [63]:
# Initialize the SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
sentences = tweet_data_df["Tweet"]

# Function to analyze sentiment of each sentence
def analyze_sentiment(sentences):
    """Return the VADER polarity scores for a single piece of text.

    Parameters
    ----------
    sentences : str
        The text to score. Despite the plural name this is ONE string
        (e.g. one tweet), not a collection.

    Returns
    -------
    dict
        The result of ``analyzer.polarity_scores`` with the keys
        ``neg``, ``neu``, ``pos`` and ``compound``.
    """
    return analyzer.polarity_scores(sentences)

# Score every tweet and expand the resulting dicts into one column per score.
# Reusing the tweet frame's index keeps the rows aligned for the concat below.
sentiment_columns = ["neg", "neu", "pos", "compound"]
sentiment_scores_df = pd.DataFrame(
    sentences.apply(analyze_sentiment).tolist(),
    index=tweet_data_df.index,
    columns=sentiment_columns,
)

# Append the sentiment scores into separate columns.
# Any score columns left over from a previous run of this cell are dropped first,
# so re-running the cell replaces them instead of adding duplicates.
tweet_data_df = pd.concat(
    [
        tweet_data_df.drop(columns=sentiment_columns, errors="ignore"),
        sentiment_scores_df,
    ],
    axis=1,
)

tweet_data_df.head(20)

Unnamed: 0,Date,Tweet,Stock Name,Company Name,neg,neu,pos,compound
0,2022-09-29 23:41:16+00:00,"Mainstream media has done an amazing job at brainwashing people. Today at work, we were asked what companies we believe in &amp; I said @Tesla because they make the safest cars &amp; EVERYONE disa...",TSLA,"Tesla, Inc.",0.125,0.763,0.113,0.0772
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k from the analysts. $tsla,TSLA,"Tesla, Inc.",0.0,1.0,0.0,0.0
2,2022-09-29 23:18:08+00:00,"3/ Even if I include 63.0M unvested RSUs as of 6/30, additional equity needed for the RSUs is 63.0M x $54.20 = $3.4B. If the deal closed tomorrow at $54.20, Elon would need $2.0B for existing shar...",TSLA,"Tesla, Inc.",0.0,0.949,0.051,0.296
3,2022-09-29 22:40:07+00:00,"@RealDanODowd @WholeMarsBlog @Tesla Hahaha why are you still trying to stop Tesla FSD bro! Get your shit together and make something better? Thats how companies work, they competed. Crying big old...",TSLA,"Tesla, Inc.",0.264,0.597,0.139,-0.7096
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids, you sad deranged old man",TSLA,"Tesla, Inc.",0.526,0.474,0.0,-0.875
5,2022-09-29 22:25:53+00:00,@RealDanODowd @Tesla This is you https://t.co/3Ml1XawSEb,TSLA,"Tesla, Inc.",0.0,1.0,0.0,0.0
6,2022-09-29 22:24:22+00:00,"For years @WholeMarsBlog viciously silenced @Tesla critics. Failing to silence me, he desperately lashes out with childish insults about me, my company, my products - and even my 💩! His fear and ...",TSLA,"Tesla, Inc.",0.331,0.623,0.046,-0.9325
7,2022-09-29 22:23:54+00:00,"$NIO just because I'm down money doesn't mean this is a bad investment. The whole market, everything sucks right now. 2-5 years from now, I'm confident it will pay off. Long the best $AAPL $AMZN $...",TSLA,"Tesla, Inc.",0.158,0.684,0.158,0.25
8,2022-09-29 22:23:28+00:00,50 likes for some $SPY $TSLA charts to study!\n\n❤️,TSLA,"Tesla, Inc.",0.0,0.714,0.286,0.4215
9,2022-09-29 22:15:01+00:00,"@MrJames__321 @KellyRoofing @TeslaSolar @elonmusk @Tesla The powerwalls themselves are waterproof, but what could be bad is any wiring in the house that could be damaged.",TSLA,"Tesla, Inc.",0.272,0.728,0.0,-0.8625
