In [59]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.compose import ColumnTransformer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kille\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
stock_tweets_df = pd.read_csv('Resources/stock_tweets.csv')
stock_tweets_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc."
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc."
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc."
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc."
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc."


In [3]:
stock_yfinance_df = pd.read_csv('Resources/stock_yfinance_data.csv')
stock_yfinance_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Stock Name
0,2021-09-30,260.333344,263.043335,258.333344,258.493347,258.493347,53868000,TSLA
1,2021-10-01,259.466675,260.26001,254.529999,258.406677,258.406677,51094200,TSLA
2,2021-10-04,265.5,268.98999,258.706665,260.51001,260.51001,91449900,TSLA
3,2021-10-05,261.600006,265.769989,258.066681,260.196655,260.196655,55297800,TSLA
4,2021-10-06,258.733337,262.220001,257.73999,260.916656,260.916656,43898400,TSLA


In [4]:
def get_date(date):
    return date[0:10]

In [5]:
stock_tweets_df['Datetime'] = stock_tweets_df['Date'].apply(get_date)
stock_tweets_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Datetime
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",2022-09-29
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",2022-09-29
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",2022-09-29
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",2022-09-29
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",2022-09-29


In [6]:
def get_score(tweet):
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(tweet)['compound']
    return score

In [7]:
stock_tweets_df['Sentiment Score'] = stock_tweets_df['Tweet'].apply(get_score)

In [8]:
date_stock_df = stock_tweets_df.groupby(['Datetime','Stock Name']).size().reset_index(name='Tweet Count')
date_stock_df

Unnamed: 0,Datetime,Stock Name,Tweet Count
0,2021-09-30,AAPL,7
1,2021-09-30,AMD,6
2,2021-09-30,AMZN,5
3,2021-09-30,DIS,1
4,2021-09-30,GOOG,1
...,...,...,...
5905,2022-09-29,PG,10
5906,2022-09-29,PYPL,1
5907,2022-09-29,TSLA,112
5908,2022-09-29,TSM,16


In [9]:
stock_tweets_df = pd.merge(stock_tweets_df,date_stock_df,
                           on=['Datetime','Stock Name'],
                           how='inner'
                           )
stock_tweets_df.head()

Unnamed: 0,Date,Tweet,Stock Name,Company Name,Datetime,Sentiment Score,Tweet Count
0,2022-09-29 23:41:16+00:00,Mainstream media has done an amazing job at br...,TSLA,"Tesla, Inc.",2022-09-29,0.0772,112
1,2022-09-29 23:24:43+00:00,Tesla delivery estimates are at around 364k fr...,TSLA,"Tesla, Inc.",2022-09-29,0.0,112
2,2022-09-29 23:18:08+00:00,3/ Even if I include 63.0M unvested RSUs as of...,TSLA,"Tesla, Inc.",2022-09-29,0.296,112
3,2022-09-29 22:40:07+00:00,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...,TSLA,"Tesla, Inc.",2022-09-29,-0.7568,112
4,2022-09-29 22:27:05+00:00,"@RealDanODowd @Tesla Stop trying to kill kids,...",TSLA,"Tesla, Inc.",2022-09-29,-0.875,112


In [10]:
stock_yfinance_df['Open/Close Diff'] = round(abs(stock_yfinance_df['Open'] - stock_yfinance_df['Close']),2)

In [11]:
stock_yfinance_df['Prev Close Diff'] = stock_yfinance_df['Close'].diff()

In [12]:
stock_yfinance_df = stock_yfinance_df.rename(columns={'Date':'Datetime'})

In [13]:
stock_df = pd.merge(stock_tweets_df,stock_yfinance_df,on=['Datetime','Stock Name'])

In [14]:
close_df = stock_df['Close']
stock_df = stock_df.drop(columns=['Date','Datetime','Tweet','Close'])

In [52]:
categorical_df = stock_df.select_dtypes('object')
numerical_df = stock_df.select_dtypes(['int64','float64'])

In [65]:
ct = ColumnTransformer([('scaler',StandardScaler(),numerical_df.columns)])
array = ct.fit_transform(numerical_df)
scaler = MinMaxScaler(feature_range=(0,1))
array = scaler.fit_transform(array)
numerical_df = pd.DataFrame(data=array,columns=ct.get_feature_names_out())

In [73]:
categorical_df = pd.get_dummies(data=categorical_df,dtype=int)


In [67]:
final_stock_df = pd.concat([categorical_df,numerical_df,close_df],axis=1)

In [68]:
final_stock_df

Unnamed: 0,Stock Name_AAPL,Stock Name_AMD,Stock Name_AMZN,Stock Name_BA,Stock Name_BX,Stock Name_COST,Stock Name_CRM,Stock Name_DIS,Stock Name_ENPH,Stock Name_F,...,scaler__scaler__Sentiment Score,scaler__scaler__Tweet Count,scaler__scaler__Open,scaler__scaler__High,scaler__scaler__Low,scaler__scaler__Adj Close,scaler__scaler__Volume,scaler__scaler__Open/Close Diff,scaler__scaler__Prev Close Diff,Close
0,0,0,0,0,0,0,0,0,0,0,...,0.538566,0.240781,0.398767,0.394835,0.377466,0.377952,0.283144,0.275307,0.373179,268.209991
1,0,0,0,0,0,0,0,0,0,0,...,0.499647,0.240781,0.398767,0.394835,0.377466,0.377952,0.283144,0.275307,0.373179,268.209991
2,0,0,0,0,0,0,0,0,0,0,...,0.648871,0.240781,0.398767,0.394835,0.377466,0.377952,0.283144,0.275307,0.373179,268.209991
3,0,0,0,0,0,0,0,0,0,0,...,0.118119,0.240781,0.398767,0.394835,0.377466,0.377952,0.283144,0.275307,0.373179,268.209991
4,0,0,0,0,0,0,0,0,0,0,...,0.058530,0.240781,0.398767,0.394835,0.377466,0.377952,0.283144,0.275307,0.373179,268.209991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63671,0,0,0,0,0,0,0,0,0,0,...,0.589333,0.000000,0.037872,0.039905,0.038376,0.040180,0.044713,0.025733,0.401610,38.259998
63672,0,0,0,0,0,0,0,0,0,0,...,0.692125,0.000000,0.035347,0.034569,0.033459,0.034084,0.021957,0.020246,0.396646,34.110001
63673,0,0,0,0,0,0,0,0,0,0,...,0.712140,0.002169,0.037651,0.036831,0.035769,0.036067,0.026417,0.024409,0.398279,35.459999
63674,0,0,0,0,0,0,0,0,0,0,...,0.773039,0.002169,0.037651,0.036831,0.035769,0.036067,0.026417,0.024409,0.398279,35.459999


In [75]:
final_stock_df = final_stock_df.fillna(0.0)

In [76]:
final_stock_df.to_csv('Resources/final_stock_data.csv',index=False)