In [69]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.colors import ListedColormap
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn import metrics


pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None)

In [70]:
dfBTC = pd.read_csv (r'..\Data\crypto\bitcoinAll_1HR_Cleaned.csv')
tweetDf = pd.read_csv(r'..\Data\tweets\elon_archive_cleaned.csv')

In [71]:
tweetDf.head(5)

Unnamed: 0.1,Unnamed: 0,date,time,username,name,tweet,month,year,day,hour,minute,second,tweet_length,crypto_related
0,0,2021-11-14,17:48:31,elonmusk,Elon Musk,@ConsumerReports 👍,11,2021,14,17,48,31,18,False
1,1,2021-11-14,15:35:40,elonmusk,Elon Musk,@WholeMarsBlog Exactly,11,2021,14,15,35,40,22,False
2,2,2021-11-14,15:33:04,elonmusk,Elon Musk,@realTylerZed @SenSanders 🤣,11,2021,14,15,33,4,27,False
3,3,2021-11-14,09:24:02,elonmusk,Elon Musk,"@SenSanders Want me to sell more stock, Bernie...",11,2021,14,9,24,2,67,False
4,4,2021-11-14,08:39:18,elonmusk,Elon Musk,"@fermatslibrary Soon, that list will grow much...",11,2021,14,8,39,18,86,False


## Add column for relation to tweets

In [72]:
df = dfBTC
df['Crypto_Related'] = 0

In [73]:
relatedDf = tweetDf[tweetDf['crypto_related'] == True]

In [74]:
relatedDf.head()

Unnamed: 0.1,Unnamed: 0,date,time,username,name,tweet,month,year,day,hour,minute,second,tweet_length,crypto_related
72,72,2021-11-05,09:46:20,elonmusk,Elon Musk,@dogeofficialceo @BillyM2k 🤣,11,2021,5,9,46,20,28,True
92,92,2021-11-02,15:36:20,elonmusk,Elon Musk,@BillyM2k How about my lil doge Floki? He woul...,11,2021,2,15,36,20,84,True
110,110,2021-10-31,13:20:50,elonmusk,Elon Musk,Tuition is in Dogecoin &amp; u get a discount ...,10,2021,31,13,20,50,61,True
141,141,2021-10-27,18:16:36,elonmusk,Elon Musk,@dogeofficialceo @wapodavenport 🤣,10,2021,27,18,16,36,33,True
150,150,2021-10-27,12:47:54,elonmusk,Elon Musk,"@BillyM2k If I send you 2 Doge, will you promi...",10,2021,27,12,47,54,67,True


In [75]:
# for every row in related tweet dataframe, 
# if date exists in bitcoin price dataframe,
# have cryptorelated column equal to 1
for _, row in relatedDf.iterrows():
    df.loc[df['Date'] == row['date'], 'Crypto_Related'] = 1

In [76]:
# drop column that came from who knows where
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [77]:
df['Crypto_Related'].unique()
df.head()

Unnamed: 0,rate_open,rate_high,rate_low,rate_close,Date,hour,year,month,day,Crypto_Related
0,432.03988,432.079724,430.174245,430.83163,2016-01-01,0,2016,1,1,0
1,430.82104,431.181942,429.670276,430.378803,2016-01-01,1,2016,1,1,0
2,430.383656,431.237817,430.325129,430.608821,2016-01-01,2,2016,1,1,0
3,430.594263,432.903554,430.330866,432.903554,2016-01-01,3,2016,1,1,0
4,433.020607,436.372214,432.770797,435.60285,2016-01-01,4,2016,1,1,0


In [78]:
# start separating data for training and testing
data = np.array(df)
target = np.array(df['Crypto_Related'])
X_data = data[:,:4]
print(X_data)

[[432.0398799322916 432.0797235201113 430.1742451974526 430.8316296728032]
 [430.821040368554 431.1819424971637 429.67027647988857 430.3788032192657]
 [430.3836558932034 431.2378173674194 430.3251291538263 430.6088213915016]
 ...
 [59659.82659117507 60925.97718398709 59659.82659117507 60474.68898271314]
 [60507.27061641579 61053.38923896875 60487.19647186807 60622.95573548197]
 [60623.40066780722 60649.79528834127 60140.94737361382 60140.94737361382]]


In [79]:
# split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_data, target, test_size=0.2)

In [99]:
# model regression
model3 = LinearRegression()
# fit data
model3.fit(X_train, y_train)
# y prediction
y_prediction = model3.predict(X_test)

# some results, but looks like we don't have enough data collected with it being cryptorelated
print(metrics.classification_report(y_test, y_prediction.round(), zero_division=1))
print(metrics.confusion_matrix(y_test, y_prediction.round()))




              precision    recall  f1-score   support

           0       0.96      1.00      0.98      9884
           1       1.00      0.00      0.00       422

    accuracy                           0.96     10306
   macro avg       0.98      0.50      0.49     10306
weighted avg       0.96      0.96      0.94     10306

[[9884    0]
 [ 422    0]]
