In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tnrange, tqdm_notebook, tqdm
from scipy import stats
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.seasonal import seasonal_decompose
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Set style for plots
sns.set(style='whitegrid')


# Load the data
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling,
# so this file must come from a trusted source.
df = pd.read_pickle('tweets_with_labels.pkl')
# Last expression of the cell: preview the first rows of the loaded frame.
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,...,joy_prob,love_prob,optimism_prob,pessimism_prob,sadness_prob,surprise_prob,trust_prob,emotion_label,irony_label,sentiment_label
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605.0,4838.0,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,...,0.018759,0.002608,0.014057,0.064718,0.156217,0.087268,0.007338,anticipation,False,negative
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532.0,25483.0,False,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...",...,0.189257,0.002931,0.123739,0.003583,0.00222,0.027095,0.023936,anticipation,False,positive
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332.0,924.0,False,2021-02-10 23:54:48,"Guys evening, I have read this article about B...",...,0.106353,0.002909,0.047832,0.003104,0.002245,0.043926,0.024446,anticipation,False,neutral
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129.0,14.0,False,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,...,0.626578,0.003508,0.301623,0.000384,0.000326,0.003269,0.013409,joy,False,positive
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472.0,10482.0,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,...,0.014121,0.001179,0.562736,0.024811,0.017232,0.005735,0.025889,optimism,False,positive


In [2]:
# Parse the 'date' column into timestamps, expose them as a new 'Date'
# column and promote that column to the index. The original 'date' column
# is left in the frame.
df = df.assign(Date=pd.to_datetime(df['date'])).set_index('Date')

# Show the first rows of the re-indexed frame
print(df.head())

                                                    user_name  \
Date                                                            
2021-02-10 23:59:04                             DeSota Wilson   
2021-02-10 23:58:48                                  CryptoND   
2021-02-10 23:54:48                                 Tdlmatias   
2021-02-10 23:54:33                      Crypto is the future   
2021-02-10 23:54:06  Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader   

                       user_location  \
Date                                   
2021-02-10 23:59:04      Atlanta, GA   
2021-02-10 23:58:48              NaN   
2021-02-10 23:54:48  London, England   
2021-02-10 23:54:33              NaN   
2021-02-10 23:54:06           Europa   

                                                      user_description  \
Date                                                                     
2021-02-10 23:59:04  Biz Consultant, real estate, fintech, startups...   
2021-02-10 23:58:48  😎 BITCOINLIVE is a D

In [3]:
# Discard the user metadata, raw text, per-emotion probabilities and label
# columns listed below; whatever is not listed stays in the frame.
df = df.drop(
    columns=[
        'user_name', 'user_location', 'user_description', 'user_created',
        'date', 'user_verified', 'text', 'hashtags', 'source',
        'irony_prob', 'anger_prob', 'anticipation_prob', 'disgust_prob',
        'fear_prob', 'joy_prob', 'love_prob', 'optimism_prob',
        'pessimism_prob', 'sadness_prob', 'surprise_prob', 'trust_prob',
        'emotion_label', 'irony_label', 'sentiment_label',
    ]
)
print(df.head())

                     user_followers  user_friends  user_favourites is_retweet  \
Date                                                                            
2021-02-10 23:59:04          8534.0        7605.0           4838.0      False   
2021-02-10 23:58:48          6769.0        1532.0          25483.0      False   
2021-02-10 23:54:48           128.0         332.0            924.0      False   
2021-02-10 23:54:33           625.0         129.0             14.0      False   
2021-02-10 23:54:06          1249.0        1472.0          10482.0      False   

                     negative_prob  neutral_prob  positive_prob  
Date                                                             
2021-02-10 23:59:04       0.592906      0.397914       0.009180  
2021-02-10 23:58:48       0.001663      0.143908       0.854429  
2021-02-10 23:54:48       0.004100      0.507072       0.488829  
2021-02-10 23:54:33       0.003636      0.106944       0.889420  
2021-02-10 23:54:06       0.008842  

In [4]:
def _tweet_score(positive, negative, followers, favourites, is_retweet):
    """Return the engagement-weighted sentiment score of a single tweet.

    The score is::

        (positive - negative)
        * int(followers)
        * (int(favourites) + 1) / int(followers + 1)
        * (int(is_retweet) + 1)

    The (positive - negative) factor emphasizes sentiments that are strongly
    positive or negative and de-emphasizes those closer to neutral.

    NOTE(review): int(followers) / int(followers + 1) is close to 1 for any
    sizeable account, so the follower count largely cancels out and the
    weight is dominated by (favourites + 1) -- confirm this is intended.

    Raises:
        ValueError, TypeError, OverflowError: a count is NaN, None, infinite
            or otherwise not convertible with int().
        ZeroDivisionError: int(followers + 1) is 0.
    """
    # Operand order is kept exactly as in the original expression so the
    # floating-point results do not change.
    return (
        (positive - negative)
        * int(followers)
        * ((int(favourites) + 1) / int(followers + 1))
        * (int(is_retweet) + 1)
    )


# Walk only the five columns the formula needs instead of materialising a
# Series per row with iterrows(). A missing column now raises KeyError here,
# up front, rather than being silently turned into an all-NaN score.
score_inputs = zip(
    df["positive_prob"],
    df["negative_prob"],
    df["user_followers"],
    df["user_favourites"],
    df["is_retweet"],
)

scores = []
for score_args in tqdm(score_inputs, total=df.shape[0], position=0, leave=True):
    try:
        scores.append(_tweet_score(*score_args))
    except (TypeError, ValueError, OverflowError, ZeroDivisionError):
        # Rows with unusable values get NaN. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt, so the loop can be stopped.
        scores.append(np.nan)
df["score"] = scores
df.head(2)

100%|██████████| 4850000/4850000 [02:29<00:00, 32344.22it/s]


Unnamed: 0_level_0,user_followers,user_friends,user_favourites,is_retweet,negative_prob,neutral_prob,positive_prob,score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-02-10 23:59:04,8534.0,7605.0,4838.0,False,0.592906,0.397914,0.00918,-2824.316957
2021-02-10 23:58:48,6769.0,1532.0,25483.0,False,0.001663,0.143908,0.854429,21728.683111


In [15]:
# Bucket the tweets into 1-hour bins once and reuse the same resampler for
# both the per-column means and the per-hour tweet count.
hourly_tweets = df.resample('1h')
tweets_grouped = hourly_tweets.agg({
    'user_followers': 'mean',
    'user_friends': 'mean',
    'user_favourites': 'mean',
    'negative_prob': 'mean',
    'neutral_prob': 'mean',
    'positive_prob': 'mean',
    'score': 'mean'
})
tweets_grouped['number_of_tweets'] = hourly_tweets.size()
# NOTE(review): hours without any tweet presumably show up with NaN means and
# a count of 0 -- confirm whether downstream steps expect them to be dropped.

# Keep the hours from 2021-02-06 through 2023-03-05 (label-based slice, both
# end dates included).
tweets_grouped = tweets_grouped.loc['2021-02-06':'2023-03-05']
tweets_grouped.head(5)

Unnamed: 0_level_0,user_followers,user_friends,user_favourites,negative_prob,neutral_prob,positive_prob,score,number_of_tweets
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-02-06 00:00:00,2436.357724,585.739837,3569.96748,0.043702,0.469991,0.486307,1654.25385,123
2021-02-06 01:00:00,5323.0625,825.770833,5073.534722,0.062348,0.525353,0.4123,1300.050434,144
2021-02-06 02:00:00,2270.540816,554.785714,3736.653061,0.05879,0.585356,0.355854,506.669396,98
2021-02-06 03:00:00,1852.410526,715.778947,4673.894737,0.023694,0.597381,0.378926,2287.709135,95
2021-02-06 04:00:00,7706.674699,934.240964,4727.939759,0.093185,0.487797,0.419018,1018.53173,83


In [16]:
# Load the price history and index it by the parsed 'timestamp' column.
bitcoin = pd.read_csv('bitcoin_2017_to_2023.csv')
bitcoin = bitcoin.assign(Date=pd.to_datetime(bitcoin['timestamp'])).set_index('Date')
print(bitcoin.columns)

# Discard the columns listed here; 'close', 'volume' and 'number_of_trades'
# are not listed and therefore remain in `bitcoin`.
bitcoin = bitcoin.drop(
    columns=[
        'timestamp', 'open', 'low', 'high',
        'quote_asset_volume', 'taker_buy_base_asset_volume',
        'taker_buy_quote_asset_volume',
    ]
)

# Hourly mean of the close price. 'volume' and 'number_of_trades' are
# intentionally not aggregated.
hourly_close = bitcoin.resample('1h').agg({'close': 'mean'})

# Keep the hours from 2021-02-06 through 2023-03-05 (both end dates included).
crypto_usd_grouped = hourly_close.loc['2021-02-06':'2023-03-05']
crypto_usd_grouped.head(5)

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume',
       'quote_asset_volume', 'number_of_trades', 'taker_buy_base_asset_volume',
       'taker_buy_quote_asset_volume'],
      dtype='object')


Unnamed: 0_level_0,close
Date,Unnamed: 1_level_1
2021-02-06 00:00:00,38528.798667
2021-02-06 01:00:00,38995.976333
2021-02-06 02:00:00,39094.879333
2021-02-06 03:00:00,39406.951167
2021-02-06 04:00:00,39207.041667


In [42]:
# Outer-join the hourly tweet aggregates with the hourly close price on their
# indexes, so an hour that appears in only one of the two frames is kept.
merged_df = tweets_grouped.merge(
    crypto_usd_grouped,
    how='outer',
    left_index=True,
    right_index=True,
)
print(merged_df.head(20))

# Persist the combined hourly table for later use.
merged_df.to_csv('merged_dataframe_1_hour_score.csv')

                     user_followers  user_friends  user_favourites  \
Date                                                                 
2021-02-06 00:00:00     2436.357724    585.739837      3569.967480   
2021-02-06 01:00:00     5323.062500    825.770833      5073.534722   
2021-02-06 02:00:00     2270.540816    554.785714      3736.653061   
2021-02-06 03:00:00     1852.410526    715.778947      4673.894737   
2021-02-06 04:00:00     7706.674699    934.240964      4727.939759   
2021-02-06 05:00:00     2109.095890    917.890411      5479.986301   
2021-02-06 06:00:00     1789.515464    628.546392      6884.948454   
2021-02-06 07:00:00     1803.870588    519.835294      3451.552941   
2021-02-06 08:00:00     1716.957831    696.204819      6803.162651   
2021-02-06 09:00:00     6322.009174    738.100917      3963.697248   
2021-02-06 10:00:00     2949.556962    734.170886     10190.316456   
2021-02-06 11:00:00     6412.036082   3027.659794      5041.577320   
2021-02-06 12:00:00 