In [7]:
import glob
import os
# import re

import nltk
import numpy as np
import pandas as pd

# nltk.download("stopwords")
# nltk.download("wordnet")
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [39]:
currency = [
    "BTC",
    #"ETH",
    #"USDT",
    # "XRP",
    #"BCH",
    #"ADA",
    #"BSV",
    #"LTC",
    #"LINK",
    #"BNB",
    #"EOS",
    #"TRON",
]

In [40]:
###############################################################################
# Number of records loaded for BTC 16955
# Number of records loaded for XRP 17622
# TOTAL: 16955 + 17622 = 34667
###############################################################################

class process_tweets():

    def __init__(self, tokenizer=None, stop_words=None, stemmer=None, lemmatizer=None):
        """ 
        Initialize the class.
        """
        self.path = Path(f'{os.getcwd()}')
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.stemmer = stemmer
        self.lemmatizer = lemmatizer

        self.df = []
        self.vocabulary = []
        self.final = []

    # Read tweets from CSV for every currency
    def read_tweets(self, curr):
        """
        Read the tweets from the CSV file.
        """
        #initialize the dataframe
        ret = []
        for file in glob.glob(f"{self.path}/*-{curr}*.csv"):
            ret = pd.concat([pd.read_csv(file)], ignore_index=True)
            ret['coin_type'] = curr

        self.df.append(ret)

    def clean_df(self):
        """
        Since I repeted the data mining multiple times, we expect duplicate of tweets.
        Keep the latess mined as the number of followers and retweets can chage.
        """
        for index in range(len(self.df)):
            self.df[index].sort_values(by=["mined_at"], inplace=True, ignore_index=True) 
            self.df[index].drop_duplicates(
                subset=["tweet_id"], inplace=True, keep="last", ignore_index=True
            )

    def process_ccy(self):
        """
        Processes the currency data.
        """
        count = 0
        for index, curr in enumerate(self.df):
            ret = []
            for line in self.df[index]['text']:
                # Initialize stem_text
                stem_text = ""
                for word in self.tokenizer.tokenize(line):
                    count += 1
                    word_lower = word.lower()
                    if word not in self.stop_words:
                        if self.lemmatizer:
                            word_lemmatized = self.lemmatizer.lemmatize(word_lower)
                            stem_text += word_lemmatized + " "
                            self.vocabulary.append(word_lemmatized)
                # Append the stemmed text to the list
                ret.append(stem_text)
            # Create a new column with the stemmed text
            self.df[index]['text_clean'] = np.array(ret)

        print(f"Number of words: {count}")
        print(f"Number of unique words: {len(self.vocabulary)}")

    def getSentiment(self, tweet) -> list:
        """
        Get the sentiment of the tweet.
        """
        analysis = TextBlob(tweet)
        # analyser = SentimentIntensityAnalyzer()
        # sent = analyser.polarity_scores(analysis)

        return analysis.sentiment.polarity
        # return [sent['neg'], sent['neu'], sent['pos'], sent['compound']]

In [41]:
processed = process_tweets(RegexpTokenizer(r'\w+'), 
                           stop_words=stopwords.words('english'), 
                           stemmer=SnowballStemmer("english"), 
                           lemmatizer=WordNetLemmatizer()
                           )
# Read data and concatenate to dataframe
for curr in currency:
    processed.read_tweets(curr)

processed.clean_df()
processed.process_ccy()

Number of words: 388596
Number of unique words: 306797


In [42]:
# processed.df[0]['text'][:10]
# processed.df[0]['text_clean'][:10]

# tweet_id,name,screen_name,retweet_count,text,mined_at,created_at,favourite_count,
# hashtags,status_count,followers_count,location,source_device,retweet_text
for index in range(len(processed.df)):
    # Drop columns that are not needed
    processed.df[index].drop(columns=['tweet_id', 'name', 'screen_name', 'mined_at',                                      'retweet_count', 'favourite_count', 'hashtags', 
                                      'status_count', 'followers_count', 'location', 
                                      'source_device', 'retweet_text'], inplace=True)

In [43]:
for index in range(len(processed.df)):
    processed.df[index]['sentiment'] = processed.df[index]['text_clean'].apply(processed.getSentiment)

# Print head of dataframe
print(processed.df[0].head())

                                                text  \
0  #NEO #btc #ETH \nPullback to the broken resist...   
1  RT @LuckyCorgis: Hey, it's giveaway time!! We'...   
2  RT @ICOAnnouncement: 🎙 New Project\n\n🔷 SolarD...   
3  Feeling like it could finally be my turn to ha...   
4  RT @Blockchainsanta: I mean, it is kinda true ...   

                  created_at coin_type  \
0  2021-11-14 17:22:12+00:00       BTC   
1  2021-11-14 17:22:11+00:00       BTC   
2  2021-11-14 17:22:09+00:00       BTC   
3  2021-11-14 17:22:08+00:00       BTC   
4  2021-11-14 17:22:08+00:00       BTC   

                                          text_clean  sentiment  
0  neo btc eth pullback broken resistance http co...  -0.400000  
1  rt luckycorgis hey giveaway time we giving awa...   0.333333  
2  rt icoannouncement new project solardex solar ...   0.136364  
3  feeling like could finally turn gotten early m...   0.050000  
4  rt blockchainsanta i mean kinda true btc btc h...   0.018750  


In [44]:
# update time format to only take the date
for index in range(len(processed.df)):
    processed.df[index]['date'] = processed.df[index]['created_at'].apply(lambda x: x[:10])
    processed.df[index]['time'] = processed.df[index]['created_at'].apply(lambda x: x[11:19])
    processed.df[index].drop(columns=['created_at'], inplace=True)
    processed.df[index].drop(columns=['text'], inplace=True)
    
print(processed.df[0].head())

  coin_type                                         text_clean  sentiment  \
0       BTC  neo btc eth pullback broken resistance http co...  -0.400000   
1       BTC  rt luckycorgis hey giveaway time we giving awa...   0.333333   
2       BTC  rt icoannouncement new project solardex solar ...   0.136364   
3       BTC  feeling like could finally turn gotten early m...   0.050000   
4       BTC  rt blockchainsanta i mean kinda true btc btc h...   0.018750   

         date      time  
0  2021-11-14  17:22:12  
1  2021-11-14  17:22:11  
2  2021-11-14  17:22:09  
3  2021-11-14  17:22:08  
4  2021-11-14  17:22:08  


### Start working with Bitcoin Market price

In [56]:
from datetime import datetime

def convert_date(x):
    try:
        return datetime.strptime(x, '%Y-%m-%d').date()
    except:
        return None
    # return datetime.strptime(x, '%Y-%m-%d')

ret = []
for curr in currency:
    for file in glob.glob(f"{processed.path}/2022_dataset/{curr}*-Market_Price-*.csv"):
        ret = pd.concat([pd.read_csv(file)], ignore_index=True)
        ret['coin_type'] = curr

# Process data to only include the date and time
ret['time'] = ret['date'].apply(lambda x: x[11:19])
ret['date'] = ret['date'].apply(lambda x: x[:10])

# Change the format of date in ret from dd/mm/yyyy to yyyy-mm-dd
ret['date'] = ret['date'].apply(lambda x: datetime.strptime(x, "%d/%m/%Y").strftime("%Y-%m-%d"))

print(ret.head())

         date     close coin_type   time
0  2022-01-04  46296.06       BTC  00:00
1  2022-01-03  46446.10       BTC  23:00
2  2022-01-03  46217.90       BTC  22:00
3  2022-01-03  45979.01       BTC  21:00
4  2022-01-03  45922.00       BTC  20:00


In [57]:
# Change the order of columns 
market_coin = ret[['coin_type', 'date', 'time', 'close']]
# Change name of column 'close' to 'price'
market_coin.rename(columns={'close': 'price'}, inplace=True)

for index in range(len(processed.df)):
    processed.df[index] = processed.df[index][['coin_type', 'text_clean', 'date', 'time', 'sentiment']]

### Truncating dataframe by hour and then grouping them by hour