In [134]:
import glob
import os
# import re

import nltk
import numpy as np
import pandas as pd

# nltk.download("stopwords")
# nltk.download("wordnet")
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from textblob import TextBlob

In [11]:
currency = [
    "BTC",
    #"ETH",
    #"USDT",
    "XRP",
    #"BCH",
    #"ADA",
    #"BSV",
    #"LTC",
    #"LINK",
    #"BNB",
    #"EOS",
    #"TRON",
]

In [156]:
###############################################################################
# Number of records loaded for BTC 16955
# Number of records loaded for XRP 17622
# TOTAL: 16955 + 17622 = 34667
###############################################################################

class process_tweets():

    def __init__(self, tokenizer=None, stop_words=None, stemmer=None, lemmatizer=None):
        """ 
        Initialize the class.
        """
        self.path = Path(f'{os.getcwd()}')
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.stemmer = stemmer
        self.lemmatizer = lemmatizer

        self.df = []
        self.vocabulary = []
        # self.sentiment = []

    # Read tweets from CSV for every currency
    def read_tweets(self, curr):
        """
        Read the tweets from the CSV file.
        """
        #initialize the dataframe
        ret = []
        for file in glob.glob(f"{self.path}/*-{curr}*.csv"):
            ret = pd.concat([pd.read_csv(file)], ignore_index=True)
            ret['coin_type'] = curr

        self.df.append(ret)

    def clean_df(self):
        """
        Since I repeted the data mining multiple times, we expect duplicate of tweets.
        Keep the latess mined as the number of followers and retweets can chage.
        """
        for index in range(len(self.df)):
            self.df[index].sort_values(by=["mined_at"], inplace=True, ignore_index=True) 
            self.df[index].drop_duplicates(
                subset=["tweet_id"], inplace=True, keep="last", ignore_index=True
            )

    def process_ccy(self):
        """
        Processes the currency data.
        """
        count = 0
        for index, curr in enumerate(self.df):
            ret = []
            for line in self.df[index]['text']:
                # Initialize stem_text
                stem_text = ""
                for word in self.tokenizer.tokenize(line):
                    count += 1
                    word_lower = word.lower()
                    if word not in self.stop_words:
                        if self.lemmatizer:
                            word_lemmatized = self.lemmatizer.lemmatize(word_lower)
                            stem_text += word_lemmatized + " "
                            self.vocabulary.append(word_lemmatized)
                # Append the stemmed text to the list
                ret.append(stem_text)
            # Create a new column with the stemmed text
            self.df[index]['text_clean'] = np.array(ret)

        print(f"Number of words: {count}")
        print(f"Number of unique words: {len(self.vocabulary)}")
    
    def getSentiment(self, tweet):
        """
        Get the sentiment of the tweet.
        """
        analysis = TextBlob(tweet)

        return analysis.sentiment.polarity

In [157]:
processed = process_tweets(RegexpTokenizer(r'\w+'), 
                           stop_words=stopwords.words('english'), 
                           stemmer=SnowballStemmer("english"), 
                           lemmatizer=WordNetLemmatizer()
                           )
# Read data and concatenate to dataframe
for curr in currency:
    processed.read_tweets(curr)

processed.clean_df()
processed.process_ccy()

Number of words: 769267
Number of unique words: 591040


In [150]:
# processed.df[0]['text'][:10]
# processed.df[0]['text_clean'][:10]

In [158]:
for index in range(len(processed.df)):
    processed.df[index]['sentiment'] = processed.df[index]['text_clean'].apply(processed.getSentiment)

# Print head of dataframe
print(processed.df[0].head())


              tweet_id                               name      screen_name  \
0  1459934703109218308  codax.investing〽️                  codax_investing   
1  1459934698927538181  Dhekacok 🐃,🐋 #SOL #TORG #TORGARMY  dikachok          
2  1459934690178220038  Mohammad borchlo                   borchlo           
3  1459934686801801218  fs                                 lmjsara           
4  1459934686743040001  Filip                              Filip_3233        

   retweet_count  \
0  0               
1  675             
2  4586            
3  0               
4  694             

                                                                                                                                                                                                                                                                                          text  \
0  #NEO #btc #ETH \nPullback to the broken resistance https://t.co/DiQ3eJIRg6                                           