In [54]:
import numpy as np
import pandas as pd
import json
import datetime
import spacy
from tqdm import tqdm
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.express as px
from coins import get_top_coins


I am opening a JSON file located at './Telegram_Data/result.json' and loading its contents using the JSON module. Then, I am iterating through the keys in the JSON data and printing each key to the console. This code allows me to inspect the keys in the JSON data.

In [55]:
f = open('./Telegram_Data/result.json', encoding='utf8')
data = json.load(f)
for key in data.keys():
    print(key)

name
type
id
messages


In this code cell, I am creating a Pandas DataFrame called 'df' by extracting data from the 'messages' key within the previously loaded JSON data. I limit the DataFrame to the first 2000 rows using the .iloc method. Then, I use the info() method to display information about the DataFrame, such as the data types and non-null counts of each column. Finally, I use the head() method to display the first few rows of the DataFrame for a quick overview of its contents.

In [56]:
df = pd.DataFrame(data['messages'])
df = df.iloc[0:10000]
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10000 non-null  int64  
 1   type                 10000 non-null  object 
 2   date                 10000 non-null  object 
 3   from                 9060 non-null   object 
 4   from_id              9986 non-null   object 
 5   reply_to_message_id  5262 non-null   float64
 6   text                 10000 non-null  object 
 7   edited               217 non-null    object 
 8   file                 11 non-null     object 
 9   thumbnail            11 non-null     object 
 10  media_type           11 non-null     object 
 11  sticker_emoji        5 non-null      object 
 12  width                35 non-null     float64
 13  height               35 non-null     float64
 14  forwarded_from       40 non-null     object 
 15  actor                12 non-null     

Unnamed: 0,id,type,date,from,from_id,reply_to_message_id,text,edited,file,thumbnail,...,actor_id,action,members,photo,message_id,contact_information,contact_vcard,mime_type,duration_seconds,via_bot
0,1903163,message,2021-05-01T00:00:14,Seflipe,user1697045867,1903101.0,Double check the address,,,,...,,,,,,,,,,
1,1903164,message,2021-05-01T00:01:33,Seflipe,user1697045867,1903138.0,To get help to understand the product not how ...,,,,...,,,,,,,,,,
2,1903168,message,2021-05-01T00:03:13,Seflipe,user1697045867,1903122.0,You can find good youtube channels.. Check the...,,,,...,,,,,,,,,,
3,1903188,message,2021-05-01T00:13:37,Neto | Crypto.com,user1395411322,1903160.0,"Are you doing withdrawals to another user, DeF...",,,,...,,,,,,,,,,
4,1903191,message,2021-05-01T00:14:10,PauLaLa,user1501886330,,Yes,,,,...,,,,,,,,,,


I'm setting up a language detection component in a spaCy language model. This component allows the model to detect the language of text. It involves defining a function for the language detector, loading a spaCy model for English, and then adding the language detector to the spaCy pipeline. en_core_web_sm model in spacy is a pipe in which many preprocesses are applied (Tokenization, Part-of-Speech Tagging, Dependency Parsing, Named Entity Recognition (NER), Lemmatization, Stop Word Removal)

In [57]:
def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load("en_core_web_sm")

if not "language_detector" in Language.factories:
    Language.factory("language_detector", func=get_lang_detector)

nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x21b604fbdc0>

I define a function called extract_features that processes text using spaCy, extracts the detected language, and identifies named entities.
I then apply this function to each text entry in the DataFrame df using a list comprehension and the tqdm library.
The results are stored in the 'language' and 'entities' and 'lemmatized_text' columns of the DataFrame.

In [58]:
def extract_features(x):
    doc = nlp(x)
    lang_dict = doc._.language
    language = lang_dict['language']
    entities = [ent.text for ent in doc.ents]
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return language, entities, lemmatized_text

df['language'], df['entities'], df['lemmatized_text'] = zip(*[extract_features(str(x)) for x in tqdm(df['text'])])

100%|██████████| 10000/10000 [02:02<00:00, 81.50it/s]


In [59]:
df[['text', 'language','entities','lemmatized_text']]

Unnamed: 0,text,language,entities,lemmatized_text
0,Double check the address,en,[],double check the address
1,To get help to understand the product not how ...,en,[],to get help to understand the product not how ...
2,You can find good youtube channels.. Check the...,en,[],you can find good youtube channel .. check the...
3,"Are you doing withdrawals to another user, DeF...",en,"[DeFi Wallet, External]","be you do withdrawal to another user , DeFi Wa..."
4,Yes,tr,[],yes
...,...,...,...,...
9995,??,UNKNOWN,[],? ?
9996,Any help ??,en,[],any help ? ?
9997,Will CRO hit 1$ 🎯..?,en,"[CRO, 1$ 🎯]",will CRO hit 1 $ 🎯 .. ?
9998,//rules,ro,[],//rules


In [60]:
df.language.unique()

array(['en', 'tr', 'it', 'es', 'UNKNOWN', 'nl', 'fi', 'ro', 'sv', 'no',
       'hu', 'fr', 'ca', 'tl', 'da', 'cy', 'so', 'vi', 'af', 'sl', 'sk',
       'sw', 'pl', 'id', 'pt', 'de', 'hr', 'et', 'sq', 'cs', 'lt', 'ar',
       'lv'], dtype=object)

 I'm using a custom function, get_top_coins, to retrieve information about the top 20 coins from the Coinranking API. Then, I'm counting how many times the names and symbols of these coins appear in a text dataset. The code identifies the top 5 coins with the most text mentions and presents them in a DataFrame for further analysis. The purpose is to find the most discussed coins in the text data.

In [61]:
top_coins = get_top_coins(20)

related_counts = {}

for coin in top_coins:
    name_matches = df_new['text'].str.lower().str.contains(coin['name'].lower()).sum()
    symbol_matches = df_new['text'].str.lower().str.contains(coin['symbol'].lower()).sum()
    total_matches = name_matches + symbol_matches
    related_counts[coin['name']] = total_matches

top_5_coins = dict(sorted(related_counts.items(), key=lambda item: item[1], reverse=True)[:5])

top_5_df = pd.DataFrame(top_5_coins.items(), columns=['Coin Name', 'Related Text Count'])

print(top_5_df)

  Coin Name  Related Text Count
0   Bitcoin                  24
1  Dogecoin                  14
2  Ethereum                   2
3       XRP                   2
4      USDC                   2


filter and prepare a dataset for analysis. It starts by selecting English-language text entries and then narrows it down further to include only rows where the text mentions specific cryptocurrencies (Bitcoin, Dogecoin, or BTC which are the tope coins in the above code) and has a minimum length. This filtered dataset is then ready for further analysis, focusing on discussions related to these cryptocurrencies in English text

In [62]:
df_eng = df[df.language.values == 'en']
df_new = df_eng.filter(items = ['id','text','lemmatized_text', 'date'])
df_new = df_new[df_new['text'].str.lower().str.contains('bitcoin|doge|btc') & (df_new['text'].str.len() > 1)]

In [63]:
df_new.head()

Unnamed: 0,id,text,lemmatized_text,date
36,1903358,Bitcoin,bitcoin,2021-05-01T01:04:38
118,1903552,"Thank you! I was just wondering, after factori...","thank you ! I be just wonder , after factor th...",2021-05-01T01:54:32
119,1903555,"The 0.0004 BTC withdrawal fee is flat, and the...","the 0.0004 BTC withdrawal fee be flat , and th...",2021-05-01T01:55:37
123,1903565,For example let’s say I have $1000sgd worth of...,for example let ’s say I have $ 1000sgd worth ...,2021-05-01T02:02:14
125,1903568,Is it the same amount of BTC still?\n\nMinus t...,be it the same amount of BTC still ? \n\n Minu...,2021-05-01T02:03:15


This code defines a function, Vader_senti, that calculates sentiment scores (Negative, Neutral, Positive, and Compound) for text messages using the VADER sentiment analysis tool. Then, it applies this function to each text entry in the 'text' column of the DataFrame df_new and stores the sentiment scores in new columns ('vader_neg', 'vader_neu', 'vader_pos', 'vader_compound') in the same DataFrame. This allows for sentiment analysis of the text data.

In [64]:
sid_obj = SentimentIntensityAnalyzer()

In [65]:
def Vader_senti(x):
    """
    Function to calculate the sentiment of the message x.
    Returns the probability of a given input sentence to be Negative, Neutral, Positive and Compound score.
    
    """
    scores = sid_obj.polarity_scores(x)
    return scores['neg'],scores['neu'],scores['pos'],scores['compound']

df_new[['vader_neg','vader_neu','vader_pos','vader_compound']] = [Vader_senti(x) for x in tqdm(df_new['text'])]
df_new[['vader_neg_lemma','vader_neu_lemma','vader_pos_lemma','vader_compound_lemma']] = [Vader_senti(x) for x in tqdm(df_new['lemmatized_text'])]

100%|██████████| 277/277 [00:00<00:00, 8982.63it/s]
100%|██████████| 277/277 [00:00<00:00, 7855.62it/s]


In [66]:
df_new.head()

Unnamed: 0,id,text,lemmatized_text,date,vader_neg,vader_neu,vader_pos,vader_compound,vader_neg_lemma,vader_neu_lemma,vader_pos_lemma,vader_compound_lemma
36,1903358,Bitcoin,bitcoin,2021-05-01T01:04:38,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
118,1903552,"Thank you! I was just wondering, after factori...","thank you ! I be just wonder , after factor th...",2021-05-01T01:54:32,0.044,0.791,0.165,0.7312,0.028,0.81,0.163,0.7815
119,1903555,"The 0.0004 BTC withdrawal fee is flat, and the...","the 0.0004 BTC withdrawal fee be flat , and th...",2021-05-01T01:55:37,0.073,0.891,0.036,-0.2732,0.066,0.868,0.066,-0.25
123,1903565,For example let’s say I have $1000sgd worth of...,for example let ’s say I have $ 1000sgd worth ...,2021-05-01T02:02:14,0.091,0.741,0.169,-0.0258,0.07,0.795,0.136,-0.0
125,1903568,Is it the same amount of BTC still?\n\nMinus t...,be it the same amount of BTC still ? \n\n Minu...,2021-05-01T02:03:15,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


extract date from the initial dataset

In [67]:
def parse_date(x):
    date_time_obj = datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S')
    return date_time_obj.date()

df_new['Day'] = [parse_date(x) for x in tqdm(df_new['date'])]

100%|██████████| 277/277 [00:00<00:00, 55414.59it/s]



This code performs the following:

It filters rows in the DataFrame df_new to create two new DataFrames:

df_btc containing rows that mention 'btc' or 'bitcoin.'
df_doge containing rows that mention 'doge.'
It calculates the average sentiment score for the 'vader_compound' column in the df_doge and df_btc DataFrames, grouped by the 'Day' column. It also counts the number of entries in each group.

The code then renames the columns for clarity and prints the results in two separate tables for 'doge' and 'btc' mentions, showing the average sentiment and the count of mentions for each day.

In [69]:
# Filter rows containing 'bitcoin'
df_btc = df_new[df_new['text'].str.lower().str.contains('btc|bitcoin')]

# Filter rows containing 'doge'
df_doge = df_new[df_new['text'].str.lower().str.contains('doge')]
avg_sent = df_doge.groupby(['Day']).agg({'vader_compound' : ['mean', 'count']})
avg_sent.columns = ['_'.join(str(i) for i in col) for col in avg_sent.columns]
avg_sent.reset_index(inplace=True)
avg_sent = avg_sent.rename(columns={'vader_compound_mean': 'doge_mean', 'vader_compound_count': 'doge_count'})
print(avg_sent)

print("-" * 50)
avg_sent2 = df_btc.groupby(['Day']).agg({'vader_compound' : ['mean', 'count']})
avg_sent2.columns = ['_'.join(str(i) for i in col) for col in avg_sent2.columns]
avg_sent2.reset_index(inplace=True)
avg_sent2 = avg_sent2.rename(columns={'vader_compound_mean': 'btc_mean', 'vader_compound_count': 'btc_count'})
print(avg_sent2)

          Day  doge_mean  doge_count
0  2021-05-01   0.231837          30
1  2021-05-02   0.027225           8
2  2021-05-03   0.055925          24
3  2021-05-04   0.074008          66
4  2021-05-05   0.161317          36
--------------------------------------------------
          Day  btc_mean  btc_count
0  2021-05-01  0.197216         32
1  2021-05-02  0.222114         22
2  2021-05-03  0.145005         21
3  2021-05-04  0.116618         33
4  2021-05-05  0.204650         12


Do the exact same thing for lemmatized text

In [78]:
# Filter rows containing 'bitcoin'
df_btc = df_new[df_new['lemmatized_text'].str.lower().str.contains('btc|bitcoin')]

# Filter rows containing 'doge'
df_doge = df_new[df_new['lemmatized_text'].str.lower().str.contains('doge')]
avg_sent_lemma = df_doge.groupby(['Day']).agg({'vader_compound_lemma' : ['mean', 'count']})
avg_sent_lemma.columns = ['_'.join(str(i) for i in col) for col in avg_sent_lemma.columns]
avg_sent_lemma.reset_index(inplace=True)
avg_sent_lemma = avg_sent_lemma.rename(columns={'vader_compound_lemma_mean': 'doge_mean', 'vader_compound_lemma_count': 'doge_count'})
print(avg_sent_lemma)

print("-" * 50)
avg_sent2_lemma = df_btc.groupby(['Day']).agg({'vader_compound_lemma' : ['mean', 'count']})
avg_sent2_lemma.columns = ['_'.join(str(i) for i in col) for col in avg_sent2_lemma.columns]
avg_sent2_lemma.reset_index(inplace=True)
avg_sent2_lemma = avg_sent2_lemma.rename(columns={'vader_compound_lemma_mean': 'btc_mean', 'vader_compound_lemma_count': 'btc_count'})
print(avg_sent2_lemma)

          Day  doge_mean  doge_count
0  2021-05-01   0.221247          30
1  2021-05-02   0.034937           8
2  2021-05-03   0.068312          24
3  2021-05-04   0.071611          66
4  2021-05-05   0.142472          36
--------------------------------------------------
          Day  btc_mean  btc_count
0  2021-05-01  0.271297         32
1  2021-05-02  0.250600         22
2  2021-05-03  0.135481         21
3  2021-05-04  0.123200         33
4  2021-05-05  0.202883         12


I'm adding a 'Sentiment' column to the DataFrames avg_sent and avg_sent2. The 'Sentiment' column classifies each day's sentiment as "Negative" if the average sentiment score is less than 0 and as "Positive" if it's greater than or equal to 0. This categorizes sentiment results into two simple categories: "Negative" and "Positive."

In [79]:
avg_sent['Sentiment'] = ["Negative" if x<0 else "Positive"
                         for x in avg_sent['doge_mean']]
avg_sent2['Sentiment'] = ["Negative" if x<0 else "Positive"
                         for x in avg_sent2['btc_mean']]

avg_sent_lemma['Sentiment'] = ["Negative" if x<0 else "Positive"
                         for x in avg_sent_lemma['doge_mean']]
avg_sent2_lemma['Sentiment'] = ["Negative" if x<0 else "Positive"
                         for x in avg_sent2_lemma['btc_mean']]

an output for showing the message count related to each coin in each day

In [80]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2)

fig.add_trace(px.histogram(avg_sent, x="Day", y="doge_count", color="Sentiment", nbins=20).data[0], row=1, col=1)

fig.add_trace(px.histogram(avg_sent2, x="Day", y="btc_count", color="Sentiment", nbins=20).data[0], row=1, col=2)

fig.update_layout(
    title_text='Vader Sentiment Analysis Results',
    bargap=0.2, 
    bargroupgap=0.1 
)

fig.update_xaxes(title_text="Doge Coin", row=1, col=1)
fig.update_xaxes(title_text="Bitcoin", row=1, col=2)

fig.show()

for lemmatized text

In [81]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2)

fig.add_trace(px.histogram(avg_sent_lemma, x="Day", y="doge_count", color="Sentiment", nbins=20).data[0], row=1, col=1)

fig.add_trace(px.histogram(avg_sent2_lemma, x="Day", y="btc_count", color="Sentiment", nbins=20).data[0], row=1, col=2)

fig.update_layout(
    title_text='Vader Sentiment Analysis Results',
    bargap=0.2, 
    bargroupgap=0.1 
)

fig.update_xaxes(title_text="Doge Coin", row=1, col=1)
fig.update_xaxes(title_text="Bitcoin", row=1, col=2)

fig.show()

overall, lemmatized text act better and more logical in sentiment vader analysis, however the charts illustrate a roughly similar figures.