In [2]:
import datetime
import json

import numpy as np
import pandas as pd
import plotly.express as px
import spacy
from plotly.subplots import make_subplots
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [3]:
# Read the exported chat JSON and list its top-level keys.
# The context manager closes the file handle as soon as parsing is done
# (previously the handle stayed open for the lifetime of the kernel).
with open('./Telegram_Data/result.json', encoding='utf8') as f:
    data = json.load(f)
for key in data.keys():
    print(key)

name
type
id
messages


In [4]:
# Build a frame from the exported messages and keep only the first 10,000 rows.
df = pd.DataFrame(data['messages']).iloc[0:10000]
# Column overview first, then a preview of the leading rows as the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10000 non-null  int64  
 1   type                 10000 non-null  object 
 2   date                 10000 non-null  object 
 3   from                 9060 non-null   object 
 4   from_id              9986 non-null   object 
 5   reply_to_message_id  5262 non-null   float64
 6   text                 10000 non-null  object 
 7   edited               217 non-null    object 
 8   file                 11 non-null     object 
 9   thumbnail            11 non-null     object 
 10  media_type           11 non-null     object 
 11  sticker_emoji        5 non-null      object 
 12  width                35 non-null     float64
 13  height               35 non-null     float64
 14  forwarded_from       40 non-null     object 
 15  actor                12 non-null     

Unnamed: 0,id,type,date,from,from_id,reply_to_message_id,text,edited,file,thumbnail,...,actor_id,action,members,photo,message_id,contact_information,contact_vcard,mime_type,duration_seconds,via_bot
0,1903163,message,2021-05-01T00:00:14,Seflipe,user1697045867,1903101.0,Double check the address,,,,...,,,,,,,,,,
1,1903164,message,2021-05-01T00:01:33,Seflipe,user1697045867,1903138.0,To get help to understand the product not how ...,,,,...,,,,,,,,,,
2,1903168,message,2021-05-01T00:03:13,Seflipe,user1697045867,1903122.0,You can find good youtube channels.. Check the...,,,,...,,,,,,,,,,
3,1903188,message,2021-05-01T00:13:37,Neto | Crypto.com,user1395411322,1903160.0,"Are you doing withdrawals to another user, DeF...",,,,...,,,,,,,,,,
4,1903191,message,2021-05-01T00:14:10,PauLaLa,user1501886330,,Yes,,,,...,,,,,,,,,,


In [5]:
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """spaCy component factory that builds a fresh ``LanguageDetector``.

    ``nlp`` and ``name`` are accepted but not used here.
    """
    return LanguageDetector()


# Load the `en_core_web_sm` pipeline and append the detector as its final component.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x2b421aeb070>

In [6]:
def detect_lang(x):
    """Return the language code the pipeline's detector assigns to text ``x``.

    Runs ``x`` through the module-level ``nlp`` pipeline and reads the
    ``'language'`` entry of the resulting ``doc._.language`` mapping.
    """
    return nlp(x)._.language['language']
# Tag every message with its detected language; each value is passed through str() first.
df['language'] = list(map(detect_lang, map(str, tqdm(df['text']))))

100%|██████████| 10000/10000 [09:18<00:00, 17.90it/s]


In [7]:
# Preview the message text next to its detected language.
df.loc[:, ['text', 'language']].head()

Unnamed: 0,text,language
0,Double check the address,en
1,To get help to understand the product not how ...,en
2,You can find good youtube channels.. Check the...,en
3,"Are you doing withdrawals to another user, DeF...",en
4,Yes,tr


In [8]:
# Distinct language codes produced by the detector.
df['language'].unique()

array(['en', 'tr', 'es', 'UNKNOWN', 'nl', 'fi', 'ro', 'sv', 'hu', 'ca',
       'it', 'tl', 'da', 'no', 'fr', 'cy', 'so', 'vi', 'af', 'sk', 'sw',
       'pl', 'id', 'pt', 'de', 'hr', 'et', 'sq', 'sl', 'lt', 'cs', 'ar',
       'lv'], dtype=object)

In [9]:
# English-only messages.
df_eng = df[df['language'] == 'en']

# Of those, keep the ones mentioning a tracked coin (case-insensitive via lower()).
# na=False makes explicit that rows whose text is not a plain string never match.
# No separate length check is needed: any string matching the pattern has at
# least three characters.
mentions_coin = df_eng['text'].str.lower().str.contains('bitcoin|doge|btc', na=False)

# Explicit copy: later cells add columns to df_new, and those writes should go
# to an independent frame rather than a slice derived from df.
df_new = df_eng.loc[mentions_coin, ['id', 'text', 'date']].copy()

In [10]:
# Preview the first rows of the filtered frame.
df_new.head()

Unnamed: 0,id,text,date
36,1903358,Bitcoin,2021-05-01T01:04:38
118,1903552,"Thank you! I was just wondering, after factori...",2021-05-01T01:54:32
119,1903555,"The 0.0004 BTC withdrawal fee is flat, and the...",2021-05-01T01:55:37
123,1903565,For example let’s say I have $1000sgd worth of...,2021-05-01T02:02:14
125,1903568,Is it the same amount of BTC still?\n\nMinus t...,2021-05-01T02:03:15
...,...,...,...
9890,1933160,"Wont be until larger bear, or big money flows ...",2021-05-05T05:18:55
9893,1933167,"If you want to buy to make money, i think it'...",2021-05-05T05:20:49
9897,1933174,"Just look at doge/btc chart week/monthly, and ...",2021-05-05T05:21:50
9952,1933337,should i invest in bitcoin or dogecoin,2021-05-05T05:41:38


In [12]:
# Single analyzer instance; Vader_senti reads it as a module-level name.
sid_obj = SentimentIntensityAnalyzer()

In [13]:
def Vader_senti(x):
    """Score text ``x`` with the module-level ``sid_obj`` analyzer.

    Returns a 4-tuple ``(neg, neu, pos, compound)`` read from the dict
    produced by ``sid_obj.polarity_scores``.
    """
    polarity = sid_obj.polarity_scores(x)
    return tuple(polarity[key] for key in ('neg', 'neu', 'pos', 'compound'))

# Score each message and spread the four returned values across their own columns.
df_new[['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']] = list(
    map(Vader_senti, tqdm(df_new['text']))
)

100%|██████████| 277/277 [00:00<00:00, 2257.45it/s]


In [15]:
# Preview the frame again, now including the vader_* score columns.
df_new.head()

Unnamed: 0,id,text,date,vader_neg,vader_neu,vader_pos,vader_compound
36,1903358,Bitcoin,2021-05-01T01:04:38,0.0,1.0,0.0,0.0
118,1903552,"Thank you! I was just wondering, after factori...",2021-05-01T01:54:32,0.044,0.791,0.165,0.7312
119,1903555,"The 0.0004 BTC withdrawal fee is flat, and the...",2021-05-01T01:55:37,0.073,0.891,0.036,-0.2732
123,1903565,For example let’s say I have $1000sgd worth of...,2021-05-01T02:02:14,0.091,0.741,0.169,-0.0258
125,1903568,Is it the same amount of BTC still?\n\nMinus t...,2021-05-01T02:03:15,0.0,1.0,0.0,0.0


In [19]:
def parse_date(x):
    """
    Convert a ``YYYY-MM-DDTHH:MM:SS`` timestamp string into its calendar date.

    The time-of-day part is parsed and then discarded. ``strptime`` raises
    ``ValueError`` when ``x`` does not match the expected format.
    """
    return datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S').date()

# Derive the calendar day of every message from its timestamp string.
df_new['Day'] = list(map(parse_date, tqdm(df_new['date'])))

100%|██████████| 277/277 [00:00<00:00, 30772.67it/s]


In [21]:
def _daily_sentiment(messages, prefix):
    """Aggregate compound sentiment scores per day.

    Parameters
    ----------
    messages : pd.DataFrame
        Frame with a 'Day' column and a 'vader_compound' column.
    prefix : str
        Label used to name the output columns.

    Returns
    -------
    pd.DataFrame
        One row per distinct 'Day' (default integer index) with the columns
        'Day', '<prefix>_mean' (mean of 'vader_compound') and
        '<prefix>_count' (number of non-null 'vader_compound' values).
    """
    # Named aggregation yields flat, final column names directly, so no
    # MultiIndex flattening, in-place reset_index or rename is needed.
    return messages.groupby('Day', as_index=False).agg(**{
        f'{prefix}_mean': ('vader_compound', 'mean'),
        f'{prefix}_count': ('vader_compound', 'count'),
    })


# Messages mentioning Bitcoin ('btc' or 'bitcoin', case-insensitive).
df_btc = df_new[df_new['text'].str.lower().str.contains('btc|bitcoin')]

# Messages mentioning Dogecoin ('doge', case-insensitive).
df_doge = df_new[df_new['text'].str.lower().str.contains('doge')]

avg_sent = _daily_sentiment(df_doge, 'doge')
print(avg_sent)

print("-" * 50)
avg_sent2 = _daily_sentiment(df_btc, 'btc')
print(avg_sent2)

          Day  doge_mean  doge_count
0  2021-05-01   0.224358          31
1  2021-05-02   0.020086           7
2  2021-05-03   0.055925          24
3  2021-05-04   0.074008          66
4  2021-05-05   0.165274          35
--------------------------------------------------
          Day  btc_mean  btc_count
0  2021-05-01  0.209533         33
1  2021-05-02  0.222114         22
2  2021-05-03  0.145005         21
3  2021-05-04  0.116618         33
4  2021-05-05  0.204650         12


In [23]:
# Label each day by the sign of its mean compound score: strictly below zero
# is "Negative", everything else (including exactly zero) is "Positive".
for daily_frame, mean_column in ((avg_sent, 'doge_mean'), (avg_sent2, 'btc_mean')):
    daily_frame['Sentiment'] = [
        "Negative" if score < 0 else "Positive"
        for score in daily_frame[mean_column]
    ]

In [38]:
# Side-by-side daily message counts per coin, coloured by the day's Sentiment label.
fig = make_subplots(rows=1, cols=2)

# Fixed colours so a given label looks the same in both panels, regardless of
# which label happens to appear first in each frame.
sentiment_colors = {"Positive": "#2ca02c", "Negative": "#d62728"}

panels = [
    (avg_sent, "doge_count", "Doge Coin"),
    (avg_sent2, "btc_count", "Bitcoin"),
]
for panel_col, (daily, count_col, axis_title) in enumerate(panels, start=1):
    hist = px.histogram(
        daily,
        x="Day",
        y=count_col,
        color="Sentiment",
        color_discrete_map=sentiment_colors,
        nbins=20,
    )
    # px builds one trace per distinct Sentiment value. Add every trace:
    # taking only .data[0] silently dropped the days belonging to the other
    # label whenever both labels were present.
    for trace in hist.data:
        fig.add_trace(trace, row=1, col=panel_col)
    fig.update_xaxes(title_text=axis_title, row=1, col=panel_col)

fig.update_layout(
    title_text='Vader Sentiment Analysis Results',
    barmode='relative',  # same stacking mode px.histogram uses by default
    bargap=0.2,
    bargroupgap=0.1,
)

fig.show()