In [1]:
import numpy as np
import pandas as pd
import json
import datetime
import spacy
from tqdm import tqdm
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.express as px
from coins import get_top_coins




I am opening a JSON file located at './Telegram_Data/result.json' and loading its contents using the JSON module. Then, I am iterating through the keys in the JSON data and printing each key to the console. This code allows me to inspect the keys in the JSON data.

In [None]:
# Read the Telegram export. The context manager closes the file handle as soon
# as the JSON has been parsed (the original left it open for the whole session).
with open('./Telegram_Data/result.json', encoding='utf8') as f:
    data = json.load(f)

# List the top-level keys to see what the export contains.
for key in data:
    print(key)

In this code cell, I am creating a Pandas DataFrame called 'df' by extracting data from the 'messages' key within the previously loaded JSON data. I limit the DataFrame to the first 10,000 rows using the .iloc method. Then, I use the info() method to display information about the DataFrame, such as the data types and non-null counts of each column. Finally, I use the head() method to display the first few rows of the DataFrame for a quick overview of its contents.

In [None]:
# Load the exported messages into a table, keeping only the first 10,000 rows.
df = pd.DataFrame(data['messages']).iloc[:10000]

# Column dtypes / non-null counts, then a preview of the first rows.
df.info()
df.head()

I'm setting up a language detection component in a spaCy language model. This component allows the model to detect the language of text. It involves defining a factory function for the language detector, loading a spaCy model for English, and then adding the language detector to the end of the spaCy pipeline. The en_core_web_sm model is a spaCy pipeline that applies several preprocessing steps (tokenization, part-of-speech tagging, dependency parsing, named entity recognition (NER), lemmatization, and stop-word identification).

In [None]:
def get_lang_detector(nlp, name):
    """
    Factory callback used when registering the "language_detector" component.

    Both arguments are accepted to satisfy the factory signature but are not
    used; a fresh LanguageDetector is returned on every call.
    """
    detector = LanguageDetector()
    return detector

# Load the small English pipeline used by the rest of the notebook.
nlp = spacy.load("en_core_web_sm")

# Register the factory only when it is not already known, so re-running this
# cell does not register it a second time.
if "language_detector" not in Language.factories:
    Language.factory("language_detector", func=get_lang_detector)

# Language detection runs as the last component of the pipeline.
nlp.add_pipe('language_detector', last=True)

I define a function called extract_features that processes text using spaCy, extracts the detected language, identifies named entities, and builds a lemmatized version of the text.
I then apply this function to each text entry in the DataFrame df using a list comprehension and the tqdm library.
The results are stored in the 'language', 'entities', and 'lemmatized_text' columns of the DataFrame.

In [None]:
def extract_features(x):
    """
    Run the module-level `nlp` pipeline on x and pull out three features.

    Returns a tuple (language, entities, lemmatized_text):
      - language: the 'language' entry of the doc's `_.language` dict,
      - entities: list with the text of every entity in `doc.ents`,
      - lemmatized_text: the tokens' lemmas joined by single spaces.
    """
    parsed = nlp(x)
    detected_language = parsed._.language['language']

    entity_texts = []
    for entity in parsed.ents:
        entity_texts.append(entity.text)

    lemmas = (tok.lemma_ for tok in parsed)
    return detected_language, entity_texts, " ".join(lemmas)

# Run the pipeline once per message (coerced to str), then split the
# (language, entities, lemmatized_text) triples into one sequence per column.
feature_rows = [extract_features(str(message)) for message in tqdm(df['text'])]
languages, entity_lists, lemma_texts = zip(*feature_rows)
df['language'] = languages
df['entities'] = entity_lists
df['lemmatized_text'] = lemma_texts

In [None]:
# Show the original text next to the three columns derived from it.
df[['text', 'language','entities','lemmatized_text']]

In [None]:
# List the distinct values stored in the 'language' column.
df.language.unique()

 I'm using a custom function, get_top_coins, to retrieve information about the top 20 coins from the Coinranking API. Then, I'm counting how many times the names and symbols of these coins appear in a text dataset. The code identifies the top 5 coins with the most text mentions and presents them in a DataFrame for further analysis. The purpose is to find the most discussed coins in the text data.

In [None]:
top_coins = get_top_coins(20)

# `df_new` is only created in a later cell, so referencing it here raises
# NameError on a fresh Restart & Run All. Build the English-language text
# directly from `df`, and lowercase it once instead of on every loop pass.
english_text = df.loc[df['language'] == 'en', 'text'].str.lower()

related_counts = {}

for coin in top_coins:
    # regex=False: names and symbols are literal strings, not patterns.
    # na=False: entries whose text is not a plain string count as "no mention".
    name_matches = english_text.str.contains(coin['name'].lower(), regex=False, na=False).sum()
    symbol_matches = english_text.str.contains(coin['symbol'].lower(), regex=False, na=False).sum()
    # NOTE: a message containing both the name and the symbol is counted twice.
    related_counts[coin['name']] = name_matches + symbol_matches

# Keep the five coins with the highest mention counts.
top_5_coins = dict(sorted(related_counts.items(), key=lambda item: item[1], reverse=True)[:5])

top_5_df = pd.DataFrame(top_5_coins.items(), columns=['Coin Name', 'Related Text Count'])

print(top_5_df)

Filter and prepare a dataset for analysis. I start by selecting English-language text entries and then narrow the data down to rows where the text mentions specific cryptocurrencies (Bitcoin, Dogecoin, or BTC, which are the top coins found by the code above) and has a minimum length. This filtered dataset is then ready for further analysis, focusing on discussions related to these cryptocurrencies in English text.

In [None]:
# Keep only the messages whose detected language is English.
is_english = df['language'] == 'en'
df_eng = df[is_english]

# Narrow the frame to the columns the sentiment analysis needs.
df_new = df_eng.filter(items=['id', 'text', 'lemmatized_text', 'date'])


In [None]:
def get_coin_sentiment(cleaned_data,from_date, to_date, coin_name,coin_symbol):
    """
    Summarise the daily VADER compound sentiment of messages mentioning one coin.

    Parameters
    ----------
    cleaned_data : DataFrame
        Must have a 'text' column and a 'date' column whose values are
        '%Y-%m-%dT%H:%M:%S' strings. It is not modified.
    from_date, to_date
        Inclusive bounds compared directly against the raw 'date' values, so
        they must be comparable with them (e.g. strings in the same format).
    coin_name, coin_symbol : str
        Literal terms searched for, case-insensitively, in 'text'.

    Returns
    -------
    DataFrame
        One row per day with columns 'Day', '<coin_name>_mean' and
        '<coin_name>_count' (mean compound score and number of messages).
    """
    # The original built the pattern with `coin_name|coin_symbol`, a bitwise-or
    # on two strings that raises TypeError. Match each term literally instead.
    lowered = cleaned_data['text'].str.lower()
    mentions_coin = (
        lowered.str.contains(coin_name.lower(), regex=False, na=False)
        | lowered.str.contains(coin_symbol.lower(), regex=False, na=False)
    )
    has_content = cleaned_data['text'].str.len() > 1
    in_range = (cleaned_data['date'] >= from_date) & (cleaned_data['date'] <= to_date)

    # Work on a copy of the filtered rows: the original scored the global
    # `df_new` and ignored both the filter and the `cleaned_data` argument.
    data = cleaned_data[mentions_coin & has_content & in_range].copy()

    sid_obj = SentimentIntensityAnalyzer()
    # Only the compound score feeds the summary returned below.
    data['vader_compound'] = [
        sid_obj.polarity_scores(x)['compound'] for x in tqdm(data['text'])
    ]
    data['Day'] = [
        datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S').date() for x in data['date']
    ]

    avg_sent = (
        data.groupby('Day')['vader_compound']
        .agg(['mean', 'count'])
        .rename(columns={'mean': f'{coin_name}_mean', 'count': f'{coin_name}_count'})
        .reset_index()
    )
    return avg_sent





In [None]:
# Keep only messages that mention one of the target coins and are longer than
# a single character.
message_text = df_new['text']
mentions_target_coin = message_text.str.lower().str.contains('bitcoin|doge|btc')
has_content = message_text.str.len() > 1
df_new = df_new[mentions_target_coin & has_content]
df_new.head()

This code defines a function, Vader_senti, that calculates sentiment scores (Negative, Neutral, Positive, and Compound) for text messages using the VADER sentiment analysis tool. Then, it applies this function to each text entry in the 'text' column of the DataFrame df_new and stores the sentiment scores in new columns ('vader_neg', 'vader_neu', 'vader_pos', 'vader_compound') in the same DataFrame. This allows for sentiment analysis of the text data.

In [None]:
# Create the VADER analyzer; Vader_senti reads it as the module-level name `sid_obj`.
sid_obj = SentimentIntensityAnalyzer()

In [1]:
def Vader_senti(x):
    """
    Score the message x with the module-level VADER analyzer `sid_obj`.

    Returns a (neg, neu, pos, compound) tuple taken from the dictionary
    produced by `sid_obj.polarity_scores(x)`.
    """
    result = sid_obj.polarity_scores(x)
    return tuple(result[key] for key in ('neg', 'neu', 'pos', 'compound'))

# Score the raw text and store the four VADER values in their own columns.
raw_scores = [Vader_senti(message) for message in tqdm(df_new['text'])]
df_new[['vader_neg','vader_neu','vader_pos','vader_compound']] = raw_scores

# Same scoring on the lemmatized text, stored under the *_lemma columns.
lemma_scores = [Vader_senti(message) for message in tqdm(df_new['lemmatized_text'])]
df_new[['vader_neg_lemma','vader_neu_lemma','vader_pos_lemma','vader_compound_lemma']] = lemma_scores

NameError: name 'tqdm' is not defined

In [None]:
# Preview the first rows of df_new.
df_new.head()

extract date from the initial dataset

In [None]:
def parse_date(x):
    """
    Convert a 'YYYY-MM-DDTHH:MM:SS' timestamp string into a datetime.date.

    The time-of-day part is parsed and then discarded. A string in any other
    format raises ValueError (from strptime).
    """
    return datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S').date()

# Derive the calendar day of every message from its timestamp string.
df_new['Day'] = list(map(parse_date, tqdm(df_new['date'])))


This code performs the following:

It filters rows in the DataFrame df_new to create two new DataFrames:

df_btc containing rows that mention 'btc' or 'bitcoin.'
df_doge containing rows that mention 'doge.'
It calculates the average sentiment score for the 'vader_compound' column in the df_doge and df_btc DataFrames, grouped by the 'Day' column. It also counts the number of entries in each group.

The code then renames the columns for clarity and prints the results in two separate tables for 'doge' and 'btc' mentions, showing the average sentiment and the count of mentions for each day.

In [None]:
def summarize_daily_sentiment(frame, score_column, prefix):
    """
    Aggregate one sentiment column per day.

    Groups `frame` by its 'Day' column and returns a DataFrame with the
    columns 'Day', '<prefix>_mean' and '<prefix>_count', computed from
    `score_column`.
    """
    return (
        frame.groupby('Day')[score_column]
        .agg(['mean', 'count'])
        .rename(columns={'mean': f'{prefix}_mean', 'count': f'{prefix}_count'})
        .reset_index()
    )


# Filter rows containing 'btc' or 'bitcoin'
df_btc = df_new[df_new['text'].str.lower().str.contains('btc|bitcoin')]

# Filter rows containing 'doge'
df_doge = df_new[df_new['text'].str.lower().str.contains('doge')]

# One helper call per coin replaces the two copy-pasted aggregation blocks.
avg_sent = summarize_daily_sentiment(df_doge, 'vader_compound', 'doge')
print(avg_sent)

print("-" * 50)
avg_sent2 = summarize_daily_sentiment(df_btc, 'vader_compound', 'btc')
print(avg_sent2)

Do the exact same thing for lemmatized text

In [None]:
# Rows whose lemmatized text mentions 'btc' or 'bitcoin'
lemma_lower = df_new['lemmatized_text'].str.lower()
df_btc = df_new[lemma_lower.str.contains('btc|bitcoin')]

# Rows whose lemmatized text mentions 'doge'
df_doge = df_new[lemma_lower.str.contains('doge')]

# Daily mean and message count of the lemma-based compound score for doge.
avg_sent_lemma = (
    df_doge.groupby('Day')['vader_compound_lemma']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'doge_mean', 'count': 'doge_count'})
    .reset_index()
)
print(avg_sent_lemma)

print("-" * 50)

# Same summary for the bitcoin rows.
avg_sent2_lemma = (
    df_btc.groupby('Day')['vader_compound_lemma']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'btc_mean', 'count': 'btc_count'})
    .reset_index()
)
print(avg_sent2_lemma)

I'm adding a 'Sentiment' column to the DataFrames avg_sent and avg_sent2. The 'Sentiment' column classifies each day's sentiment as "Negative" if the average sentiment score is less than 0 and as "Positive" if it's greater than or equal to 0. This categorizes sentiment results into two simple categories: "Negative" and "Positive."

In [None]:
# Label each day "Negative" when its mean compound score is below zero and
# "Positive" otherwise, for all four daily summaries.
for daily_summary, mean_column in (
    (avg_sent, 'doge_mean'),
    (avg_sent2, 'btc_mean'),
    (avg_sent_lemma, 'doge_mean'),
    (avg_sent2_lemma, 'btc_mean'),
):
    labels = []
    for score in daily_summary[mean_column]:
        if score < 0:
            labels.append("Negative")
        else:
            labels.append("Positive")
    daily_summary['Sentiment'] = labels

an output for showing the message count related to each coin in each day

In [None]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2)

# px.histogram with color="Sentiment" produces one trace per sentiment label.
# The original added only `.data[0]`, which silently dropped every day that
# carried the other label; add all traces to each subplot instead.
doge_hist = px.histogram(avg_sent, x="Day", y="doge_count", color="Sentiment", nbins=20)
for trace in doge_hist.data:
    fig.add_trace(trace, row=1, col=1)

btc_hist = px.histogram(avg_sent2, x="Day", y="btc_count", color="Sentiment", nbins=20)
for trace in btc_hist.data:
    fig.add_trace(trace, row=1, col=2)

fig.update_layout(
    title_text='Vader Sentiment Analysis Results',
    bargap=0.2,
    bargroupgap=0.1
)

fig.update_xaxes(title_text="Doge Coin", row=1, col=1)
fig.update_xaxes(title_text="Bitcoin", row=1, col=2)

fig.show()

for lemmatized text

In [None]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2)

# px.histogram with color="Sentiment" produces one trace per sentiment label.
# The original added only `.data[0]`, which silently dropped every day that
# carried the other label; add all traces to each subplot instead.
doge_lemma_hist = px.histogram(avg_sent_lemma, x="Day", y="doge_count", color="Sentiment", nbins=20)
for trace in doge_lemma_hist.data:
    fig.add_trace(trace, row=1, col=1)

btc_lemma_hist = px.histogram(avg_sent2_lemma, x="Day", y="btc_count", color="Sentiment", nbins=20)
for trace in btc_lemma_hist.data:
    fig.add_trace(trace, row=1, col=2)

fig.update_layout(
    title_text='Vader Sentiment Analysis Results',
    bargap=0.2,
    bargroupgap=0.1
)

fig.update_xaxes(title_text="Doge Coin", row=1, col=1)
fig.update_xaxes(title_text="Bitcoin", row=1, col=2)

fig.show()

Overall, the lemmatized text behaves better and gives more logical results in the VADER sentiment analysis; however, the charts show roughly similar figures.