# Imports

In [1]:
import ast
import os

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

In [2]:
import sys
ROOT = '../'  # root folder, expressed relative to the notebook's working directory
sys.path.append(ROOT)  # Add the root folder to the sys.path

# Import the modules
# NOTE(review): this star import is presumably what brings MERGED_DATASET_PATH (and possibly
# other names) into scope for the cells below — confirm against config.py.
from config import *

# Reload the configuration
# importlib.reload re-executes the already-imported module, so edits made to config
# after the kernel first imported it are picked up without restarting the kernel.
from importlib import reload
reload(sys.modules['config'])

# Import the reloaded modules
# The star import is repeated so the notebook's names are rebound to the reloaded values.
from config import *

# Load datasets

In [3]:
# Locations of the merged daily / hourly CSV files, built from ROOT and MERGED_DATASET_PATH.
# `os` is imported explicitly in the imports cell; previously this cell only worked if
# `from config import *` happened to leak the `os` module into the notebook namespace.
# NOTE(review): MERGED_DATASET_PATH is presumably provided by `config` — confirm.
MERGED_DAILY_DATASET_PATH = os.path.join(ROOT, MERGED_DATASET_PATH, 'merged_daily.csv')
MERGED_HOURLY_DATASET_PATH = os.path.join(ROOT, MERGED_DATASET_PATH, 'merged_hourly.csv')

In [5]:
# Load the datasets
# Both frames are loaded here: later cells read merged_hourly well before the
# "Hourly dataset" section re-reads it, so leaving this load commented out makes
# a Restart & Run All fail with NameError.
merged_daily = pd.read_csv(MERGED_DAILY_DATASET_PATH)
merged_hourly = pd.read_csv(MERGED_HOURLY_DATASET_PATH)

In [None]:
# Preview the daily dataset (a bare last expression is rendered as the cell output)
merged_daily

In [None]:
# merged_hourly

# Rows where both the "cointelegraph" and "reddit" columns are equal to '[]'

In [None]:
# Show the daily rows whose "cointelegraph" and "reddit" cells are both the literal string '[]'
daily_no_news_mask = merged_daily['cointelegraph'].eq('[]')
daily_no_reddit_mask = merged_daily['reddit'].eq('[]')
merged_daily.loc[daily_no_news_mask & daily_no_reddit_mask]

In [None]:
# Show the hourly rows whose "cointelegraph" and "reddit" cells are both the literal string '[]'
hourly_no_news_mask = merged_hourly['cointelegraph'].eq('[]')
hourly_no_reddit_mask = merged_hourly['reddit'].eq('[]')
merged_hourly.loc[hourly_no_news_mask & hourly_no_reddit_mask]

# Count the number of words in comments/posts/news (using merged_daily)

In [None]:
# Import nltk and fetch the tokenizer data that word_tokenize needs
import nltk

# Older NLTK releases load the 'punkt' resource, while recent releases look up
# 'punkt_tab' instead and raise LookupError if it is missing; fetch both so the
# notebook runs regardless of the installed NLTK version.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

from nltk.tokenize import word_tokenize

In [None]:
# Count words per news article and, per day, across Reddit submissions and their comments
daily_news_words = []                # one entry per Cointelegraph article
daily_reddit_submissions_words = []  # one entry per row (day) of merged_daily
daily_reddit_comments_words = []     # one entry per row (day) of merged_daily

for i, row in tqdm(merged_daily.iterrows(), total=len(merged_daily)):
    # The "cointelegraph" cell holds a stringified Python list; parse it back into a list.
    try:
        cointelegraph_news = ast.literal_eval(row['cointelegraph'])
    except (ValueError, SyntaxError, TypeError):
        # Report the unparsable cell and treat it as empty. Without the reset the loop
        # would re-count the previous row's articles (or hit a NameError on the first row).
        print(i)
        print(row['cointelegraph'])
        cointelegraph_news = []
    for news in cointelegraph_news:
        if news == '[]':
            daily_news_words.append(0)
        else:
            # Count only the number of words in the "title" ([3]), "leadtext" ([5]) and "body" ([6]) columns
            daily_news_words.append(sum(len(word_tokenize(news[field])) for field in (3, 5, 6)))

    # The "reddit" cell is parsed the same way.
    try:
        reddit_submissions = ast.literal_eval(row['reddit'])
    except (ValueError, SyntaxError, TypeError):
        print(i)
        print(row['reddit'])
        reddit_submissions = []

    submissions_word_count = 0
    comments_word_count = 0
    for submission in reddit_submissions:
        if submission == '[]':
            # Empty placeholder: contributes no words to the day's totals.
            continue
        # Count only the number of words in the "title" ([1]) and "text" ([5]) columns
        submissions_word_count += len(word_tokenize(submission[1])) + len(word_tokenize(submission[5]))
        # The submission's comments are at [12]; the literal string '[]' stands for "no comments".
        comments = submission[12] if submission[12] != '[]' else []
        for comment in comments:
            if comment != '[]':
                # Count the number of words in the comment "text" ([4]) column
                comments_word_count += len(word_tokenize(comment[4]))

    # Append each daily total exactly once per row. Appending the running comment total
    # inside the submission loop stored cumulative values repeatedly, which inflated the
    # sum, mean and max computed from daily_reddit_comments_words.
    daily_reddit_submissions_words.append(submissions_word_count)
    daily_reddit_comments_words.append(comments_word_count)

In [None]:
# Report total, mean and max word counts for the three lists built by the counting loop
word_count_lists = {
    "News Words: ": daily_news_words,
    "Reddit Submissions Words: ": daily_reddit_submissions_words,
    "Reddit Comments Words: ": daily_reddit_comments_words,
}

# Totals
for label, counts in word_count_lists.items():
    print(label, sum(counts))

# Means (total divided by the number of entries in the list)
for label, counts in word_count_lists.items():
    print("Mean " + label, sum(counts) / len(counts))

# Maxima
for label, counts in word_count_lists.items():
    print("Max " + label, max(counts))


# Check when the price has changed significantly and whether the news / posts / comments reflect this trend

In [None]:
# Report the largest and smallest pct_price_change in the daily dataset
daily_pct_change = merged_daily['pct_price_change']
print("Max Price Change in daily dataset: ", daily_pct_change.max())
# NOTE(review): this label lacks the trailing ": " that the other three labels have.
print("Min Price Change in daily dataset", daily_pct_change.min())

# Same report for the hourly dataset
hourly_pct_change = merged_hourly['pct_price_change']
print("Max Price Change in hourly dataset: ", hourly_pct_change.max())
print("Min Price Change in hourly dataset: ", hourly_pct_change.min())

In [13]:
# Keep every row whose pct_price_change equals the column's max (respectively min)
daily_change = merged_daily['pct_price_change']
max_price_change_daily = merged_daily.loc[daily_change.eq(daily_change.max())]
min_price_change_daily = merged_daily.loc[daily_change.eq(daily_change.min())]

hourly_change = merged_hourly['pct_price_change']
max_price_change_hourly = merged_hourly.loc[hourly_change.eq(hourly_change.max())]
min_price_change_hourly = merged_hourly.loc[hourly_change.eq(hourly_change.min())]

## Daily dataset

In [None]:
# Display the daily row(s) holding the maximum pct_price_change
max_price_change_daily

In [15]:
# Parse the stringified "cointelegraph" and "reddit" cells of the first max-change daily row
cointelegraph_news, reddit_submissions = [], []

cointelegraph_news = ast.literal_eval(max_price_change_daily['cointelegraph'].iloc[0])
reddit_submissions = ast.literal_eval(max_price_change_daily['reddit'].iloc[0])

In [None]:
# Inspect the news list parsed in the preceding cell (max-change daily row)
cointelegraph_news

In [None]:
# Inspect the Reddit submissions parsed for the max-change daily row
reddit_submissions

In [None]:
# Display the daily row(s) holding the minimum pct_price_change
min_price_change_daily

In [19]:
# Parse the stringified "cointelegraph" and "reddit" cells of the first min-change daily row
cointelegraph_news, reddit_submissions = [], []

cointelegraph_news = ast.literal_eval(min_price_change_daily['cointelegraph'].iloc[0])
reddit_submissions = ast.literal_eval(min_price_change_daily['reddit'].iloc[0])

In [None]:
# Inspect the news list parsed in the preceding cell (min-change daily row)
cointelegraph_news

In [None]:
# Inspect the Reddit submissions parsed for the min-change daily row
reddit_submissions

## Hourly dataset

In [None]:
# Display the hourly row(s) holding the maximum pct_price_change
max_price_change_hourly

In [None]:
# Select the max price change row together with the 5 rows before and the 5 rows after it
max_price_change_hourly_index = max_price_change_hourly.index[0]
# NOTE(review): the index label is used as a position for .iloc below; that is only valid
# for a default 0..n-1 index — presumably what pd.read_csv produced here, confirm.
# Clamp the start at 0: a negative start would make .iloc count from the end of the frame.
max_window_start = max(max_price_change_hourly_index - 5, 0)
# A slice's stop bound is exclusive, so +6 is required to include the 5th row after the match.
max_window_stop = max_price_change_hourly_index + 6
max_price_change_hourly_before_and_after = merged_hourly.iloc[max_window_start:max_window_stop]
max_price_change_hourly_before_and_after

In [24]:
# Parse the stringified "cointelegraph" and "reddit" cells of one row of the max-change window
# NOTE(review): position 6 is hard-coded. When the window starts 5 rows before the max-change
# row, position 5 is that row and position 6 is the row right after it — confirm the intent.
cointelegraph_news, reddit_submissions = [], []

cointelegraph_news = ast.literal_eval(max_price_change_hourly_before_and_after['cointelegraph'].iloc[6])
reddit_submissions = ast.literal_eval(max_price_change_hourly_before_and_after['reddit'].iloc[6])

In [None]:
# Inspect the news list parsed in the preceding cell (max-change hourly window)
cointelegraph_news

In [None]:
# Inspect the Reddit submissions parsed from the max-change hourly window
reddit_submissions

In [None]:
# Display the hourly row(s) holding the minimum pct_price_change
min_price_change_hourly

In [None]:
# Select the min price change row together with the 5 rows before and the 5 rows after it
min_price_change_hourly_index = min_price_change_hourly.index[0]
# NOTE(review): the index label is used as a position for .iloc below; that is only valid
# for a default 0..n-1 index — presumably what pd.read_csv produced here, confirm.
# Clamp the start at 0: a negative start would make .iloc count from the end of the frame.
min_window_start = max(min_price_change_hourly_index - 5, 0)
# A slice's stop bound is exclusive, so +6 is required to include the 5th row after the match.
min_window_stop = min_price_change_hourly_index + 6
min_price_change_hourly_before_and_after = merged_hourly.iloc[min_window_start:min_window_stop]
min_price_change_hourly_before_and_after

In [29]:
# Parse the stringified "cointelegraph" and "reddit" cells from the min-change window
# NOTE(review): the news comes from window position 6 but the Reddit submissions from
# position 8, i.e. two different hourly rows — confirm this mismatch is intentional.
cointelegraph_news, reddit_submissions = [], []

cointelegraph_news = ast.literal_eval(min_price_change_hourly_before_and_after['cointelegraph'].iloc[6])
reddit_submissions = ast.literal_eval(min_price_change_hourly_before_and_after['reddit'].iloc[8])

In [None]:
# Inspect the news list parsed in the preceding cell (min-change hourly window)
cointelegraph_news

In [None]:
# Inspect the Reddit submissions parsed from the min-change hourly window
reddit_submissions

# Check how the price has changed  

## Daily dataset

In [32]:
# Re-read the daily CSV from disk, rebinding merged_daily to a fresh copy.
# NOTE(review): the same file is already loaded near the top of the notebook and no
# visible cell in between mutates the frame — this reload looks redundant, confirm.
merged_daily = pd.read_csv(MERGED_DAILY_DATASET_PATH)

In [None]:
# Preview the freshly re-read daily dataset
merged_daily

In [None]:
# Tally how many daily rows carry each distinct value of the "trend" column
daily_trend = merged_daily['trend']
daily_trend.value_counts()

In [None]:
# Plot the trend distribution of the daily dataset
# "trend" is a label column, so draw one bar per distinct value from its counts. A 3-bin
# histogram over labels places bin edges between the category positions, which leaves the
# bars misaligned with their tick labels. sort_index() keeps the bar order stable.
daily_trend_counts = merged_daily['trend'].value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(daily_trend_counts.index.astype(str), daily_trend_counts.values, width=0.8)
ax.set_xlabel('Trend')
ax.set_ylabel('Frequency')
ax.set_title('Trend Distribution')
plt.show()


## Hourly dataset

In [36]:
# Read the hourly CSV from disk and bind it to merged_hourly.
# NOTE(review): earlier cells already read merged_hourly, while the load in the
# "Load datasets" cell is commented out — on a fresh kernel those cells run before
# this assignment; confirm the intended load location.
merged_hourly = pd.read_csv(MERGED_HOURLY_DATASET_PATH)

In [None]:
# Preview the hourly dataset
merged_hourly

In [None]:
# Tally how many hourly rows carry each distinct value of the "trend" column
hourly_trend = merged_hourly['trend']
hourly_trend.value_counts()

In [None]:
# Plot the trend distribution of the hourly dataset
# "trend" is a label column, so draw one bar per distinct value from its counts. A 3-bin
# histogram over labels places bin edges between the category positions, which leaves the
# bars misaligned with their tick labels. sort_index() keeps the bar order stable.
hourly_trend_counts = merged_hourly['trend'].value_counts().sort_index()

fig, ax = plt.subplots()
ax.bar(hourly_trend_counts.index.astype(str), hourly_trend_counts.values, width=0.8)
ax.set_xlabel('Trend')
ax.set_ylabel('Frequency')
ax.set_title('Trend Distribution')
plt.show()