# Imports

In [5]:
import pandas as pd
from tqdm import tqdm
import json
from datetime import datetime as dt
import ast

In [6]:
import sys
ROOT = '../'
sys.path.append(ROOT)  # Add the root folder to the sys.path

# Import the modules
from config import *

# Reload the configuration
from importlib import reload
reload(sys.modules['config'])

# Import the reloaded modules
from config import *

# Retrieve Bitcoin news

In [None]:
import subprocess
import json

# Define the parameters
start = 0 # Default: 0
MAX_LENGHT = 100 # Max: 100
offset = str(start) # From the last news published
per_page = str(MAX_LENGHT)

data = []
while True:
    try:
        print(f"Fetching data from {start} to {start + MAX_LENGHT}")
        # Define the curl command with parameters
        curl_command = [
            "curl",
            f"https://api.news.bitcoin.com/wp-json/bcn/v1/posts?offset={offset}&per_page={per_page}",
            "-H", "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
            "-H", "Accept: application/json, text/plain, */*",
            "-H", "Accept-Language: en-US,en;q=0.5",
            "-H", "Referer: https://news.bitcoin.com/",
            "-H", "Origin: https://news.bitcoin.com",
            "-H", "Connection: keep-alive",
            "-H", "Sec-Fetch-Dest: empty",
            "-H", "Sec-Fetch-Mode: cors",
            "-H", "Sec-Fetch-Site: same-site",
            "-H", "TE: trailers"
        ]

        # Execute the curl command
        result = subprocess.run(curl_command, capture_output=True, text=True).stdout

        # Turn result.stdout into json
        data.append(json.loads(result)['posts'])

        print(f"Data fetched from {start} to {start + MAX_LENGHT}")

        # Save the last date
        last_date = data[-1][-1]['date']
        
        # Convert into YYYY-MM-DD format
        last_date = dt.strptime(last_date, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d")

        print(f"last post published at {last_date}")

        # Check if the last date is before the start date
        if last_date < START_DATE:
            print(f"Reached the start date {START_DATE}")
            break

        # Increase the offset
        start += MAX_LENGHT
        offset = str(start)
    except Exception as e:
        # Reached the end of the data
        print(e)
        break

In [None]:
# # Print an example of the data
# print(data[0][0])

# # Show the keys of the data
# print(data[0][0].keys())

# # Show the length of the data
# print(len(data))

# # Show the length of the first data
# print(len(data[0]))

In [None]:
import pandas as pd

# List to hold dictionaries
news_list = []

# Loop through news items and create dictionaries
for i in range(len(data)):
    for d in data[i]:
        news_dict = {
            "id": d["id"],
            "date": d["date"],
            "title": d["title"],
            "slug": d["slug"],
            "author": d["author"]["name"]
        }
        news_list.append(news_dict)

# Create DataFrame from list of dictionaries
df = pd.DataFrame(news_list)

# Display the first 5 rows of the DataFrame
df

In [None]:
# Save the DataFrame to a CSV file
output_file = os.path.join(ROOT, NEWS_DATASET_PATH, 'bitcoin_news_raw.csv')
df.to_csv(output_file, index=False)

# [ADDON] Get "body" field of Bitcoin news

In [None]:
# Load the DataFrame from the CSV file
input_file = os.path.join(ROOT, NEWS_DATASET_PATH, 'bitcoin_news_raw.csv')
df = pd.read_csv(input_file)

In [None]:
# Add new column "body" to bitcoin_news_with_body_with_sentiment between "leadtext" and "sentiment"
bitcoin_news_with_body = df.copy()
bitcoin_news_with_body.insert(5, "body", 'None')
bitcoin_news_with_body

In [None]:
import re
from html import unescape

def clean_html_content(html_content):
    # Remove script tags and their contents
    html_content = re.sub(r'<script.*?</script>', '', html_content, flags=re.DOTALL)
    
    # Remove HTML tags
    html_content = re.sub(r'<[^>]+>', '', html_content)
    
    # Unescape HTML entities
    html_content = unescape(html_content)
    
    # Remove extra whitespace and blank lines
    lines = [line.strip() for line in html_content.split('\n') if line.strip()]
    cleaned_content = '\n'.join(lines)
    
    return cleaned_content

In [None]:
import subprocess
import json

for index, row in tqdm(bitcoin_news_with_null_body.iterrows(), total=len(bitcoin_news_with_null_body)):
    # Get the slug
    slug = row['slug']
    # Open the URL
    url = f'https://news.bitcoin.com/{slug}'

    try:
        # Define the curl command with parameters
        curl_command = [
            "curl",
            f"https://api.news.bitcoin.com/wp-json/bcn/v1/post?slug={slug}",
            "--compressed",
            "-H", "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
            "-H", "Accept: application/json, text/plain, */*",
            "-H", "Accept-Language: en-US,en;q=0.5",
            "-H", "Accept-Encoding: gzip, deflate",
            "-H", "Referer: https://news.bitcoin.com/",
            "-H", "Origin: https://news.bitcoin.com",
            "-H", "Connection: keep-alive",
            "-H", "Sec-Fetch-Dest: empty",
            "-H", "Sec-Fetch-Mode: cors",
            "-H", "Sec-Fetch-Site: same-site",
            "-H", "TE: trailers"
        ]

        # Execute the curl command
        result = subprocess.run(curl_command, capture_output=True, text=True).stdout

        # Extract the body from the result
        body = json.loads(result)['content']

        # Clean the body
        cleaned_body = clean_html_content(body)

        # Save the body into the dataset
        bitcoin_news_with_body.at[index, 'body'] = body
    except Exception as e:
        print(e)
        # Set the body to None
        bitcoin_news_with_body.at[index, 'body'] = 'None'

In [None]:
bitcoin_news_with_body

In [12]:
# Show nan values
bitcoin_news_with_body.isnull().sum()

id        0
date      0
title     0
slug      0
author    0
body      0
dtype: int64

In [10]:
# Remove nan values
bitcoin_news_with_body = bitcoin_news_with_body.dropna()

In [11]:
# Save the dataset to a CSV file
output_file = os.path.join(ROOT, NEWS_DATASET_PATH, 'bitcoin_news_with_body.csv')
bitcoin_news_with_body.to_csv(output_file, index=False)

# [ADDON] Clear Bitcoin News body

In [13]:
# Load the dataset from the CSV file
input_file = os.path.join(ROOT, NEWS_DATASET_PATH, 'bitcoin_news_with_body.csv')
bitcoin_news_with_body = pd.read_csv(input_file)
bitcoin_news_with_body

Unnamed: 0,id,date,title,slug,author,body
0,662463,2024-07-26 09:19:01,Coinbase and Glassnode Report Highlights a Sta...,coinbase-and-glassnode-report-highlights-a-sta...,Jamie Redman,A recent joint report by Coinbase Institutiona...
1,662457,2024-07-26 09:00:42,Disney Officially Enters Metaverse Market Thro...,disney-officially-enters-metaverse-market-thro...,Media,PRESS RELEASE. In a monumental development for...
2,662382,2024-07-26 08:35:57,US Department of State Highlights Bitcoin's Si...,us-department-of-state-highlights-bitcoins-sig...,Sergio Goschenko,The U.S. Department of State has highlighted t...
3,661958,2024-07-26 07:20:53,Lightning Labs Rolls out Taproot Assets Seekin...,lightning-labs-rolls-out-taproot-assets-seekin...,Sergio Goschenko,"Lightning Labs, an institution dedicated to re..."
4,662429,2024-07-26 04:00:06,Swapuz Starts Using Its Own Liquidity for Swap...,swapuz-starts-using-its-own-liquidity-for-swap...,Media,"PRESS RELEASE. Swapuz, a leading platform for ..."
...,...,...,...,...,...,...
34165,252,2015-03-01 08:00:00,"Bitcoin News Roundup - March 1st, 2015",bitcoin-news-roundup-march-1st-2015,jake,Bitcoin News Roundup is a weekly digest in whi...
34166,253,2015-02-22 08:00:00,"Bitcoin News Roundup - February 22nd, 2015",bitcoin-news-roundup-february-22nd-2015,jake,Bitcoin News Roundup is a weekly digest email ...
34167,254,2015-02-01 08:00:00,"Bitcoin News Roundup - February 1st, 2015",bitcoin-news-roundup-february-1st-2015,jake,Bitcoin News Roundup is a weekly digest in whi...
34168,255,2015-01-25 08:00:00,"Bitcoin News Roundup - January 25th, 2015",bitcoin-news-roundup-january-25th-2015,jake,Bitcoin News Roundup is a weekly digest in whi...


In [14]:
# Convert the body column to string
bitcoin_news_with_body['body'] = bitcoin_news_with_body['body'].astype(str)

In [15]:
bitcoin_news_with_body.loc[2000]['body']

'PRESS RELEASE. Dover, DE, USA, May8, 2024 — ChainGPT, the AI-powered Web3 infrastructure providing a diverse suite of tools and services, will exclusively launch the IDO of its latest launchpad project, Engines of Fury: a free-to-play top-down extraction shooter designed for Web3. Developed by talents from AAA titles, Blizzard, Activision, Ubisoft, and Unity, its top-down characteristic makes it a first within Web3&#8217;s free-to-play shooter arena. Set for May 8, Engines of Fury’s IDO leverages ChainGPT Pad’s acceleration program enabling the game to perfect its product systems, action plans, and roadmaps while receiving expert marketing and promotional support. As a premier decentralized fundraising and incubation platform for Web3 projects of all types, the ChainGPT Pad has been recognized as the most popular launchpad of 2023. The incubation program promotes emerging startups strategically hand-picked by ChainGPT based on their disruptive potential, transforming their ideas into 

In [16]:
import re

def clean_html(html_text):
    # Remove HTML tags
    clean_text = re.sub('<[^<]+?>', '', html_text)
    
    # Remove extra whitespace
    clean_text = re.sub('\s+', ' ', clean_text).strip()
    
    return clean_text

  clean_text = re.sub('\s+', ' ', clean_text).strip()


In [17]:
for index, row in tqdm(bitcoin_news_with_body.iterrows(), total=len(bitcoin_news_with_body)):
    # Clean the body
    body = row['body']
    cleaned_body = clean_html(body)

    # Save the cleaned body
    bitcoin_news_with_body.at[index, 'body'] = cleaned_body

100%|██████████| 34170/34170 [00:05<00:00, 6361.49it/s]


In [18]:
bitcoin_news_with_body.loc[2000]['body']

'PRESS RELEASE. Dover, DE, USA, May8, 2024 — ChainGPT, the AI-powered Web3 infrastructure providing a diverse suite of tools and services, will exclusively launch the IDO of its latest launchpad project, Engines of Fury: a free-to-play top-down extraction shooter designed for Web3. Developed by talents from AAA titles, Blizzard, Activision, Ubisoft, and Unity, its top-down characteristic makes it a first within Web3&#8217;s free-to-play shooter arena. Set for May 8, Engines of Fury’s IDO leverages ChainGPT Pad’s acceleration program enabling the game to perfect its product systems, action plans, and roadmaps while receiving expert marketing and promotional support. As a premier decentralized fundraising and incubation platform for Web3 projects of all types, the ChainGPT Pad has been recognized as the most popular launchpad of 2023. The incubation program promotes emerging startups strategically hand-picked by ChainGPT based on their disruptive potential, transforming their ideas into 

In [19]:
# Save the dataset to a CSV file
output_file = os.path.join(ROOT, NEWS_DATASET_PATH, 'bitcoin_news_with_body.csv')
bitcoin_news_with_body.to_csv(output_file, index=False)

# Generate daily and hourly Bitcoin News dataset

In [20]:
# Open bitcoin news dataset
bitcoin_news_with_body = pd.read_csv(os.path.join(ROOT, NEWS_DATASET_PATH, "bitcoin_news_with_body.csv"))
bitcoin_news_with_body

Unnamed: 0,id,date,title,slug,author,body
0,662463,2024-07-26 09:19:01,Coinbase and Glassnode Report Highlights a Sta...,coinbase-and-glassnode-report-highlights-a-sta...,Jamie Redman,A recent joint report by Coinbase Institutiona...
1,662457,2024-07-26 09:00:42,Disney Officially Enters Metaverse Market Thro...,disney-officially-enters-metaverse-market-thro...,Media,PRESS RELEASE. In a monumental development for...
2,662382,2024-07-26 08:35:57,US Department of State Highlights Bitcoin's Si...,us-department-of-state-highlights-bitcoins-sig...,Sergio Goschenko,The U.S. Department of State has highlighted t...
3,661958,2024-07-26 07:20:53,Lightning Labs Rolls out Taproot Assets Seekin...,lightning-labs-rolls-out-taproot-assets-seekin...,Sergio Goschenko,"Lightning Labs, an institution dedicated to re..."
4,662429,2024-07-26 04:00:06,Swapuz Starts Using Its Own Liquidity for Swap...,swapuz-starts-using-its-own-liquidity-for-swap...,Media,"PRESS RELEASE. Swapuz, a leading platform for ..."
...,...,...,...,...,...,...
34165,252,2015-03-01 08:00:00,"Bitcoin News Roundup - March 1st, 2015",bitcoin-news-roundup-march-1st-2015,jake,Bitcoin News Roundup is a weekly digest in whi...
34166,253,2015-02-22 08:00:00,"Bitcoin News Roundup - February 22nd, 2015",bitcoin-news-roundup-february-22nd-2015,jake,Bitcoin News Roundup is a weekly digest email ...
34167,254,2015-02-01 08:00:00,"Bitcoin News Roundup - February 1st, 2015",bitcoin-news-roundup-february-1st-2015,jake,Bitcoin News Roundup is a weekly digest in whi...
34168,255,2015-01-25 08:00:00,"Bitcoin News Roundup - January 25th, 2015",bitcoin-news-roundup-january-25th-2015,jake,Bitcoin News Roundup is a weekly digest in whi...


In [21]:
# Show the row that have as 'timestamp' equals to 2016-05-01 22:55:15
bitcoin_news_with_body[bitcoin_news_with_body['date'] == '2016-05-01 22:55:15']

Unnamed: 0,id,date,title,slug,author,body
33143,15032,2016-05-01 22:55:15,The Bitcoin.com Podcast: Voorhees on the Shape...,podcast-voorhees-shapeshift-hack,Bitcoin.com,On the latest episode of the Bitcoin.com Podca...


In [22]:
# Generate two temp dataset (daily and hourly) with just the timestamp column that start from START_DATE and end at END_DATE
# Daily dataset has the format: 2024-03-16
# Hourly dataset has the format: 2024-03-16 20:00:00

start_date = bitcoin_news_with_body['date'].min() # Default: START_DATE
end_date = bitcoin_news_with_body['date'].max() # Default: END_DATE

bitcoin_news_daily = pd.date_range(start=start_date, end=end_date, freq='D').to_frame(index=False, name='timestamp')
bitcoin_news_daily['timestamp'] = bitcoin_news_daily['timestamp'].dt.date

# start_date = bitcoin_news_with_body['date'].min()+" 00:00:00" # Default: START_DATE

# # For the hourly dataset, generate 2 coluns: timestamp_begin and timestamp_end where timestamp_end is timestamp_begin + 1 hour
# bitcoin_news_hourly = pd.date_range(start=start_date, end=END_DATE, freq='h').to_frame(index=False, name='timestamp')
# bitcoin_news_hourly['timestamp_begin'] = bitcoin_news_hourly['timestamp']
# bitcoin_news_hourly['timestamp_end'] = bitcoin_news_hourly['timestamp'] + pd.Timedelta(hours=1)
# bitcoin_news_hourly.drop(columns=['timestamp'], inplace=True)

# Turn the timestamp column into string
bitcoin_news_daily['timestamp'] = bitcoin_news_daily['timestamp'].astype(str)
# bitcoin_news_hourly['timestamp_begin'] = bitcoin_news_hourly['timestamp_begin'].astype(str)
# bitcoin_news_hourly['timestamp_end'] = bitcoin_news_hourly['timestamp_end'].astype(str)

# Add 'bitcoin_news' column
bitcoin_news_daily['bitcoin_news'] = None
# bitcoin_news_hourly['bitcoin_news'] = None

In [23]:
bitcoin_news_daily

Unnamed: 0,timestamp,bitcoin_news
0,2015-01-18,
1,2015-01-19,
2,2015-01-20,
3,2015-01-21,
4,2015-01-22,
...,...,...
3473,2024-07-22,
3474,2024-07-23,
3475,2024-07-24,
3476,2024-07-25,


In [24]:
# bitcoin_news_hourly

In [25]:
# Generate bitcoin_news_daily
bitcoin_news_daily_copy = bitcoin_news_daily.copy()

# Iterate over the daily dataset
for index, row in tqdm(bitcoin_news_daily_copy.iterrows(), total=len(bitcoin_news_daily_copy)):
    # Get the timestamp
    curr_timestamp = row['timestamp']
    # Select the bitcoin_news rows items that have been published during the curr_timestamp, save them as a list
    # Example: if curr_timestamp is 2018-01-01, then select the news items that have been published from 2018-01-01 00:00:00 to 2018-01-01 23:59:59, news = [news1, news2, news3]
    filtered_news = bitcoin_news_with_body[(bitcoin_news_with_body['date'].str.contains(curr_timestamp))]

    if len(filtered_news) == 0:
        bitcoin_news_daily_copy.at[index, 'bitcoin_news'] = str([])
    else:
        # Convert news dataframe to a list
        filtered_news = str(filtered_news.values.tolist())
        # Append the news list to the news column
        bitcoin_news_daily_copy.at[index, 'bitcoin_news'] = filtered_news
bitcoin_news_daily_copy

100%|██████████| 3478/3478 [00:26<00:00, 133.28it/s]


Unnamed: 0,timestamp,bitcoin_news
0,2015-01-18,"[[256, '2015-01-18 08:00:00', 'Bitcoin News Ro..."
1,2015-01-19,[]
2,2015-01-20,[]
3,2015-01-21,[]
4,2015-01-22,[]
...,...,...
3473,2024-07-22,"[[661576, '2024-07-22 23:30:28', 'India Favors..."
3474,2024-07-23,"[[661735, '2024-07-23 23:37:46', 'Sam Altman S..."
3475,2024-07-24,"[[661759, '2024-07-24 23:35:36', 'Hong Kong La..."
3476,2024-07-25,"[[662316, '2024-07-25 23:30:11', 'Russia-Linke..."


In [26]:
# Count the number of NOT empty rows (different from []) in the bitcoin_news column
not_empty_rows = bitcoin_news_daily_copy[bitcoin_news_daily_copy['bitcoin_news'] != '[]']
print("Total number of NOT '[]' occurrences in the bitcoin_news column:", not_empty_rows.shape[0])
not_empty_rows

Total number of NOT '[]' occurrences in the bitcoin_news column: 3328


Unnamed: 0,timestamp,bitcoin_news
0,2015-01-18,"[[256, '2015-01-18 08:00:00', 'Bitcoin News Ro..."
7,2015-01-25,"[[255, '2015-01-25 08:00:00', 'Bitcoin News Ro..."
14,2015-02-01,"[[254, '2015-02-01 08:00:00', 'Bitcoin News Ro..."
35,2015-02-22,"[[253, '2015-02-22 08:00:00', 'Bitcoin News Ro..."
42,2015-03-01,"[[252, '2015-03-01 08:00:00', 'Bitcoin News Ro..."
...,...,...
3473,2024-07-22,"[[661576, '2024-07-22 23:30:28', 'India Favors..."
3474,2024-07-23,"[[661735, '2024-07-23 23:37:46', 'Sam Altman S..."
3475,2024-07-24,"[[661759, '2024-07-24 23:35:36', 'Hong Kong La..."
3476,2024-07-25,"[[662316, '2024-07-25 23:30:11', 'Russia-Linke..."


In [27]:
# Show an example
row = not_empty_rows['bitcoin_news'][0]
print(f"Row has {len(row)} news")
print(f"List of news: {row}")
row

Row has 9228 news
List of news: [[256, '2015-01-18 08:00:00', 'Bitcoin News Roundup - January 18th, 2015', 'bitcoin-news-roundup-january-18th-2015', 'jake', 'Bitcoin News Roundup is a weekly digest email in which Jake recap the week&#x2019;s bitcoin news, interesting stories, and articles. ROSS ULBRICHT TRIAL This week saw the beginning of alleged Silk Road operator Ross Ulbricht&#x2019;s trial in a federal courthouse in Manhattan. Ulbricht&#x2019;s counsel began the trial on an interesting note, arguing that Ulbricht indeed founded the site, but that he was not the Dread Pirate Roberts, the mysterious operator of the online free marketplace. In a cross-examination of DHS agent Jared Der-Yeghiayan on the third day of the trial, Ulbricht&#x2019;s defense team coaxed a serious shocker from the agent who had infiltrated the Silk Road as an employee: &#x201C;You believed him to be the mastermind behind Silk Road, keeping it secure and operating?&#x201D; Ulbricht defense attorney Dratel ask

"[[256, '2015-01-18 08:00:00', 'Bitcoin News Roundup - January 18th, 2015', 'bitcoin-news-roundup-january-18th-2015', 'jake', 'Bitcoin News Roundup is a weekly digest email in which Jake recap the week&#x2019;s bitcoin news, interesting stories, and articles. ROSS ULBRICHT TRIAL This week saw the beginning of alleged Silk Road operator Ross Ulbricht&#x2019;s trial in a federal courthouse in Manhattan. Ulbricht&#x2019;s counsel began the trial on an interesting note, arguing that Ulbricht indeed founded the site, but that he was not the Dread Pirate Roberts, the mysterious operator of the online free marketplace. In a cross-examination of DHS agent Jared Der-Yeghiayan on the third day of the trial, Ulbricht&#x2019;s defense team coaxed a serious shocker from the agent who had infiltrated the Silk Road as an employee: &#x201C;You believed him to be the mastermind behind Silk Road, keeping it secure and operating?&#x201D; Ulbricht defense attorney Dratel asked Der-Yeghiayan. &#x201C;I did

In [28]:
# Count the number of empty rows (equal to '[]') in the bitcoin_news column
empty_rows = bitcoin_news_daily_copy[bitcoin_news_daily_copy['bitcoin_news'] == '[]']
print("Total number of '[]' occurrences in the bitcoin_news column:", empty_rows.shape[0])
empty_rows

Total number of '[]' occurrences in the bitcoin_news column: 150


Unnamed: 0,timestamp,bitcoin_news
1,2015-01-19,[]
2,2015-01-20,[]
3,2015-01-21,[]
4,2015-01-22,[]
5,2015-01-23,[]
...,...,...
193,2015-07-30,[]
194,2015-07-31,[]
196,2015-08-02,[]
805,2017-04-02,[]


In [29]:
# Set the timestamp as the index
bitcoin_news_daily_copy = bitcoin_news_daily_copy.set_index('timestamp', drop=False)
bitcoin_news_daily = bitcoin_news_daily_copy
bitcoin_news_daily

Unnamed: 0_level_0,timestamp,bitcoin_news
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-18,2015-01-18,"[[256, '2015-01-18 08:00:00', 'Bitcoin News Ro..."
2015-01-19,2015-01-19,[]
2015-01-20,2015-01-20,[]
2015-01-21,2015-01-21,[]
2015-01-22,2015-01-22,[]
...,...,...
2024-07-22,2024-07-22,"[[661576, '2024-07-22 23:30:28', 'India Favors..."
2024-07-23,2024-07-23,"[[661735, '2024-07-23 23:37:46', 'Sam Altman S..."
2024-07-24,2024-07-24,"[[661759, '2024-07-24 23:35:36', 'Hong Kong La..."
2024-07-25,2024-07-25,"[[662316, '2024-07-25 23:30:11', 'Russia-Linke..."


In [30]:
# # Generate bitcoin_news
# bitcoin_news_hourly_copy = bitcoin_news_hourly.copy()

# # Iterate over the hourly dataset
# for index, row in tqdm(bitcoin_news_hourly_copy.iterrows(), total=len(bitcoin_news_hourly_copy)):
#     # Get the timestamp
#     timestamp_begin = row['timestamp_begin']
#     timestamp_end = row['timestamp_end']
#     # Select the bitcoin_news rows items that have been published between timestamp_begin and timestamp_end, save them as a list
#     # Example: if timestamp_begin is 2018-01-01 00:00:00 and timestamp_end is 2018-01-01 01:00:00
#     # select the news items that have been published from 2018-01-01 00:00:00 to 2018-01-01 00:59:59
#     # news = [news1, news2, news3]
#     filtered_news = bitcoin_news_with_body[
#         (bitcoin_news_with_body['date'] >= timestamp_begin) & 
#         (bitcoin_news_with_body['date'] < timestamp_end)
#     ]

#     if len(filtered_news) == 0:
#         bitcoin_news_hourly_copy.at[index, 'bitcoin_news'] = str([])
#     else:
#         # Convert news dataframe to a list
#         filtered_news = str(filtered_news.values.tolist())
#         # Append the news list to the news column
#         bitcoin_news_hourly_copy.at[index, 'bitcoin_news'] = filtered_news
# bitcoin_news_hourly_copy

In [31]:
# # Count the number of NOT empty rows (different from []) in the bitcoin_news column
# not_empty_rows = bitcoin_news_hourly_copy[bitcoin_news_hourly_copy['bitcoin_news'] != '[]']
# print("Total number of NOT '[]' occurrences in the bitcoin_news column:", not_empty_rows.shape[0])
# not_empty_rows

In [32]:
# # Show an example
# row = not_empty_rows['bitcoin_news'][83457]
# print(f"Row has {len(row)} news")
# print(f"List of news: {row}")
# row

In [33]:
# # Count the number of empty rows (equal to '[]') in the bitcoin_news column
# empty_rows = bitcoin_news_hourly_copy[bitcoin_news_hourly_copy['bitcoin_news'] == '[]']
# print("Total number of '[]' occurrences in the bitcoin_news column:", empty_rows.shape[0])
# empty_rows

In [34]:
# # Set the timestamp as the index
# bitcoin_news_hourly_copy = bitcoin_news_hourly_copy.set_index('timestamp_begin', drop=False)
# bitcoin_news_hourly = bitcoin_news_hourly_copy
# bitcoin_news_hourly

In [35]:
# Save the datasets
bitcoin_news_daily.to_csv(os.path.join(ROOT, NEWS_DATASET_PATH, "bitcoin_news_daily_grouped.csv"), index=False)

# Save the datasets
# bitcoin_news_hourly.to_csv(os.path.join(ROOT, NEWS_DATASET_PATH, "bitcoin_news_hourly_grouped.csv"), index=False)