In [1]:
import requests
import datetime
import pandas as pd

# Root of the charts API; a chart slug from ENDPOINTS is appended to it to form the request URL.
BASE_URL = 'https://api.blockchain.info/charts/'

# Metric label used throughout this notebook -> chart slug appended to BASE_URL.
ENDPOINTS = dict(
    hash_rate='hash-rate',
    active_addresses='n-unique-addresses',
    miner_revenue='miners-revenue',
)

# Function to split the span from a start date up to today into consecutive date intervals
def generate_intervals(start_date, interval_days=365, today=None):
    """Split the span from ``start_date`` to ``today`` into back-to-back intervals.

    Both ends of every interval are inclusive and each interval begins the day
    after the previous one ends, so the intervals neither overlap nor leave gaps.
    The final interval is cut short so that it ends on ``today``.

    Args:
        start_date: ``datetime.date`` of the first day to cover.
        interval_days: Number of calendar days in each interval, counting both
            endpoints. Defaults to 365, the length this notebook has always
            used. It is a fixed day count; leap years are not special-cased.
        today: Last day to cover. Defaults to ``datetime.date.today()``;
            pass an explicit date to get reproducible output.

    Returns:
        A list of ``(start, end)`` tuples of ``datetime.date`` in chronological
        order. The list is empty when ``start_date`` is after ``today``.

    Raises:
        ValueError: If ``interval_days`` is smaller than 1 (the loop below
            could never advance past ``start_date``).
    """
    if interval_days < 1:
        raise ValueError("interval_days must be at least 1")
    if today is None:
        today = datetime.date.today()

    # Endpoints are inclusive, so an N-day interval ends N - 1 days after it starts.
    span = datetime.timedelta(days=interval_days - 1)
    one_day = datetime.timedelta(days=1)

    intervals = []
    while start_date <= today:
        end_date = min(start_date + span, today)  # never run past the last requested day
        intervals.append((start_date, end_date))
        start_date = end_date + one_day  # next interval starts the following day
    return intervals

# Function to fetch data from the API for a specific timespan
def fetch_data(metric, start_date, end_date, timeout=30):
    """Fetch one chart's data points for the inclusive window [start_date, end_date].

    The call is best-effort: every failure is reported with ``print`` and turned
    into an empty list so that one bad window does not abort a long download.

    Args:
        metric: Chart slug appended to ``BASE_URL`` to build the request URL.
        start_date: First day of the window. Must provide ``isoformat()`` and
            support subtraction from ``end_date`` (e.g. ``datetime.date``).
        end_date: Last day of the window (inclusive), same type as ``start_date``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The list stored under ``"values"`` in the JSON response, or ``[]`` when
        the request fails, the status is not 200, the body is not a JSON object,
        or the object has no ``"values"`` key.
    """
    url = f"{BASE_URL}{metric}"
    # Both endpoints are inclusive, hence the + 1.
    window_days = (end_date - start_date).days + 1
    params = {
        'start': start_date.isoformat(),
        'end': end_date.isoformat(),
        # NOTE(review): the public Charts API documentation lists `start` and
        # `timespan` but, as far as I can tell, no `end` parameter, so the window
        # length is also sent explicitly here. `end` is kept for compatibility;
        # confirm against the API docs.
        'timespan': f"{window_days}days",
        'format': 'json'
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for {metric} from {start_date} to {end_date}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data for {metric} from {start_date} to {end_date}.")
        return []

    try:
        payload = response.json()
    except ValueError as exc:  # requests raises a ValueError subclass on a non-JSON body
        print(f"Failed to decode data for {metric} from {start_date} to {end_date}: {exc}")
        return []

    if not isinstance(payload, dict):
        print(f"Unexpected response shape for {metric} from {start_date} to {end_date}.")
        return []
    return payload.get('values', [])

# Download every configured metric, one date window at a time
def fetch_all_metrics(start_date):
    """Collect the data points of every metric in ``ENDPOINTS`` from ``start_date`` on.

    The date windows are computed once with ``generate_intervals`` and reused
    for each metric; a progress line is printed before each metric's downloads.

    Args:
        start_date: First day to request, passed straight to ``generate_intervals``.

    Returns:
        A dict keyed by metric name (same order as ``ENDPOINTS``) whose values
        are lists holding that metric's points from all windows, in window order.
    """
    date_windows = generate_intervals(start_date)
    collected = {}

    for name, chart_slug in ENDPOINTS.items():
        print(f"Fetching data for {name}...")
        points = []
        for window_start, window_end in date_windows:
            points.extend(fetch_data(chart_slug, window_start, window_end))
        collected[name] = points

    return collected

# Combine data into a single DataFrame for analysis
def combine_data(all_data):
    """Stack the per-metric records into one long-format DataFrame.

    Args:
        all_data: Mapping of metric name -> records accepted by ``pd.DataFrame``
            (for example a list of dicts).

    Returns:
        A DataFrame holding every metric's rows in mapping order, with an added
        ``metric`` column naming the source metric and a fresh 0..n-1 index.
        An empty DataFrame is returned when ``all_data`` has no entries.
    """
    # Build all frames first and concatenate once; concatenating inside the
    # loop would re-copy the accumulated rows on every iteration.
    frames = [
        pd.DataFrame(data).assign(metric=metric)
        for metric, data in all_data.items()
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the historical empty result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Main: download every metric from START_DATE onwards and print the combined table.
# START_DATE, all_data and combined_df are assigned at module level, so they remain
# available to later cells (In [2] displays combined_df).
if __name__ == "__main__":
    START_DATE = datetime.date(2014, 11, 16)  # First day passed to fetch_all_metrics

    # Fetch data: dict of metric name -> list of data points returned by the API
    all_data = fetch_all_metrics(START_DATE)

    # Combine into one DataFrame with a "metric" column identifying each row's source
    combined_df = combine_data(all_data)
    # NOTE(review): the export below is disabled, yet the processing cell (In [3]) reads
    # "../data/raw/blockchain_metrics.csv". On a fresh kernel that file must already exist
    # on disk; re-enable these two lines if the raw file should be refreshed by this cell.
    # combined_df.to_csv("../data/raw/blockchain_metrics.csv", index=False)
    # print("Data saved to blockchain_metrics.csv")
    print(combined_df)

Fetching data for hash_rate...
Fetching data for active_addresses...
Fetching data for miner_revenue...
                x             y         metric
0      1416096000  2.854628e+05      hash_rate
1      1416182400  2.992438e+05      hash_rate
2      1416268800  2.953063e+05      hash_rate
3      1416355200  2.744556e+05      hash_rate
4      1416441600  2.744556e+05      hash_rate
...           ...           ...            ...
11170  1736726400  3.803087e+07  miner_revenue
11171  1736812800  4.802407e+07  miner_revenue
11172  1736899200  4.336063e+07  miner_revenue
11173  1736985600  4.056123e+07  miner_revenue
11174  1737072000  4.908850e+07  miner_revenue

[11175 rows x 3 columns]


In [2]:
combined_df

Unnamed: 0,x,y,metric
0,1416096000,2.854628e+05,hash_rate
1,1416182400,2.992438e+05,hash_rate
2,1416268800,2.953063e+05,hash_rate
3,1416355200,2.744556e+05,hash_rate
4,1416441600,2.744556e+05,hash_rate
...,...,...,...
11170,1736726400,3.803087e+07,miner_revenue
11171,1736812800,4.802407e+07,miner_revenue
11172,1736899200,4.336063e+07,miner_revenue
11173,1736985600,4.056123e+07,miner_revenue


In [3]:
import pandas as pd
from datetime import datetime

# Reshape the raw long-format metrics (x = Unix seconds, y = value, metric = series name)
# into one row per calendar day with one column per metric, then save the result.
# NOTE(review): the `from datetime import datetime` just above rebinds the name `datetime`
# to the class, while generate_intervals in the first cell expects the module; re-run that
# cell's imports before calling it again in the same kernel.

# Load the raw file and turn the Unix-second timestamps into timezone-aware (UTC) datetimes
df = pd.read_csv("../data/raw/blockchain_metrics.csv").assign(
    x=lambda frame: pd.to_datetime(frame['x'], unit='s', utc=True)
)

# The earliest timestamp is day 0; every row gets its whole-day offset from it,
# plus a copy of the timestamp under the name 'date'
start_date = df['x'].min()
df = df.assign(
    days_since_start=(df['x'] - start_date).dt.days,
    date=df['x'],
)

# Every day offset from 0 to the largest one observed, used below to expose missing days
all_days = pd.DataFrame({'days_since_start': range(df['days_since_start'].max() + 1)})

# Collapse duplicate (day, metric) pairs: average the values, keep the first timestamp
df = df.groupby(['days_since_start', 'metric'], as_index=False).agg({'y': 'mean', 'date': 'first'})

# Long -> wide (one column per metric), left-joined onto the full day range so that
# days without data remain as rows; the date is rebuilt from the day offset, the
# columns are renamed for presentation and the helper day counter is dropped
pivot_df = (
    all_days
    .merge(
        df.pivot(index='days_since_start', columns='metric', values='y').reset_index(),
        on='days_since_start',
        how='left',
    )
    .assign(date=lambda frame: start_date + pd.to_timedelta(frame['days_since_start'], unit='d'))
    .rename(columns={'days_since_start': 'days',
                     'hash_rate': 'Hash Rate',
                     'active_addresses': 'Active Addresses',
                     'miner_revenue': 'Miner Revenue'})
    .drop(columns=['days'])
)

# Put "date" first, keeping the remaining columns in their existing order
columns = ['date'] + [col for col in pivot_df.columns if col != 'date']
pivot_df = pivot_df[columns]

# Save the processed dataset
pivot_df.to_csv("../data/processed/blockchain_metrics_updated.csv", index=False)

Data successfully modified and saved to data/blockchain_metrics_updated.csv
