Import libraries and set up API key

In [60]:
import json
import requests
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tools.eval_measures import aic, bic

# Insert your API key here
API_KEY = '2fvIsbHL7chqQIfpyhaUzMwjy47'
num_data_points = 100
start_date_str = '2023-01-01'

Get user input

In [61]:
# Insert your API key here
#API_KEY = str(input("Insert your API key here"))
#Ask user for input
#num_data_points = int(input("How many data points do you want to load? "))
#start_date = input("Enter the starting date (YYYY-MM-DD): ")

# Convert start date to Unix timestamp
start_datetime = datetime.strptime(start_date_str, "%Y-%m-%d")
start_timestamp = int(start_datetime.timestamp())

# Calculate end date and convert to Unix timestamp
end_datetime = start_datetime + timedelta(days=num_data_points)
end_timestamp = int(end_datetime.timestamp())

print(f"Start timestamp: {start_timestamp}")
print(f"End timestamp: {end_timestamp}")

Start timestamp: 1672543800
End timestamp: 1681180200


Define the function to fetch data

In [62]:
def fetch_glassnode_data(endpoint, asset='BTC'):
    url = f'https://api.glassnode.com/v1/metrics/{endpoint}'
    params = {
        'a': asset,
        'api_key': API_KEY,
        's': start_timestamp,
        'u': end_timestamp,
        'i': '24h'  # daily interval
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        df = pd.read_json(response.text, convert_dates=['t'])
        column_name = endpoint.split('/')[-1]
        df = df.rename(columns={'t': 'timestamp', 'v': column_name})
        return df.set_index('timestamp')
    else:
        print(f"Error fetching data from {endpoint}: {response.status_code}")
        return pd.DataFrame()

Fetch price data

In [63]:
# Fetch price data
price_df = fetch_glassnode_data('market/price_usd_close')
price_df.head()

  df = pd.read_json(response.text, convert_dates=['t'])


Unnamed: 0_level_0,price_usd_close
timestamp,Unnamed: 1_level_1
2023-01-01,16620.819927
2023-01-02,16693.930009
2023-01-03,16682.715885
2023-01-04,16865.159755
2023-01-05,16841.821596


Define endpoints and fetch feature data

In [64]:
# List of endpoint URLs (add more as needed)
endpoints = [
    'indicators/sopr',
    'indicators/sopr_less_155',
    'indicators/sopr_account_based',
    'indicators/sopr_adjusted'
]

# Fetch and combine feature data
features_df = pd.DataFrame()
for endpoint in endpoints:
    df = fetch_glassnode_data(endpoint)
    if features_df.empty:
        features_df = df
    else:
        features_df = features_df.join(df, how='outer')

features_df.head()

  df = pd.read_json(response.text, convert_dates=['t'])
  df = pd.read_json(response.text, convert_dates=['t'])
  df = pd.read_json(response.text, convert_dates=['t'])
  df = pd.read_json(response.text, convert_dates=['t'])


Unnamed: 0_level_0,sopr,sopr_less_155,sopr_account_based,sopr_adjusted
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-01-01,0.993143,0.9953,0.906646,0.982933
2023-01-02,0.997344,1.001048,0.913555,0.993102
2023-01-03,0.987792,0.995558,0.916211,0.971039
2023-01-04,0.985428,0.999432,0.892713,0.968075
2023-01-05,0.99095,0.997645,0.963089,0.978652


Combine data and handle missing values

In [65]:
# Concatenate price and features dataframes
combined_df = pd.concat([price_df, features_df], axis=1)

# Handle missing values
combined_df = combined_df.dropna()

# Display the result
combined_df.head()

Unnamed: 0_level_0,price_usd_close,sopr,sopr_less_155,sopr_account_based,sopr_adjusted
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,16620.819927,0.993143,0.9953,0.906646,0.982933
2023-01-02,16693.930009,0.997344,1.001048,0.913555,0.993102
2023-01-03,16682.715885,0.987792,0.995558,0.916211,0.971039
2023-01-04,16865.159755,0.985428,0.999432,0.892713,0.968075
2023-01-05,16841.821596,0.99095,0.997645,0.963089,0.978652


 Additional analysis or visualization (optional)

In [66]:
# You can add more cells here for further analysis or visualization
combined_df.describe()

Unnamed: 0,price_usd_close,sopr,sopr_less_155,sopr_account_based,sopr_adjusted
count,100.0,100.0,100.0,100.0,100.0
mean,23415.304798,1.000595,1.016027,1.009464,1.002454
std,3353.406517,0.016398,0.01812,0.052797,0.035659
min,16620.819927,0.880878,0.961018,0.883769,0.766452
25%,21804.378982,0.996025,1.003279,0.970866,0.990014
50%,23164.313442,1.00337,1.016252,1.018884,1.009144
75%,25526.203345,1.009071,1.027639,1.041921,1.019797
max,29628.544376,1.029593,1.076041,1.109142,1.070677


Define & Run the Stationary Check

In [67]:
#define the stationary check function
def make_stationary(series):
    def adf_test(timeseries):
        result = adfuller(timeseries, autolag='AIC')
        return result[1] <= 0.05
    diff_order = 0
    while not adf_test(series) and diff_order < 2:
        series = series.diff().dropna()
        diff_order += 1
    return series, diff_order

# Make price stationary
price_stationary, price_diff_order = make_stationary(price_df)
features_stationary = pd.DataFrame()

# Make features stationary
features_stationary_order = []

for col in features_df.columns:
    stationary_series, diff_order = make_stationary(features_df[col])
    features_stationary[col] = stationary_series
    features_stationary_order.append(diff_order)

# Ensure the index of features_stationary matches the original
features_stationary.index = stationary_series.index

# Create a DataFrame with the differencing orders
diff_order_df = pd.DataFrame({'Feature': features_df.columns, 'Differencing_Order': features_stationary_order})

# Display results
print("Stationary Features DataFrame:")
print(features_stationary.head())
print("\nDifferencing Orders:")
print(diff_order_df)

Stationary Features DataFrame:
                sopr  sopr_less_155  sopr_account_based  sopr_adjusted
timestamp                                                             
2023-01-01  0.993143       0.995300                 NaN       0.982933
2023-01-02  0.997344       1.001048            0.006909       0.993102
2023-01-03  0.987792       0.995558            0.002656       0.971039
2023-01-04  0.985428       0.999432           -0.023498       0.968075
2023-01-05  0.990950       0.997645            0.070376       0.978652

Differencing Orders:
              Feature  Differencing_Order
0                sopr                   0
1       sopr_less_155                   0
2  sopr_account_based                   1
3       sopr_adjusted                   0


Define & Run the Lag Value

In [68]:
def optimize_lag(price, feature, max_lag=10):
    # Remove NaN and inf values
    data = pd.concat([price, feature], axis=1).replace([np.inf, -np.inf], np.nan).dropna()
    
    if len(data) <= max_lag:
        return 1, [np.inf]  # Return default values if series is too short
    
    aic_values = []
    for lag in range(1, max_lag + 1):
        try:
            # Use feature lags to predict price
            model = AutoReg(data.iloc[:, 0], lags=lag, exog=data.iloc[:, 1])
            results = model.fit()
            aic_values.append(aic(results.llf, results.nobs, results.df_model))
        except:
            aic_values.append(np.inf)  # Use inf for failed models
    
    best_lag = np.argmin(aic_values) + 1
    return best_lag, aic_values

# Assuming price_stationary is your stationary price series
# Optimize lag for each feature and store AIC values
optimal_lags = {}
aic_values = {}
for col in features_stationary.columns:
    feature = features_stationary[col].copy()  # Create a copy to avoid modifying original data
    if feature.isnull().all():  # Check if the entire series is NaN
        optimal_lags[col] = 1
        aic_values[col] = [np.inf] * 10  # Assuming max_lag=10
    else:
        optimal_lags[col], aic_values[col] = optimize_lag(price_stationary, feature)

# Create DataFrames for optimal lags and AIC values
optimal_lags_df = pd.DataFrame.from_dict(optimal_lags, orient='index', columns=['Optimal_Lag'])
aic_values_df = pd.DataFrame(aic_values)

# Display results
print("Optimal Lags:")
print(optimal_lags_df)
print("\nAIC Values:")
print(aic_values_df.head())  # Showing only the first few rows of AIC values

Optimal Lags:
                    Optimal_Lag
sopr                         10
sopr_less_155                10
sopr_account_based           10
sopr_adjusted                10

AIC Values:
          sopr  sopr_less_155  sopr_account_based  sopr_adjusted
0  1547.970564    1532.602220         1539.936070    1548.449266
1  1535.088421    1519.452480         1526.315531    1535.596822
2  1522.206525    1505.680682         1513.151697    1522.703606
3  1509.365401    1489.806539         1499.155766    1509.819438
4  1495.320113    1474.583304         1485.706816    1495.692608
