In [79]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from linearmodels.panel import PanelOLS


In [80]:
# Load the dataset from a CSV file; the path is relative to the notebook's
# working directory, so the notebook must be run from the project root
# (or wherever 'data/clean/' lives).
df = pd.read_csv('data/clean/filtered_data.csv')

In [81]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112698 entries, 0 to 112697
Data columns (total 48 columns):
 #   Column                                                                          Non-Null Count   Dtype  
---  ------                                                                          --------------   -----  
 0   Date                                                                            112698 non-null  object 
 1   Price                                                                           112698 non-null  float64
 2   Twitter Followers 24h                                                           112698 non-null  float64
 3   Gini Index                                                                      112698 non-null  float64
 4   Age Consumed                                                                    112698 non-null  float64
 5   Dormant Circulation (90d)                                                       112698 non-null  float64
 6   Acti

In [82]:
def _signed_log1p(values):
    """Return sign(x) * log(1 + |x|) element-wise.

    Unlike a plain log1p, this keeps the sign of negative inputs and is
    defined for every real value.
    """
    magnitude = np.log1p(values.abs())
    return np.sign(values) * magnitude


# Every column except the identifier columns 'Date' and 'ticker' is transformed
# (this includes the dependent variable 'Price').
cols_to_transform = df.columns.drop(['Date', 'ticker'])

# Overwrite each selected column with its signed-log value.
# NOTE: this mutates df in place, so running the cell twice applies the
# transform twice.
for col in cols_to_transform:
    df[col] = _signed_log1p(df[col])


In [83]:
# Parse the 'Date' column into pandas datetime values
df['Date'] = df['Date'].pipe(pd.to_datetime)


In [84]:

# Re-inspect the frame after the transform and date-parsing cells; 'Date'
# should now report a datetime dtype instead of object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112698 entries, 0 to 112697
Data columns (total 48 columns):
 #   Column                                                                          Non-Null Count   Dtype              
---  ------                                                                          --------------   -----              
 0   Date                                                                            112698 non-null  datetime64[ns, UTC]
 1   Price                                                                           112698 non-null  float64            
 2   Twitter Followers 24h                                                           112698 non-null  float64            
 3   Gini Index                                                                      112698 non-null  float64            
 4   Age Consumed                                                                    112698 non-null  float64            
 5   Dormant Circulation (90d)     

In [85]:
# Multicollinearity screening and panel assembly.
#
# Steps:
#   1. Drop one variable from every pair whose absolute pairwise correlation
#      exceeds `correlation_threshold`.
#   2. Compute the VIF of each surviving variable once and drop those above
#      `vif_threshold`.
#   3. Combine 'Price' with the remaining regressors into `panel_data`,
#      indexed by (ticker, Date).

# Make the cell safe to re-run: after a previous run df is indexed by
# (ticker, Date), which would make the column drop below fail.
if {'ticker', 'Date'}.issubset(df.index.names):
    df = df.reset_index()

# Sort by ticker and Date. reset_index(drop=True) gives a clean 0..n-1
# RangeIndex, so row labels double as row positions further down.
df = df.sort_values(by=['ticker', 'Date']).reset_index(drop=True)

# Candidate regressors: everything except the dependent variable and the
# identifier columns.
independent_vars = df.drop(columns=['Price', 'Date', 'ticker'])

# Screening thresholds
correlation_threshold = 0.95
vif_threshold = 10

# Calculate the correlation matrix
correlation_matrix = independent_vars.corr()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is inspected once and self-correlation is ignored.
upper_triangle = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)

# A column is dropped when it is highly correlated with any earlier column.
exceeds_threshold = (upper_triangle.abs() > correlation_threshold).any()
columns_to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

# Drop the highly correlated variables, then any row with a missing regressor
# (the VIF regressions cannot handle NaNs).
independent_vars_filtered = independent_vars.drop(columns=columns_to_drop).dropna()

# NOTE(review): variance_inflation_factor does not add an intercept itself and
# this design matrix has no constant column, so these are uncentered VIFs.
# Consider adding a constant before computing them - confirm what is intended.
design_matrix = independent_vars_filtered.to_numpy()
vif_data = pd.DataFrame({
    'Variable': independent_vars_filtered.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, i)
        for i in range(design_matrix.shape[1])
    ],
})

# Identify variables whose VIF exceeds the threshold (single pass, not iterative)
high_vif_vars = vif_data.loc[vif_data['VIF'] > vif_threshold, 'Variable']

# Drop these variables from the dataset
independent_vars_filtered = independent_vars_filtered.drop(columns=high_vif_vars)

print('Dropped variables due to high VIF:')
print(list(high_vif_vars))
print('Remaining variables:')
print(list(independent_vars_filtered.columns))

# Positions of the rows that survived dropna(). Relabelling with the full
# df.index would raise a length mismatch as soon as a single row had been
# dropped, so only the surviving rows are selected and labelled.
kept_rows = independent_vars_filtered.index.to_numpy()

# Ensure MultiIndex is set for the main DataFrame
df = df.set_index(['ticker', 'Date'])

# Give the regressors the (ticker, Date) labels of exactly the rows they hold
price = df[['Price']].iloc[kept_rows]
independent_vars_filtered.index = price.index

# Combine dependent and independent variables, then drop any rows with NaNs
# (e.g. a missing Price)
panel_data = pd.concat([price, independent_vars_filtered], axis=1).dropna()

# Confirm the MultiIndex structure
print(panel_data.index.names)  # Should show ['ticker', 'Date']



Dropped variables due to high VIF:
['Age Consumed', 'Dormant Circulation (90d)', 'Active Addresses 24h', 'Circulation', 'Circulation (1d)', 'Circulation (30d)', 'Circulation (180d)', 'Transaction Count', 'Transaction Volume USD', 'Velocity', 'Mean Coin Age', 'Mean Coin Age (90d)', 'Mean Coin Age (365d)', 'Mean Dollar Invested Age (90d)', 'Mean Dollar Invested Age (365d)', 'NVT Ratio (with Circulation)', 'Realized Cap', 'Realized Cap (30d)', 'Total Supply in Profit', 'Percent of Total Supply in Profit', 'Supply on Exchanges', 'Supply on Exchanges (as % of total supply)', 'Whale Transaction Count (>100k USD)', 'Whale Transaction Count (>1m USD)', 'Percent of Stablecoin Total Supply held by Whales with more than 5 million USD']
Remaining variables:
['Twitter Followers 24h', 'Gini Index', 'The Ratio of Daily On-Chain Transaction Volume in Profit to Loss', 'Mean Dollar Invested Age', 'MVRV Long/Short Difference', 'MVRV Ratio', 'MVRV Ratio (1d)', 'MVRV Ratio (7d)', 'MVRV Ratio (90d)', 'MVRV 

In [86]:
# Display the assembled panel (pandas truncates the rendering of long frames
# to the first and last rows).
panel_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Twitter Followers 24h,Gini Index,The Ratio of Daily On-Chain Transaction Volume in Profit to Loss,Mean Dollar Invested Age,MVRV Long/Short Difference,MVRV Ratio,MVRV Ratio (1d),MVRV Ratio (7d),MVRV Ratio (90d),MVRV Ratio (Z score),Network Realized Profit/Loss,Realized Cap HODL Waves (0d to 1d),Realized Cap HODL Waves (30d to 60d),Exchange Flow Balance
ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAVE,2024-09-27 00:00:00+00:00,4.943897,6.137727,0.686513,0.645766,6.443252,0.234398,0.332413,-0.028062,-0.099285,0.123538,0.396857,15.901135,0.804582,1.694243,-10.305385
AAVE,2024-09-28 00:00:00+00:00,4.943897,-3.135494,0.686513,0.645766,6.443252,0.234398,0.332413,-0.028062,-0.099285,0.123538,0.396857,15.901135,0.804582,1.694243,-10.305385
AAVE,2024-09-29 00:00:00+00:00,4.943897,5.303305,0.686513,0.645766,6.443252,0.234398,0.332413,-0.028062,-0.099285,0.123538,0.396857,15.901135,0.804582,1.694243,-10.305385
AAVE,2024-09-30 00:00:00+00:00,4.943897,5.937536,0.686513,0.645766,6.443252,0.234398,0.332413,-0.028062,-0.099285,0.123538,0.396857,15.901135,0.804582,1.694243,-10.305385
AAVE,2024-10-01 00:00:00+00:00,4.943897,5.541264,0.686513,0.645766,6.443252,0.234398,0.332413,-0.028062,-0.099285,0.123538,0.396857,15.901135,0.804582,1.694243,-10.305385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZRX,2024-11-16 00:00:00+00:00,0.358314,-5.347108,0.688197,0.891546,6.835094,-0.332062,-0.221164,0.026249,0.002590,0.148644,-0.615031,14.949580,0.334728,1.207860,-14.004095
ZRX,2024-11-17 00:00:00+00:00,0.358314,-4.521789,0.688197,0.891546,6.835094,-0.332062,-0.221164,0.026249,0.002590,0.148644,-0.615031,14.949580,0.334728,1.207860,-14.004095
ZRX,2024-11-18 00:00:00+00:00,0.358314,5.123964,0.688197,0.891546,6.835094,-0.332062,-0.221164,0.026249,0.002590,0.148644,-0.615031,14.949580,0.334728,1.207860,-14.004095
ZRX,2024-11-19 00:00:00+00:00,0.358314,5.262690,0.688197,0.891546,6.835094,-0.332062,-0.221164,0.026249,0.002590,0.148644,-0.615031,14.949580,0.334728,1.207860,-14.004095


In [None]:
# Split the panel into the regressand ('Price') and the regressors (all other columns)
dependent = panel_data['Price']
independent = panel_data.drop(columns='Price')

# Append an explicit intercept column
independent['constant'] = 1

# Entity fixed-effects regression (one effect per first index level, i.e. ticker),
# fitted with the library's default covariance estimator
fixed_effects_model = PanelOLS(dependent, independent, entity_effects=True)
results = fixed_effects_model.fit()

# Print the estimation summary table
print(results.summary)


# TODO: check again; add control variables volume and token age

                          PanelOLS Estimation Summary                           
Dep. Variable:                  Price   R-squared:                        0.2038
Estimator:                   PanelOLS   R-squared (Between):              0.0103
No. Observations:              112698   R-squared (Within):               0.2038
Date:                Thu, Nov 21 2024   R-squared (Overall):              0.0403
Time:                        14:09:54   Log-likelihood                -7.376e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      2058.4
Entities:                          93   P-value                           0.0000
Avg Obs:                       1211.8   Distribution:               F(14,112591)
Min Obs:                       55.000                                           
Max Obs:                       4750.0   F-statistic (robust):             2058.4
                            