# Creating the correlation field

## Libraries

In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import numpy as np
pd.set_option('display.max_rows', None)

In [67]:
# Function to load JSON
def loadJSON(filepath):
    with open(filepath) as file:
        return json.load(file)

In [68]:
df = pd.DataFrame(loadJSON('../../data/json/tokens-pre-final.json'))

print(df.shape)
df.head()

(76804, 10)


Unnamed: 0,Date,Name,Token,Class,TVL,Price,Market cap,tvl_timestamp,price_timestamp,cap_timestamp
0,24/09/2023,Bitcoin,BTC,4,161061900.0,26261.44,517968300000.0,1695514000.0,1695514000.0,1695514000.0
1,25/09/2023,Bitcoin,BTC,4,154684500.0,26297.93,511787700000.0,1695600000.0,1695600000.0,1695600000.0
2,26/09/2023,Bitcoin,BTC,4,153647800.0,26212.59,512103400000.0,1695686000.0,1695686000.0,1695686000.0
3,27/09/2023,Bitcoin,BTC,4,148104600.0,26360.2,511054300000.0,1695773000.0,1695773000.0,1695773000.0
4,28/09/2023,Bitcoin,BTC,4,147845400.0,27027.2,513713600000.0,1695859000.0,1695859000.0,1695859000.0


## Calculation correlation between pairs

In [69]:
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Set Date as index (for rolling operations)
df.set_index('Date', inplace=True)

# Group by 'Token' and calculate rolling correlation for each pair of columns
def calculate_correlation(group):
    correlation_results = {}
    correlation_results['TVL_Price'] = group['TVL'].rolling(window=15).corr(group['Price'])
    correlation_results['TVL_Market_Cap'] = group['TVL'].rolling(window=15).corr(group['Market cap'])
    correlation_results['Price_Market_Cap'] = group['Price'].rolling(window=15).corr(group['Market cap'])
    
    return pd.DataFrame(correlation_results)

# Apply the function to each group
correlation_df = df.groupby('Token').apply(calculate_correlation).reset_index(level=0, drop=True)

# Combine the correlation results with the original DataFrame
df = pd.concat([df, correlation_df], axis=1)

# Fill NaN values with 0 (or handle as needed)
df.fillna(0, inplace=True)
df.replace([np.inf, -np.inf], 0, inplace=True)

# Reset the index to make Date a regular column again
df.reset_index(inplace=True)

  correlation_df = df.groupby('Token').apply(calculate_correlation).reset_index(level=0, drop=True)


In [70]:
# Checking the generated values
df.head(50)

Unnamed: 0,Date,Name,Token,Class,TVL,Price,Market cap,tvl_timestamp,price_timestamp,cap_timestamp,TVL_Price,TVL_Market_Cap,Price_Market_Cap
0,2023-09-24,Bitcoin,BTC,4,161061900.0,26261.44,517968300000.0,1695514000.0,1695514000.0,1695514000.0,0.0,0.0,0.0
1,2023-09-25,Bitcoin,BTC,4,154684500.0,26297.93,511787700000.0,1695600000.0,1695600000.0,1695600000.0,0.0,0.0,0.0
2,2023-09-26,Bitcoin,BTC,4,153647800.0,26212.59,512103400000.0,1695686000.0,1695686000.0,1695686000.0,0.0,0.0,0.0
3,2023-09-27,Bitcoin,BTC,4,148104600.0,26360.2,511054300000.0,1695773000.0,1695773000.0,1695773000.0,0.0,0.0,0.0
4,2023-09-28,Bitcoin,BTC,4,147845400.0,27027.2,513713600000.0,1695859000.0,1695859000.0,1695859000.0,0.0,0.0,0.0
5,2023-09-29,Bitcoin,BTC,4,153470200.0,26909.4,526917700000.0,1695946000.0,1695946000.0,1695946000.0,0.0,0.0,0.0
6,2023-09-30,Bitcoin,BTC,4,153421400.0,26964.27,524946700000.0,1696032000.0,1696032000.0,1696032000.0,0.0,0.0,0.0
7,2023-10-01,Bitcoin,BTC,4,152966700.0,27994.51,525936300000.0,1696118000.0,1696118000.0,1696118000.0,0.0,0.0,0.0
8,2023-10-02,Bitcoin,BTC,4,159918500.0,27507.22,545302100000.0,1696205000.0,1696205000.0,1696205000.0,0.0,0.0,0.0
9,2023-10-03,Bitcoin,BTC,4,166990500.0,27428.66,538323500000.0,1696291000.0,1696291000.0,1696291000.0,0.0,0.0,0.0


## Calculating the average correlation

In [71]:
# Calculating the average correlation
df['correlation_average'] = df[['TVL_Price', 'TVL_Market_Cap', 'Price_Market_Cap']].mean(axis=1)

df.head(20)

Unnamed: 0,Date,Name,Token,Class,TVL,Price,Market cap,tvl_timestamp,price_timestamp,cap_timestamp,TVL_Price,TVL_Market_Cap,Price_Market_Cap,correlation_average
0,2023-09-24,Bitcoin,BTC,4,161061900.0,26261.44,517968300000.0,1695514000.0,1695514000.0,1695514000.0,0.0,0.0,0.0,0.0
1,2023-09-25,Bitcoin,BTC,4,154684500.0,26297.93,511787700000.0,1695600000.0,1695600000.0,1695600000.0,0.0,0.0,0.0,0.0
2,2023-09-26,Bitcoin,BTC,4,153647800.0,26212.59,512103400000.0,1695686000.0,1695686000.0,1695686000.0,0.0,0.0,0.0,0.0
3,2023-09-27,Bitcoin,BTC,4,148104600.0,26360.2,511054300000.0,1695773000.0,1695773000.0,1695773000.0,0.0,0.0,0.0,0.0
4,2023-09-28,Bitcoin,BTC,4,147845400.0,27027.2,513713600000.0,1695859000.0,1695859000.0,1695859000.0,0.0,0.0,0.0,0.0
5,2023-09-29,Bitcoin,BTC,4,153470200.0,26909.4,526917700000.0,1695946000.0,1695946000.0,1695946000.0,0.0,0.0,0.0,0.0
6,2023-09-30,Bitcoin,BTC,4,153421400.0,26964.27,524946700000.0,1696032000.0,1696032000.0,1696032000.0,0.0,0.0,0.0,0.0
7,2023-10-01,Bitcoin,BTC,4,152966700.0,27994.51,525936300000.0,1696118000.0,1696118000.0,1696118000.0,0.0,0.0,0.0,0.0
8,2023-10-02,Bitcoin,BTC,4,159918500.0,27507.22,545302100000.0,1696205000.0,1696205000.0,1696205000.0,0.0,0.0,0.0,0.0
9,2023-10-03,Bitcoin,BTC,4,166990500.0,27428.66,538323500000.0,1696291000.0,1696291000.0,1696291000.0,0.0,0.0,0.0,0.0


In [72]:
# Checking basic stats
df['correlation_average'].describe()

count    76804.000000
mean         0.456985
std          0.358797
min         -0.381497
25%          0.113398
50%          0.491288
75%          0.802636
max          0.996469
Name: correlation_average, dtype: float64

## Generating the Correlation field

In [73]:
# Defining the threshold
threshold = 0.8
# Generating the column
df['Correlation'] = (df['correlation_average'] >= threshold).astype(int)

In [74]:
df.head(20)

Unnamed: 0,Date,Name,Token,Class,TVL,Price,Market cap,tvl_timestamp,price_timestamp,cap_timestamp,TVL_Price,TVL_Market_Cap,Price_Market_Cap,correlation_average,Correlation
0,2023-09-24,Bitcoin,BTC,4,161061900.0,26261.44,517968300000.0,1695514000.0,1695514000.0,1695514000.0,0.0,0.0,0.0,0.0,0
1,2023-09-25,Bitcoin,BTC,4,154684500.0,26297.93,511787700000.0,1695600000.0,1695600000.0,1695600000.0,0.0,0.0,0.0,0.0,0
2,2023-09-26,Bitcoin,BTC,4,153647800.0,26212.59,512103400000.0,1695686000.0,1695686000.0,1695686000.0,0.0,0.0,0.0,0.0,0
3,2023-09-27,Bitcoin,BTC,4,148104600.0,26360.2,511054300000.0,1695773000.0,1695773000.0,1695773000.0,0.0,0.0,0.0,0.0,0
4,2023-09-28,Bitcoin,BTC,4,147845400.0,27027.2,513713600000.0,1695859000.0,1695859000.0,1695859000.0,0.0,0.0,0.0,0.0,0
5,2023-09-29,Bitcoin,BTC,4,153470200.0,26909.4,526917700000.0,1695946000.0,1695946000.0,1695946000.0,0.0,0.0,0.0,0.0,0
6,2023-09-30,Bitcoin,BTC,4,153421400.0,26964.27,524946700000.0,1696032000.0,1696032000.0,1696032000.0,0.0,0.0,0.0,0.0,0
7,2023-10-01,Bitcoin,BTC,4,152966700.0,27994.51,525936300000.0,1696118000.0,1696118000.0,1696118000.0,0.0,0.0,0.0,0.0,0
8,2023-10-02,Bitcoin,BTC,4,159918500.0,27507.22,545302100000.0,1696205000.0,1696205000.0,1696205000.0,0.0,0.0,0.0,0.0,0
9,2023-10-03,Bitcoin,BTC,4,166990500.0,27428.66,538323500000.0,1696291000.0,1696291000.0,1696291000.0,0.0,0.0,0.0,0.0,0


In [75]:
# Checking the amount of values
df['Correlation'].value_counts()

Correlation
0    57335
1    19469
Name: count, dtype: int64

In [76]:
# Just selecting the important features
final_df = df[['Date', 'Token', 'Name', 'TVL', 'Market cap', 'Price', 'Correlation', 'Class']]
final_df.head()

Unnamed: 0,Date,Token,Name,TVL,Market cap,Price,Correlation,Class
0,2023-09-24,BTC,Bitcoin,161061900.0,517968300000.0,26261.44,0,4
1,2023-09-25,BTC,Bitcoin,154684500.0,511787700000.0,26297.93,0,4
2,2023-09-26,BTC,Bitcoin,153647800.0,512103400000.0,26212.59,0,4
3,2023-09-27,BTC,Bitcoin,148104600.0,511054300000.0,26360.2,0,4
4,2023-09-28,BTC,Bitcoin,147845400.0,513713600000.0,27027.2,0,4


In [77]:
# Converting to date
final_df['Date'] = pd.to_datetime(final_df['Date']).dt.strftime('%d-%m-%Y')

final_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Date'] = pd.to_datetime(final_df['Date']).dt.strftime('%d-%m-%Y')


Unnamed: 0,Date,Token,Name,TVL,Market cap,Price,Correlation,Class
0,24-09-2023,BTC,Bitcoin,161061900.0,517968300000.0,26261.44,0,4
1,25-09-2023,BTC,Bitcoin,154684500.0,511787700000.0,26297.93,0,4
2,26-09-2023,BTC,Bitcoin,153647800.0,512103400000.0,26212.59,0,4
3,27-09-2023,BTC,Bitcoin,148104600.0,511054300000.0,26360.2,0,4
4,28-09-2023,BTC,Bitcoin,147845400.0,513713600000.0,27027.2,0,4


In [78]:
# Saving in a JSON file
final_df.to_json('../../data/json/tokens-final.json', orient='records', indent=4)