In [None]:
import os
from ib_async import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
import dcor
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict
import itertools
from time import sleep
import csv
from scipy.optimize import minimize
from fredapi import Fred
import pandas_datareader.data as web
import math
import re
import ast


# Notebook-wide pandas display settings: never truncate cell text, never hide
# columns, and cap rendered rows at 50.
for _option, _value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.max_rows', 50),
):
    pd.set_option(_option, _value)

In [None]:
# Prep functions
def evaluate_literal(val):
    """Parse ``val`` as a Python literal, falling back to ``val`` unchanged.

    Args:
        val: A CSV cell value. Only strings are parsed; anything else
            (numbers, NaN, ...) is returned as-is.

    Returns:
        The object produced by ``ast.literal_eval`` when ``val`` is a string
        holding a valid literal (list, dict, number, ...), otherwise ``val``.
    """
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval raises TypeError for e.g. unhashable dict keys and
        # Memory/RecursionError for pathological input; treat all of them as
        # "not a literal" so one odd cell cannot abort a whole CSV load.
        return val
    
def load(path):
    """Read the CSV at ``path`` and decode literal-looking cells.

    Every cell of every column is passed through ``evaluate_literal``, so
    strings such as ``"[1, 2]"`` come back as Python objects while anything
    that is not a literal is left untouched.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    for column in frame.columns:
        decoded = frame[column].apply(evaluate_literal)
        frame[column] = decoded
    return frame

In [None]:
# Data-source selector: uncomment exactly one of the following.
# kind = 'midpoint'
kind = 'trades'
# kind = 'indices'

# Root folder of each supported data source.
_ROOTS = {
    'midpoint': 'data/daily-midpoint/',
    'trades': 'data/daily-trades/',
    'indices': 'data/indices/',
}
if kind not in _ROOTS:
    # Fail here rather than with a NameError on `root` further down.
    raise ValueError(f"Unknown kind {kind!r}; expected one of {sorted(_ROOTS)}")
root = _ROOTS[kind]

data_path = root + 'series/'
verified_path = root + 'verified_files.csv'

# Column used as the price: 'average' for trades/indices, 'close' otherwise.
price_col = 'average' if kind in ['trades', 'indices'] else 'close'

In [None]:
# Verify files
# A series file counts as verified when its symbol/currency is present in the
# fundamentals table, the identifier returned by IB equals the fundamentals
# ISIN, and the long name does not look leveraged. The result is cached in
# `verified_path`; the IB connection is only opened when that cache is missing.
fund_df = load('data/fundamentals.csv')

try:
    verified_df = pd.read_csv(verified_path)
except FileNotFoundError:
    util.startLoop()
    ib = IB()
    ib.connect('127.0.0.1', 7497, clientId=2)

    verified_files = []
    try:
        file_list = os.listdir(data_path)

        for file_name in tqdm(file_list, total=len(file_list), desc="Verifying files"):
            if not file_name.endswith('.csv'):
                continue
            try:
                # Expected file name: SYMBOL-EXCHANGE-CURRENCY.csv
                symbol, exchange, currency = file_name.replace('.csv', '').split('-')
                symbol_data = fund_df[(fund_df['symbol'] == symbol) & (fund_df['currency'] == currency)]
                if symbol_data.empty:
                    continue

                contract_details = ib.reqContractDetails(Stock(symbol, exchange, currency))
                if not contract_details:
                    continue
                sec_ids = contract_details[0].secIdList
                if not sec_ids:
                    # No security identifiers returned: nothing to compare against.
                    continue
                # NOTE(review): assumes the first secIdList entry is the ISIN — confirm.
                isin = sec_ids[0].value

                if symbol_data['isin'].iloc[0] != isin:
                    continue

                instrument_name = symbol_data['longName'].iloc[0].replace('-', '').replace('+', '')
                # Leveraged when a word is "<n>X" with n > 1, or starts with "lv"/"lev".
                leveraged = any(
                    (re.fullmatch(r'\d+X', word) and int(word[:-1]) > 1)
                    or word.lower().startswith(('lv', 'lev'))
                    for word in instrument_name.split()
                )
                if leveraged:
                    continue

                verified_files.append({'symbol': symbol, 'currency': currency})
            except ValueError as e:
                print(f"Invalid filename format {file_name}: {e}")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    finally:
        # Always release the API connection, even if the loop is interrupted.
        ib.disconnect()

    # Explicit columns keep the cached CSV readable (header present) even when
    # no file passed verification.
    verified_df = pd.DataFrame(verified_files, columns=['symbol', 'currency'])
    verified_df.to_csv(verified_path, index=False)

### Merge historical series with fundamentals

In [None]:
def clean_series(df, price_column=None):
    """Return a date-sorted two-column copy of a raw price series.

    Args:
        df: Raw series with at least a ``'date'`` column and the price column.
            The frame is not modified.
        price_column: Name of the price column. Defaults to the notebook-level
            ``price_col`` when omitted.

    Returns:
        A new DataFrame with columns ``['date', price_column]``: dates parsed
        with ``pd.to_datetime``, rows sorted by date with a fresh index, and
        prices converted to numeric (unparseable values become NaN).
    """
    if price_column is None:
        price_column = price_col

    # Ensure dates and numeric types (assign() works on a copy, so the
    # caller's frame keeps its original 'date' column).
    cleaned = (
        df.assign(date=pd.to_datetime(df['date']))
        .sort_values('date')
        .reset_index(drop=True)
    )

    # for col in ['volume', price_col]:
    #     df[col] = pd.to_numeric(df[col], errors='coerce')

    cleaned[price_column] = pd.to_numeric(cleaned[price_column], errors='coerce')

    return cleaned[['date', price_column]]
    
    
    # # Basic consistency checks (NOT NECESSARY FOR TRADES DATA)
    # df['negatives'] = (df[price_col] < 0) | (df['volume'] < 0) # | (df['low'] < 0) | (df['high'] < 0)
    # df['inconsistent'] = (df['low'] > df['high']) | (df[price_col] < df['low']) | (df[price_col] > df['high'])
    
    # # Outlier detection
    # df['total_outlier'] = (df[price_col] > (df[price_col].median() + df[price_col].std() * z)) | (df[price_col] < (df[price_col].median() - df[price_col].std() * z)) 
    
    # rolling_median = df[price_col].rolling(window=window, center=True, min_periods=1).mean()
    # rolling_std = df[price_col].rolling(window=window, center=True, min_periods=1).std()
    # std_threshold = z * rolling_std
    # df['std_outlier'] = np.abs(df[price_col] - rolling_median) > std_threshold

    # rolling_mad = df[price_col].rolling(window=window, center=True, min_periods=1).apply(lambda x: np.mean(np.abs(x - np.median(x))), raw=True)
    # mad_threshold = z * rolling_mad
    # df['mad_outlier'] = np.abs(df[price_col] - rolling_median) > mad_threshold
    
    # rolling_iqr = df[price_col].rolling(window=window, center=True, min_periods=1).apply(lambda x: np.subtract(*np.percentile(x, [75, 25])), raw=True)
    # iqr_threshold = z * rolling_iqr
    # df['iqr_outlier'] = np.abs(df[price_col] - rolling_median) > iqr_threshold

    # df['all'] = df['std_outlier'] & df['mad_outlier'] & df['iqr_outlier']

    # return df, len(df), df['negatives'].sum(), df['inconsistent'].sum(), df['total_outlier'].sum(), df['std_outlier'].sum(), df['mad_outlier'].sum(), df['iqr_outlier'].sum(), df['all'].sum()

In [None]:
# Load historical series
# `latest` starts six years back and is raised to the newest date seen in any
# verified series.
latest = (datetime.now() - timedelta(days=365 * 6))
meta = []
file_list = os.listdir(data_path)
# Set lookup instead of scanning verified_df once per file.
verified_pairs = set(zip(verified_df['symbol'], verified_df['currency']))
for file in tqdm(file_list, total=len(file_list)):
    if not file.endswith('.csv'):
        continue
    parts = os.path.splitext(file)[0].split('-')
    if len(parts) < 3:
        # Not a SYMBOL-EXCHANGE-CURRENCY file name; skip instead of raising IndexError.
        continue
    symbol, exchange, currency = parts[:3]
    if (symbol, currency) not in verified_pairs:
        continue

    # Load and clean raw series
    df = pd.read_csv(data_path + file)
    df = clean_series(df)

    if df['date'].max() > latest:
        latest = df['date'].max()

    meta.append({
        'symbol': symbol,
        'exchange_api': exchange,
        'currency': currency,
        'df': df,
    })
meta = pd.DataFrame(meta)
latest

In [None]:
# Calculate series gap statistics
# Every series is re-indexed onto the business days of the six years ending at
# `latest`; runs of missing days on that calendar are the "gaps".
# NOTE(review): if `latest` carries a time-of-day, the generated dates will too
# and will not match date-only series on merge — confirm it is normalised.
oldest = latest - pd.Timedelta(days=365 * 6)
business_days = pd.date_range(start=oldest, end=latest, freq='B')
business_days = pd.DataFrame({'date': business_days})

# Calculate statistics for each DataFrame in meta
for idx, row in tqdm(meta.iterrows(), total=len(meta)):
    df = row['df']
    merged = pd.merge(business_days, df, on='date', how='left')

    # Positions (on the business-day calendar) that have a price
    present = merged[price_col].notna()
    present_idx = np.flatnonzero(present)
    length = len(merged)

    # Sentinels at -1 and `length` turn leading, internal and trailing runs of
    # missing days into plain differences between neighbouring positions; a
    # series with no prices yields one gap of `length`.
    boundaries = np.concatenate(([-1], present_idx, [length]))
    gaps = np.diff(boundaries) - 1
    gaps = gaps[gaps > 0]
    max_gap = float(gaps.max()) if gaps.size > 0 else 0.0
    missing = length - present.sum()
    pct_missing = missing / length

    # Update meta with statistics (the stored frame becomes the re-indexed one)
    meta.at[idx, 'df'] = merged
    meta.at[idx, 'max_gap'] = max_gap
    meta.at[idx, 'missing'] = missing
    meta.at[idx, 'pct_missing'] = pct_missing

In [None]:
# Remove bad series and interpolate/extrapolate
# Score = share of missing days + log-scaled longest gap (normalised to [0, 1]).
meta['max_gap_log'] = np.log1p(meta['max_gap'])
meta['max_gap_log'] = meta['max_gap_log'] / meta['max_gap_log'].max()
meta['exclusion_score'] = meta['pct_missing'] + meta['max_gap_log']

# Keep only series that beat the average on both criteria.
condition = ((meta['max_gap_log'] < meta['max_gap_log'].mean()) &
             (meta['pct_missing'] < meta['pct_missing'].mean()))
filtered = meta[condition].sort_values(by='exclusion_score', ascending=False).copy()

# Interpolate/extrapolate price column
for idx, row in tqdm(filtered.iterrows(), total=len(filtered)):
    # DataFrame.copy() does not copy objects stored in cells, so copy the nested
    # frame explicitly; otherwise the frames held by `meta` are modified too.
    df = row['df'].copy()
    prices = df[price_col].interpolate(method='akima', limit_direction='both')
    # Fill whatever the interpolation left missing from the nearest known price
    # (no-op when nothing is missing).
    df[price_col] = prices.ffill().bfill()

    df['pct_change'] = df[price_col].pct_change()
    filtered.at[idx, 'df'] = df

filtered = pd.merge(filtered, fund_df, on=['symbol', 'currency'], how='inner').drop(['max_gap', 'missing', 'pct_missing', 'exclusion_score', 'max_gap_log'], axis=1)

In [None]:
# # Manual plot series
# df = meta.iloc[0]['df'].copy()
# # display(df)

# # Step 5: Forward fill missing values (optional, adjust as needed)
# # df[price_col] = df[price_col].fillna(0)

# plt.figure(figsize=(10, 6))
# plt.plot(df['date'], df[price_col], marker='o')
# plt.xlim(df['date'].min(), df['date'].max())
# plt.ylim(0, df[price_col].max()*1.1)
# plt.show()

In [None]:
# # Delete duplicates
# duplicates = meta[meta.duplicated(subset=['symbol', 'currency'], keep=False)].copy()
# duplicates['not_smart'] = duplicates['exchange_api'] != 'SMART'

# sorted_duplicates = duplicates.sort_values(
#     by=['symbol', 'currency', 'length', 'not_smart'],
#     ascending=[True, True, False, False]
# )

# rows_to_keep = sorted_duplicates.groupby(['symbol', 'currency']).head(1)
# rows_to_delete = duplicates[~duplicates.index.isin(rows_to_keep.index)]
# for idx, row in rows_to_delete.iterrows():
#     file_name = f"{row['symbol']}-{row['exchange_api']}-{row['currency']}.csv"
#     file_path = os.path.join(data_path, file_name)
#     if os.path.exists(file_path):
#         os.remove(file_path)
#         print(f"Deleted {file_path}")
#     else:
#         print(f"File not found: {file_path}")

# del duplicates, sorted_duplicates, rows_to_keep, rows_to_delete

# Plot asset class portfolios

In [None]:
# Prepare for market portfolios
# NOTE(review): this cell is an exact duplicate of the first statements of the
# next cell, which recomputes the same names; it looks safe to remove — confirm.
# Columns that are numeric but are identifiers/flags rather than features.
cols_to_exclude = ['length', 'conId', 'tradable']
# int64/float64 columns of `filtered`, minus the excluded ones.
numerical_cols = [col for col in filtered.columns if filtered[col].dtype in [np.int64, np.float64] and col not in cols_to_exclude]

# One 'pct_change' column per row of `filtered`, indexed by date and named
# after the row's symbol.
pct_changes = pd.concat(
        [row['df'].set_index('date')['pct_change'].rename(row['symbol']) 
        for _, row in filtered.iterrows()], axis=1
    )

In [None]:
# Prepare for market portfolios
# Numeric columns that are identifiers/flags rather than features.
cols_to_exclude = ['length', 'conId', 'tradable']
numerical_cols = [
    col
    for col in filtered.columns
    if col not in cols_to_exclude and filtered[col].dtype in [np.int64, np.float64]
]

# Wide table of returns: one column per row of `filtered`, indexed by date and
# named after the row's symbol.
pct_changes = pd.concat(
    [
        row['df'].set_index('date')['pct_change'].rename(row['symbol'])
        for _, row in filtered.iterrows()
    ],
    axis=1,
)

# Drop features that are zero for every instrument.
all_zero_cols = [col for col in numerical_cols if (filtered[col] == 0).all()]
filtered = filtered.drop(columns=all_zero_cols)
# filtered = filtered.dropna(axis=1)

# Same selection as above, minus the columns that were just dropped.
numerical_cols = [col for col in numerical_cols if col not in all_zero_cols]

In [None]:
# Plot all holding types
# For each holding column (plus the unfiltered 'total') build a cap-weighted
# portfolio, print its total market cap and plot its price path from 100.
holding_cols = [col for col in filtered.columns if col.startswith('holding_') and col != 'holding_types_variety'] + ['total']
for holding_col in holding_cols:
    # Exposure is the market cap, scaled by the holding column unless 'total'.
    if holding_col == 'total':
        exposure = filtered['profile_cap_usd']
    else:
        exposure = filtered['profile_cap_usd'] * filtered[holding_col]
    total_market_cap = exposure.sum()
    # 'weight' stays on `filtered` after the loop (values of the last holding_col).
    filtered['weight'] = exposure / total_market_cap
    # Format spec instead of a nested format(...) call: reusing '"' inside a
    # '"'-quoted f-string is a SyntaxError before Python 3.12.
    print(f"{holding_col}: {total_market_cap:,.2f}")

    # Weighted sum of every numerical column (only shown by the commented
    # display at the end of the loop).
    portfolio = filtered[numerical_cols].mul(filtered['weight'], axis=0).sum()

    # NOTE(review): assumes symbols are unique in `filtered` so that the weights
    # align one-to-one with the columns of pct_changes — confirm.
    weights = filtered.set_index('symbol')['weight']
    portfolio_return = pct_changes.dot(weights)

    initial_price = 100
    portfolio_price = initial_price * (1 + portfolio_return.fillna(0)).cumprod()

    portfolio_df = pd.DataFrame({
        'date': portfolio_price.index,
        price_col: portfolio_price.values,
        'pct_change': portfolio_return.values
    }).reset_index(drop=True)

    plt.figure(figsize=(10, 6))
    plt.title(f"{holding_col.split('_')[-1]}")
    plt.plot(portfolio_df['date'], portfolio_df[price_col], marker='o')
    plt.show()
    # display(pd.DataFrame(portfolio))

# Factor analysis

In [None]:
# Display the merged series/fundamentals table (last-expression rich output).
filtered

In [None]:
# Rows ordered by the 'weight' column written by the last plotted holding type.
filtered.sort_values(by='weight', ascending=False)#[filtered['symbol'] == 'SPY']
# '''
# AveragCoupon : 1816
# AvgQuality : 1838
# debtors, maturity, and debt type: 1730
# '''

In [None]:
# Plot the cleaned price series of a single symbol as a visual sanity check.
symbol_test = 'ZSIL'

# First stored frame for the symbol (raises IndexError if the symbol is absent).
series_df = filtered.loc[filtered['symbol'] == symbol_test, 'df'].iloc[0]
x = series_df['date']
# Use the configured price column instead of a hard-coded 'average', so the
# cell also works when `kind` selects a source priced on 'close'.
y = series_df[price_col]

plt.figure(figsize=(10, 6))
plt.plot(x, y, marker='o')
# plt.xlim(market_portfolio_df['date'].min(), market_portfolio_df['date'].max())
plt.show()

In [None]:
import numpy as np

def brownian_bridge(t, t0, t1, x0, x1, sigma, rng=None):
    """Sample a Brownian-bridge value at time ``t`` between two pinned points.

    The mean is the straight line through ``(t0, x0)`` and ``(t1, x1)``; the
    variance is ``sigma**2 * (t1 - t) * (t - t0) / (t1 - t0)``, which is zero
    at both end points, so the end values are reproduced exactly.

    Args:
        t: Time at which to sample.
        t0, t1: Times of the two pinned end points (must differ).
        x0, x1: Values at ``t0`` and ``t1``.
        sigma: Noise scale of the bridge.
        rng: Optional ``numpy.random.Generator`` for reproducible draws; the
            global ``np.random`` state is used when omitted.

    Returns:
        The sampled value.

    Raises:
        ValueError: If ``t0 == t1``.
    """
    dt = t1 - t0
    if dt == 0:
        raise ValueError("brownian_bridge requires t0 != t1")
    mu = x0 + (x1 - x0) * (t - t0) / dt  # Linear interpolation for mean
    variance = sigma**2 * (t1 - t) * (t - t0) / dt
    draw = np.random.normal if rng is None else rng.normal
    return mu + draw(0, np.sqrt(variance))

# Example
np.random.seed(42)  # fixed seed so the printed variances are reproducible
t = np.linspace(0, 10, 11)  # Original time points
prices = np.random.normal(100, 5, len(t))  # Simulated price series
sigma = np.std(prices)  # Standard deviation of the series (bridge noise scale)

# Interpolate to finer grid
t_new = np.linspace(0, 10, 21)  # New time points
prices_new = np.zeros(len(t_new))

# Copy original points and interpolate gaps.
# Both interval ends are included in each pass: the bridge variance is zero
# there, so the values written at the original time points carry no noise.
for i in range(len(t) - 1):
    idx = np.where((t_new >= t[i]) & (t_new <= t[i+1]))[0]
    for j in idx:
        prices_new[j] = brownian_bridge(t_new[j], t[i], t[i+1], prices[i], prices[i+1], sigma)

# Verify variance
print("Original variance:", np.var(prices))
print("Interpolated variance:", np.var(prices_new))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import CubicSpline

def brownian_bridge(t, t0, t1, x0, x1, sigma, rng=None):
    """Sample a Brownian-bridge value at time ``t`` between two pinned points.

    NOTE(review): this function is also defined in the previous cell; the
    definition executed last wins, so keep only one of them.

    The mean is the straight line through ``(t0, x0)`` and ``(t1, x1)``; the
    variance is ``sigma**2 * (t1 - t) * (t - t0) / (t1 - t0)``, which is zero
    at both end points, so the end values are reproduced exactly.

    Args:
        t: Time at which to sample.
        t0, t1: Times of the two pinned end points (must differ).
        x0, x1: Values at ``t0`` and ``t1``.
        sigma: Noise scale of the bridge.
        rng: Optional ``numpy.random.Generator`` for reproducible draws; the
            global ``np.random`` state is used when omitted.

    Returns:
        The sampled value.

    Raises:
        ValueError: If ``t0 == t1``.
    """
    dt = t1 - t0
    if dt == 0:
        raise ValueError("brownian_bridge requires t0 != t1")
    mu = x0 + (x1 - x0) * (t - t0) / dt  # Linear interpolation for mean
    variance = sigma**2 * (t1 - t) * (t - t0) / dt
    draw = np.random.normal if rng is None else rng.normal
    return mu + draw(0, np.sqrt(variance))

# Generate original price series
np.random.seed(42)  # fixed seed so the printed variances and the plot are reproducible
t = np.linspace(0, 10, 11)  # Original time points
prices = np.random.normal(100, 5, len(t))  # Simulated price series
sigma = np.std(prices)  # Standard deviation for Brownian bridge

# Interpolate to finer grid
t_new = np.linspace(0, 10, 21)  # New time points
prices_new = np.zeros(len(t_new))  # Brownian bridge interpolation

# Brownian bridge interpolation (interval ends are included; the bridge
# variance is zero there, so original points are written without noise)
for i in range(len(t) - 1):
    idx = np.where((t_new >= t[i]) & (t_new <= t[i+1]))[0]
    for j in idx:
        prices_new[j] = brownian_bridge(t_new[j], t[i], t[i+1], prices[i], prices[i+1], sigma)

# Linear interpolation
prices_lin = np.interp(t_new, t, prices)

# Spline interpolation
spline = CubicSpline(t, prices)
prices_spl = spline(t_new)

# Verify variances
print("Original variance:", np.var(prices))
print("Brownian bridge variance:", np.var(prices_new))
print("Linear interpolation variance:", np.var(prices_lin))
print("Spline interpolation variance:", np.var(prices_spl))

# Plotting
plt.figure(figsize=(10, 6))
plt.plot(t, prices, 'o-', label='Original Prices', markersize=8)
plt.plot(t_new, prices_new, 'x-', label='Brownian Bridge Interpolation')
plt.plot(t_new, prices_lin, 's-', label='Linear Interpolation')
plt.plot(t_new, prices_spl, 'd-', label='Spline Interpolation')
plt.title('Price Series Interpolation Comparison')
plt.xlabel('Time')
plt.ylabel('Price')
plt.grid(True)
plt.legend()

plt.show()

In [None]:
# Display the Brownian-bridge interpolated values from the previous cell.
prices_new

In [None]:
# Graph correlations
import seaborn as sns
import matplotlib.pyplot as plt


# NOTE: this cell rebinds `cols_to_exclude`, `numerical_cols` and `corr_df`,
# which other cells of the notebook also assign.
cols_to_exclude = ['length', 'conId']
numerical_cols = [
    col
    for col in meta.columns
    if meta[col].dtype in [np.int64, np.float64, bool] and col not in cols_to_exclude
]

# Pairwise correlation of the numeric/boolean columns of `meta`
corr_df = meta[numerical_cols].corr()
# corr_df.dropna(axis=1, how='all', inplace=True)
# corr_df.dropna(axis=0, how='all', inplace=True)

plt.figure(figsize=(50, 50))
sns.heatmap(corr_df, cmap='coolwarm')
plt.show()

---
### Prep historical data
---

In [None]:
# # Test day gap
# dfs = {}
# for file in os.listdir(data_path):
#     symbol = os.path.splitext(file)[0].split('-')[0]
#     if symbol in verified_files:
#         dfs[symbol] = pd.read_csv(data_path + file)

# days, nums, lens, firsts = [], [], [], []
# for day in range(5,30):
#     days.append(day)

#     melted_dfs = []
#     expected_returns = {}
#     for symbol, df in tqdm(dfs.items(), total=len(dfs), desc=f'{day}'):
#         df = melt(df)
#         df['date'] = pd.to_datetime(df['date'])

#         latest_date = df['date'].iloc[-1]
#         earliest_date = df['date'].iloc[0]
#         length_required = pd.to_datetime('2020-02-01')
#         month_ago = datetime.today() - timedelta(days=30)

#         dates = df['date'].unique()
#         date_diffs = dates[1:] - dates[:-1]
        
#         if latest_date >= month_ago and earliest_date <= length_required and not (date_diffs > pd.Timedelta(days=day)).any():
#             df['symbol'] = symbol
#             df['pct_change'] = df['value'].pct_change()
#             expected_returns[symbol] = df['pct_change'].mean()
#             melted_dfs.append(df)
#     # print(f'Loaded {len(melted_dfs)} out of {len(file_list)} series ({round(len(melted_dfs)/len(file_list)*100, 4)}%)')

#     # Concatenate and pivot data
#     returns_df = pd.concat(melted_dfs, ignore_index=True)
#     returns_df = returns_df.pivot(index=['date', 'kind'], columns='symbol', values='pct_change')
#     returns_df = returns_df.sort_values(by=['date', 'kind'], ascending=[True, False]).reset_index().dropna()
#     lens.append(len(returns_df))
#     nums.append(len(returns_df.columns))
#     firsts.append(returns_df.date.iloc[0])

# gap_data_df = pd.DataFrame({
#     'day_gap': days,
#     'num_etfs': nums,
#     'period_length': lens,
#     'first_day':firsts})

# gap_data_df

In [None]:
# Load and prepare historical training data
# def melt(data_df, value_columns=None):
#     if not value_columns:
#         value_columns = ['open', 'close']
#     id_columns = [col for col in data_df.columns.to_list() if col not in value_columns]
#     melted_df = data_df.melt(id_vars=id_columns, value_vars=value_columns, var_name='kind', value_name='value')
#     return melted_df.sort_values(by=['date'], ascending=[True, False]).reset_index(drop=True)

# Load historical data and merge them all into one df
# Membership is tested against the symbols of `verified_df`. The previous test
# used `verified_files`, a list of dicts that only exists when verification was
# re-run in this session, so no symbol could ever match.
verified_symbols = set(verified_df['symbol'])
dfs = {}
file_list = os.listdir(data_path)
for file in file_list:
    symbol = os.path.splitext(file)[0].split('-')[0]
    if symbol in verified_symbols:
        dfs[symbol] = pd.read_csv(data_path + file)


# Filter dfs and calc pct_change. ASSUMES that dfs are sorted chronologically
training_start_date = pd.to_datetime('2020-02-01')
month_ago = datetime.today() - timedelta(days=31)

day_gap = 6 # SET ACCEPTABLE DAY GAP

melted_dfs, expected_returns = [], {}
for symbol, df in tqdm(dfs.items(), total=len(dfs), desc=f'Filtering {kind} dfs'):
    df['date'] = pd.to_datetime(df['date'])

    latest_date = df['date'].iloc[-1]
    earliest_date = df['date'].iloc[0]
    dates = df['date'].unique()
    date_gaps = dates[1:] - dates[:-1]

    # Indices are always kept; other series must be recent, start on or before
    # the training start date and have no gap longer than `day_gap` days.
    if (kind == 'indices') or (latest_date >= month_ago and earliest_date <= training_start_date and (date_gaps <= pd.Timedelta(days=day_gap)).all()):
        df['symbol'] = symbol
        # Configured price column instead of a hard-coded 'average'.
        df['pct_change'] = df[price_col].pct_change()
        expected_returns[symbol] = df['pct_change'].mean()
        melted_dfs.append(df)
print(f'Loaded {len(melted_dfs)} out of {len(file_list)} series ({round(len(melted_dfs)/len(file_list)*100, 4)}%)')

# Concatenate and pivot data: one row per date, one column per symbol
returns_df = pd.concat(melted_dfs, ignore_index=True)
returns_df = returns_df.pivot(index=['date'], columns='symbol', values='pct_change')
returns_df = returns_df.sort_values(by=['date'], ascending=[True]).reset_index()

# Define training boundaries: everything up to one year ago, complete rows only
training_cutoff_date = datetime.today() - timedelta(days=365)
training_df = returns_df[returns_df['date'] <= training_cutoff_date]
training_matrix = training_df.drop(['date'], axis=1).dropna().copy()

In [None]:
# Calculate risk-free-rate for training window
# Window: the year ending at the training cutoff.
rf_window_start = training_cutoff_date - timedelta(days=365)

treasury_rate = web.DataReader('DGS10', 'fred', rf_window_start, training_cutoff_date)
# Select the column before averaging so the rate is a scalar, not a
# one-element Series that propagates into every Sharpe-ratio computation.
nominal_rf_rate = treasury_rate['DGS10'].mean() / 100

# The API key is read from the environment; never commit it to the notebook.
# (A key that was previously committed should be revoked.)
fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError("Set the FRED_API_KEY environment variable to query FRED.")
fred = Fred(api_key=fred_api_key)
cpi_data = fred.get_series('CPIAUCSL', rf_window_start, training_cutoff_date) # CPI
inflation_rate = (cpi_data.iloc[-1] - cpi_data.iloc[0]) / cpi_data.iloc[0]

# Fisher relation: real rate from nominal rate and inflation.
# NOTE(review): this is a yearly rate, while `expected_returns` are means of
# per-row pct_change values — confirm the units match where they are combined.
real_rf_rate = (1 + nominal_rf_rate) / (1 + inflation_rate) - 1

In [None]:
# Calculate corr and cov for historical training data
# Pairwise distance correlation / distance covariance (dcor) between the
# return columns of the training matrix.
training_array = training_matrix.values
symbol_list = training_matrix.columns.tolist()
num_symbols = len(symbol_list)
corr_matrix = np.zeros((num_symbols, num_symbols))
cov_matrix = np.zeros((num_symbols, num_symbols))

# Only the upper triangle (diagonal included) is computed; each value is
# mirrored into the lower triangle.
for i in tqdm(range(num_symbols), total=num_symbols, desc="Calculating distance stats sqr"):
    column_i = training_array[:, i]
    for j in range(i, num_symbols):
        stats = dcor.distance_stats(column_i, training_array[:, j])
        corr_matrix[i, j] = corr_matrix[j, i] = stats.correlation_xy
        cov_matrix[i, j] = cov_matrix[j, i] = stats.covariance_xy

corr_df = pd.DataFrame(corr_matrix, index=symbol_list, columns=symbol_list)
cov_df = pd.DataFrame(cov_matrix, index=symbol_list, columns=symbol_list)

# Saved without the index: the loading cell relies on the default integer index.
corr_df.to_csv(f'{root}corr_df.csv', index=False)
cov_df.to_csv(f'{root}cov_df.csv', index=False)

---
### Compute ETF combinations based on optimal k_clusters
---

In [None]:
# Load corr and cov
corr_df = pd.read_csv(f'{root}corr_df.csv')
cov_df = pd.read_csv(f'{root}cov_df.csv')
symbol_list = corr_df.columns

# The files carry symbols only as column headers; rows get the default integer
# index. Map symbol <-> row position and relabel the columns with positions so
# both axes use the same integer labels.
symbol2index = dict(zip(corr_df.columns, corr_df.index))
index2symbol = dict(zip(corr_df.index, corr_df.columns))
corr_df = corr_df.rename(columns=symbol2index)
cov_df = cov_df.rename(columns=symbol2index)

# Distance = 1 - correlation, with a zero diagonal.
distance_matrix = 1 - corr_df.to_numpy()
np.fill_diagonal(distance_matrix, 0)

In [None]:
# Thresholds / cluster_num graphs
# Available linkages: 'single', 'ward', 'average', 'complete', 'weighted',
# 'centroid', 'median' — only 'ward' is plotted.
methods = ['ward']
for method in methods:
    condensed = squareform(distance_matrix)
    linked = sch.linkage(condensed, method=method)

    # Column 2 of the linkage matrix is the merge distance of each step.
    thresholds = linked[:, 2]
    num_clusters = range(len(corr_df), 1, -1)

    # inertias = []
    # for n_clusters in num_clusters:
    #     cluster_labels = fcluster(linked, t=n_clusters, criterion='maxclust')
    #     inertia = 0
    #     for cluster in np.unique(cluster_labels):
    #         members = distance_matrix.values[cluster_labels == cluster]
    #         centroid = members.mean(axis=0)  # Cluster centroid
    #         inertia += np.sum((members - centroid) ** 2)
    #     inertias.append(inertia)

    # plt.figure(figsize=(12, 6))
    # plt.plot(num_clusters, inertias, marker='o', label=f"Method {method}")
    # plt.title(f"Inertia/Num ({method})")
    # plt.xlabel('Number of Clusters')
    # plt.ylabel('Inertia (Sum of Squared Distances)')
    # plt.grid(True)
    # plt.legend()
    # plt.show()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(num_clusters, thresholds, marker='o')
    ax.set_title(f"Threshold/Num ({method})")
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Threshold (Distance)')
    ax.grid(True)
    plt.show()


In [None]:
# Silhouettes and dendrograms
def product(row):
    """Multiply together all values of the mapping ``row`` (1 when empty)."""
    return math.prod(row.values())

# Score k = 2 .. 8 (capped by the number of symbols) and keep, per k, the
# silhouette score and the size of every cluster.
ks, scores, counts = [], [], []
for k in range(2, min(len(distance_matrix), 9)):
    clusters = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(distance_matrix)
    unique_clusters, label_counts = np.unique(clusters, return_counts=True)
    ks.append(k)
    scores.append(silhouette_score(distance_matrix, clusters, metric='precomputed'))
    counts.append(dict(zip(unique_clusters, label_counts)))

silhouettes = pd.DataFrame({'k': ks, 'score': scores, 'counts': counts})
# Product of the cluster sizes for each k.
silhouettes['combitions'] = silhouettes['counts'].apply(product)
silhouettes = silhouettes.sort_values(by='score', ascending=False)
# k with the highest silhouette score
best_k = silhouettes['k'].iloc[0]

# best_k = 3

display(silhouettes)
# Available linkages: 'single', 'ward', 'average', 'complete', 'weighted',
# 'centroid', 'median' — only 'ward' is drawn.
methods = ['ward']
for method in methods:
    # Now compute the linkage using a condensed distance matrix
    condensed = squareform(distance_matrix)
    linked = sch.linkage(condensed, method=method)
    plt.figure(figsize=(20, 10))
    sch.dendrogram(linked, labels=corr_df.index, leaf_rotation=90)
    plt.title(f"Method {method}")
    plt.show()

---
### Calculate Minimum Variance Portfolios
---

In [None]:
# Portfolio Optimization Functions
def portfolio_variance(weights, cov_matrix):
    """Return the quadratic form ``weights' @ cov_matrix @ weights``."""
    weighted_cov = weights.T @ cov_matrix
    return weighted_cov @ weights

def portfolio_expected_return(weights, expected_returns_arr):
    """Return the weighted sum ``weights @ expected_returns_arr``."""
    expected = weights @ expected_returns_arr
    return expected

def minimize_portfolio_variance(cov_matrix, expected_returns_arr):
    """Find long-only weights (summing to 1) that minimise portfolio variance.

    Uses SLSQP starting from equal weights, each weight bounded to [0, 1].

    Returns:
        ``(weights, std, expected_return)`` for the optimum, or a tuple of
        three NaNs when the optimiser does not report success.
    """
    n_assets = len(cov_matrix)
    equal_weights = np.array([1/n_assets] * n_assets)
    weight_bounds = ((0, 1),) * n_assets
    fully_invested = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}

    result = minimize(
        portfolio_variance,
        equal_weights,
        args=(cov_matrix,),
        method='SLSQP',
        bounds=weight_bounds,
        constraints=fully_invested,
    )

    if not result.success:
        return (np.nan, np.nan, np.nan)

    best_weights = result.x
    # The objective value at the optimum is the portfolio variance.
    best_std = np.sqrt(result.fun)
    best_er = portfolio_expected_return(best_weights, expected_returns_arr)
    return (best_weights, best_std, best_er)

In [None]:
# Portfolio Optimization Functions
def compute_distance_sum(combination, distance_matrix):
    """Sum ``distance_matrix[i, j]`` over every unordered pair of ``combination``.

    Returns 0 when ``combination`` has fewer than two members.
    """
    return sum(
        distance_matrix[first, second]
        for first, second in itertools.combinations(combination, 2)
    )

def portfolio_variance(weights, cov_matrix):
    """Return the quadratic form ``weights' @ cov_matrix @ weights``.

    NOTE(review): also defined in the previous cell; keep a single definition.
    """
    weighted_cov = weights.T @ cov_matrix
    return weighted_cov @ weights

def portfolio_expected_return(weights, expected_returns_arr):
    """Return the weighted sum ``weights @ expected_returns_arr``.

    NOTE(review): also defined in the previous cell; keep a single definition.
    """
    expected = weights @ expected_returns_arr
    return expected

def sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate):
    """Return (expected return - risk_free_rate) / standard deviation."""
    excess_return = portfolio_expected_return(weights, expected_returns_arr) - risk_free_rate
    volatility = np.sqrt(portfolio_variance(weights, cov_matrix))
    return excess_return / volatility

def negative_sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate):
    """Negated Sharpe ratio, so that a minimiser maximises the Sharpe ratio."""
    ratio = sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate)
    return -ratio

def find_tangency_portfolio(cov_matrix, expected_returns_arr, risk_free_rate):
    """Find long-only weights (summing to 1) that maximise the Sharpe ratio.

    Uses SLSQP on the negated Sharpe ratio, starting from equal weights with
    each weight bounded to [0, 1].

    Returns:
        ``(weights, std, expected_return, sharpe)`` for the optimum, or a
        tuple of four NaNs when the optimiser does not report success.
    """
    n_assets = len(cov_matrix)
    equal_weights = np.array([1/n_assets] * n_assets)
    weight_bounds = ((0, 1),) * n_assets
    fully_invested = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}

    result = minimize(
        negative_sharpe_ratio,
        equal_weights,
        args=(expected_returns_arr, cov_matrix, risk_free_rate),
        method='SLSQP',
        bounds=weight_bounds,
        constraints=fully_invested,
    )

    if not result.success:
        return (np.nan, np.nan, np.nan, np.nan)

    best_weights = result.x
    best_std = np.sqrt(portfolio_variance(best_weights, cov_matrix))
    best_er = portfolio_expected_return(best_weights, expected_returns_arr)
    # The objective is the negated Sharpe ratio; flip the sign back.
    best_sharpe = -(result.fun)
    return (best_weights, best_std, best_er, best_sharpe)
    
def sort_top_combinations(array, sort_index):
    """Sort, in place, the rows whose ``sort_index`` column is not NaN.

    Those rows are reordered by descending ``sort_index`` value among the
    positions they already occupy; rows with NaN in that column stay where
    they are. The same array object is returned.
    """
    has_value = ~np.isnan(array[:, sort_index])
    rows_with_value = array[has_value]
    if rows_with_value.size == 0:
        return array
    descending = np.argsort(rows_with_value[:, sort_index])[::-1]
    array[has_value] = rows_with_value[descending]
    return array

In [None]:
# Evaluate the tangency portfolio of every best_k-sized combination.
# Row layout: [0:k] symbol indices | [k:2k] weights | 2k std | 2k+1 expected
# return | 2k+2 Sharpe ratio | 2k+3 sum of pairwise distances.
# NOTE(review): this allocates one row per possible combination; the next cell
# keeps only the top rows instead.
num_symbols = len(corr_df.index)
num_metrics = best_k*2 + 4
num_combinations = math.comb(num_symbols, best_k)
combination_array = np.empty((num_combinations, num_metrics), dtype='float32')

index_indicator = best_k + best_k + 3  # column of the distance sum

# Calculate distance sums and populate the NumPy array
for i, combination in tqdm(enumerate(itertools.combinations(range(0,num_symbols), best_k)), total=num_combinations, desc="Calculating distance sums"):
    members = list(combination)
    combination_array[i, :best_k] = combination
    combination_cov_df = cov_df.loc[members, members]
    combination_expected_returns = np.array([expected_returns[index2symbol[index]] for index in combination])

    # The result is (weights array, std, er, sharpe); it cannot be assigned to
    # the slice as one ragged tuple, so write weights and scalars separately.
    # On failure `weights` is a scalar NaN and broadcasts over its columns.
    weights, std, er, sharpe = find_tangency_portfolio(combination_cov_df, combination_expected_returns, real_rf_rate)
    combination_array[i, best_k:best_k*2] = weights
    combination_array[i, best_k*2:index_indicator] = (std, er, sharpe)
    combination_array[i, index_indicator] = compute_distance_sum(combination, distance_matrix)

    # population growth rate



# TODO - not to be sorted by best_k
sorted_indices = np.argsort(combination_array[:, best_k], kind='mergesort')[::-1]
combination_array = combination_array[sorted_indices]
del sorted_indices
combination_array, len(combination_array)

In [None]:
# Keep the `top_n` combinations with the highest rating
# (rating = Sharpe ratio x sum of pairwise distances).
# Row layout: [0:k] symbol indices | [k:2k] weights | std | er | sharpe | rating.
num_symbols = len(corr_df.index)
num_metrics = best_k*2 + 4
num_combinations_possible = math.comb(num_symbols, best_k)

top_n = 5000 # Define how many top combinations to keep

top_combinations_array = np.full((top_n, num_metrics), np.nan, dtype='float32')
rows_filled = 0

# cov_df's integer row/column labels equal positions, so plain array indexing
# selects the same sub-matrix as a label lookup.
cov_values = cov_df.to_numpy()

for combination_tuple in tqdm(itertools.combinations(range(0,num_symbols), best_k), total=num_combinations_possible, desc="Calculating Tangency Portfolios"):
    members = list(combination_tuple)
    combination_cov = cov_values[np.ix_(members, members)]
    combination_expected_returns = np.array([expected_returns[index2symbol[index]] for index in combination_tuple])
    weights, std, er, sharpe = find_tangency_portfolio(combination_cov, combination_expected_returns, real_rf_rate)
    rating = sharpe * compute_distance_sum(combination_tuple, distance_matrix)

    # A failed optimisation has a NaN rating. Storing it would waste a slot and,
    # if it ended up in the row compared against, block every later replacement.
    if np.isnan(rating):
        continue

    if rows_filled < top_n:
        slot = rows_filled
        rows_filled += 1
    else:
        # All rows are filled and rated: replace the current worst if beaten.
        slot = int(np.argmin(top_combinations_array[:, -1]))
        if rating <= top_combinations_array[slot, -1]:
            continue

    top_combinations_array[slot, :best_k] = combination_tuple
    top_combinations_array[slot, best_k:best_k*2] = weights
    top_combinations_array[slot, best_k*2: num_metrics] = [std, er, sharpe, rating]

# Sort once at the end, also when fewer than top_n rows were filled.
top_combinations_array = sort_top_combinations(top_combinations_array, -1)


In [None]:
# Remove NaN rows before further processing (unfilled slots have NaN weights)
top_combinations_array_cleaned = top_combinations_array[~np.isnan(top_combinations_array[:, best_k])]

# Rows are ranked by the rating column (last column), and there may be fewer
# than top_n of them, so report the actual count and criterion.
print(f"Top {len(top_combinations_array_cleaned)} Combinations by Rating:")
for row in top_combinations_array_cleaned:
    combination_indices = row[:best_k].astype(int)
    asset_symbols = [index2symbol[index] for index in combination_indices]
    # asset_symbols = [index for index in combination_indices]
    weights = row[best_k:best_k*2]
    std, er, sharpe, rating = row[best_k*2], row[best_k*2 + 1], row[best_k*2 + 2], row[-1]
    print(f"Assets: {asset_symbols}, Weights: {weights}, Std Dev: {std:.4f}, Expected Return: {er:.4f}, Sharpe Ratio: {sharpe:.4f}, Rating: {rating:.4f}")