In [None]:
import os
from ib_async import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
import dcor
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict
import itertools
from time import sleep
import csv
from scipy.optimize import minimize
from fredapi import Fred
import pandas_datareader.data as web
import math
import re
import ast
import traceback


# Notebook-wide pandas display settings: never truncate cell text or hide
# columns, but cap the number of rows rendered per frame.
for _option_name, _option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.max_rows', 50),
):
    pd.set_option(_option_name, _option_value)

In [None]:
# Prep functions
def evaluate_literal(val):
    """Parse ``val`` as a Python literal when possible, else return it unchanged.

    Args:
        val: A cell value. Only strings are parsed; any other object is
            returned as-is.

    Returns:
        The object produced by ``ast.literal_eval`` (list, dict, number, ...)
        or ``val`` itself when it is not a string or not a valid literal.
    """
    if not isinstance(val, str):
        # literal_eval only rejects non-string values anyway; skip the round trip
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers text such as "{[1]: 2}" (unhashable dict key), which
        # parses fine but cannot be materialised
        return val
    
def load(path):
    """Read the CSV at ``path`` and decode Python literals stored as text.

    Every cell of every column is passed through ``evaluate_literal``.
    Returns the resulting DataFrame.
    """
    frame = pd.read_csv(path)
    for column_name in frame.columns:
        decoded = frame[column_name].apply(evaluate_literal)
        frame[column_name] = decoded
    return frame

In [None]:
# Data flavour to analyse; must be one of the keys of `_ROOT_BY_KIND`.
# kind = 'midpoint'
kind = 'trades'
# kind = 'indices'

# Root folder of the downloaded series for each supported flavour.
_ROOT_BY_KIND = {
    'midpoint': 'data/daily-midpoint/',
    'trades': 'data/daily-trades/',
    'indices': 'data/indices/',
}
if kind not in _ROOT_BY_KIND:
    # Fail here rather than with a NameError on `root` further down
    raise ValueError(f"Unknown kind {kind!r}; expected one of {sorted(_ROOT_BY_KIND)}")
root = _ROOT_BY_KIND[kind]

data_path = root + 'series/'
verified_path = root + 'verified_files.csv'

# Trades and index series use the 'average' column as price, midpoint series use 'close'.
price_col = 'average' if kind in ('trades', 'indices') else 'close'

In [None]:
# Verify files
fund_df = load('data/fundamentals.csv')

try:
    verified_df = pd.read_csv(verified_path)
except FileNotFoundError:
    # No cached verification result yet: check every series file against IB
    # and the fundamentals table, then cache the outcome at `verified_path`.
    util.startLoop()
    ib = IB()
    ib.connect('127.0.0.1', 7497, clientId=2)

    try:
        file_list = os.listdir(data_path)
        verified_files = []

        for file_name in tqdm(file_list, total=len(file_list), desc="Verifying files"):
            if not file_name.endswith('.csv'):
                continue
            try:
                # Expected name: '<symbol>-<exchange>-<currency>.csv'. Any other number
                # of dash-separated parts raises ValueError, reported below.
                symbol, exchange, currency = os.path.splitext(file_name)[0].split('-')
                symbol_data = fund_df[(fund_df['symbol'] == symbol) & (fund_df['currency'] == currency)]
                if symbol_data.empty:
                    continue

                contract_details = ib.reqContractDetails(Stock(symbol, exchange, currency))
                if not contract_details:
                    continue
                # NOTE(review): takes the first secIdList entry and treats it as the ISIN — confirm
                isin = contract_details[0].secIdList[0].value

                if symbol_data['isin'].iloc[0] != isin:
                    continue

                # Skip leveraged products: a word like '2X'/'3X' (factor > 1) or one
                # starting with 'lv'/'lev' in the instrument's long name.
                instrument_name = symbol_data['longName'].iloc[0].replace('-', '').replace('+', '')
                leveraged = any(
                    (re.fullmatch(r'\d+X', word) and int(word[:-1]) > 1)
                    or word.lower().startswith(('lv', 'lev'))
                    for word in instrument_name.split()
                )
                if leveraged:
                    continue

                verified_files.append({'symbol': symbol, 'currency': currency})
            except ValueError as e:
                print(f"Invalid filename format {file_name}: {e}")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

        # Explicit columns keep the cached CSV readable (and `verified_df['symbol']`
        # valid) even when no file passed verification.
        verified_df = pd.DataFrame(verified_files, columns=['symbol', 'currency'])
        verified_df.to_csv(verified_path, index=False)
    finally:
        # Release the IB connection even if listing or saving fails
        ib.disconnect()

### Merge historical series with fundamentals

In [None]:
# def ensure_series_types(df, price_col):
#     # Ensure dates and numeric types
#     df['date'] = pd.to_datetime(df['date'])
#     df = df.sort_values('date').reset_index(drop=True)
#     for col in ['volume', price_col]:
#         df[col] = pd.to_numeric(df[col], errors='coerce')

#     return df

# def local_pattern_filter(df, price_col, max_stale_days=10):
#     # Compute row-level patterns
#     df['invalid_prices'] = df[price_col] <= 0
#     df['inconsistent'] = (df['low'] > df['high'])# | (df[price_col] < df['low']) | (df[price_col] > df['high'])
#     df['stale_price'] = df[price_col].rolling(window=max_stale_days).std() == 0
#     df['stale_volume'] = df['volume'].rolling(window=max_stale_days).std() == 0
    
#     # Compute aggregate flags
#     invalid_prices_flag = df['invalid_prices'].any()
#     inconsistent_flag = df['inconsistent'].any()
#     stale_price_flag = df['stale_price'].any()
#     stale_volume_flag = df['stale_volume'].any()
    
#     return df, invalid_prices_flag, stale_price_flag, stale_volume_flag, inconsistent_flag


# # def check_consistency(df, window=14, z=3):
# #     # Basic consistency checks (NOT NECESSARY FOR TRADES DATA)
# #     df['negatives'] = (df[price_col] < 0) | (df['volume'] < 0) # | (df['low'] < 0) | (df['high'] < 0)
# #     df['inconsistent'] = (df['low'] > df['high']) | (df[price_col] < df['low']) | (df[price_col] > df['high'])
    
# #     # Outlier detection
# #     df['total_outlier'] = (df[price_col] > (df[price_col].median() + df[price_col].std() * z)) | (df[price_col] < (df[price_col].median() - df[price_col].std() * z)) 
    
# #     rolling_median = df[price_col].rolling(window=window, center=True, min_periods=1).mean()
# #     rolling_std = df[price_col].rolling(window=window, center=True, min_periods=1).std()
# #     std_threshold = z * rolling_std
# #     df['std_outlier'] = np.abs(df[price_col] - rolling_median) > std_threshold

# #     rolling_mad = df[price_col].rolling(window=window, center=True, min_periods=1).apply(lambda x: np.mean(np.abs(x - np.median(x))), raw=True)
# #     mad_threshold = z * rolling_mad
# #     df['mad_outlier'] = np.abs(df[price_col] - rolling_median) > mad_threshold
    
# #     rolling_iqr = df[price_col].rolling(window=window, center=True, min_periods=1).apply(lambda x: np.subtract(*np.percentile(x, [75, 25])), raw=True)
# #     iqr_threshold = z * rolling_iqr
# #     df['iqr_outlier'] = np.abs(df[price_col] - rolling_median) > iqr_threshold

# #     df['all'] = df['std_outlier'] & df['mad_outlier'] & df['iqr_outlier'] | df['negatives']

# #     return df, df['negatives'].sum(), df['inconsistent'].sum(), df['total_outlier'].sum(), df['std_outlier'].sum(), df['mad_outlier'].sum(), df['iqr_outlier'].sum(), df['all'].sum()

In [None]:
def ensure_series_types(df, price_col):
    """Return a date-sorted copy of ``df`` with parsed dates and numeric price/volume.

    Args:
        df: Raw series frame with at least ``date``, ``volume`` and ``price_col`` columns.
        price_col: Name of the price column to convert.

    Returns:
        A new DataFrame sorted by ``date`` with a fresh 0..n-1 index. ``date`` is
        converted with ``pd.to_datetime``; ``volume`` and ``price_col`` are converted
        with ``pd.to_numeric`` (unparseable values become NaN). The input frame is
        left untouched.
    """
    # Work on a copy so the caller's frame is not silently modified
    typed = df.copy()
    typed['date'] = pd.to_datetime(typed['date'])
    typed = typed.sort_values('date').reset_index(drop=True)
    for col in ['volume', price_col]:
        typed[col] = pd.to_numeric(typed[col], errors='coerce')
    return typed

def validate_raw_prices(df, price_col):
    """Return a copy of ``df`` without rows that carry impossible prices.

    A row is removed when its ``price_col`` value is zero or negative, or,
    if both ``low`` and ``high`` columns exist, when ``low`` exceeds ``high``.
    """
    if 'low' in df.columns and 'high' in df.columns:
        crossed_range = df['low'] > df['high']
    else:
        crossed_range = pd.Series(False, index=df.index)

    non_positive = df[price_col] <= 0
    keep_rows = ~(non_positive | crossed_range)
    return df[keep_rows].copy()

def handle_stale_periods(df, price_col, max_stale_days=5):
    """Thin out long runs of an unchanged price, keeping each run's end points.

    Consecutive rows with the same ``price_col`` value form a run. For runs
    longer than ``max_stale_days`` rows, every row except the first and the
    last of the run is removed. An empty frame is returned as-is; otherwise a
    filtered copy is returned.
    """
    # A new run starts wherever the price differs from the previous row
    run_ids = df[price_col].diff().ne(0).cumsum()
    if run_ids.empty:
        return df

    run_sizes = df.groupby(run_ids)[price_col].transform('size')
    not_first_of_run = run_ids.duplicated(keep='first')
    not_last_of_run = run_ids.duplicated(keep='last')
    interior_of_long_run = (run_sizes > max_stale_days) & not_first_of_run & not_last_of_run

    return df[~interior_of_long_run].copy()

In [None]:
# def adjust_for_splits(df, price_col, base_detect_tolerance=0.02, base_check_tolerance=0.15, window=3, volatility_mad_threshold=3.0, volatility_influence=0.1):
#     split_ratios_map = {
#         # Splits
#         -0.50: 2,
#         -0.666: 3,
#         -0.75: 4,
#         -0.80: 5,
#         # Reverse Splits
#         1.0: 0.5,
#         2.0: 1/3,
#         3.0: 1/4,
#         0.5: 2/3,
#     #    -0.333: 3/2
#     }
    
#     df = df.sort_values('date').reset_index(drop=True)
#     df['pct_change'] = df[price_col].pct_change()

#     all_pct_changes = df['pct_change'].dropna()
#     pct_change_median = all_pct_changes.median()
#     pct_change_mad = (all_pct_changes - pct_change_median).abs().median()
#     normal_range_lower = pct_change_median - volatility_mad_threshold * pct_change_mad
#     normal_range_upper = pct_change_median + volatility_mad_threshold * pct_change_mad
#     split_ratios_map = {
#         ideal_pct: ratio_val
#         for ideal_pct, ratio_val in split_ratios_map.items()
#         if not (normal_range_lower <= ideal_pct <= normal_range_upper)
#     }

#     detect_tolerance = base_detect_tolerance + (volatility_influence * pct_change_mad)
#     check_tolerance = base_check_tolerance + (volatility_influence * pct_change_mad)

#     for idx in range(1, len(df)):
#         actual_pct_change = df.loc[idx, 'pct_change']
#         if pd.isna(actual_pct_change):
#             continue

#         matched_ratio = None
#         for ideal_pct, ratio_val in split_ratios_map.items():
#             if np.isclose(actual_pct_change, ideal_pct, atol=detect_tolerance):
#                 matched_ratio = ratio_val
#                 break
        
#         if matched_ratio is not None:
#             plot_df(df)
#             price_at_split = df.loc[idx, price_col]
#             if idx + window < len(df):
#                 prices_after_event = df.loc[idx + 1 : idx + window, price_col]
#                 if prices_after_event.count() >= max(1, window - 1):
#                     median_price_after = prices_after_event.median()
#                     if np.isclose(median_price_after, price_at_split, rtol=check_tolerance, atol=price_at_split*check_tolerance):
#                         df.loc[:idx-1, price_col] /= matched_ratio
#                         if 'volume' in df.columns:
#                              df.loc[:idx-1, 'volume'] *= matched_ratio
#             plot_df(df)

#     df['pct_change'] = df[price_col].pct_change()
#     return df

# def plot_df(df):
#     plt.figure(figsize=(10, 6))
#     plt.plot(df['date'], df[price_col], marker='o')
#     plt.xlim(df['date'].min(), df['date'].max())
#     plt.ylim(0, df[price_col].max()*1.1)
#     plt.show()

In [None]:
# Load historical series
latest = (datetime.now() - timedelta(days=365 * 6))
meta = []
# Collect the verified (symbol, currency) pairs once instead of scanning
# `verified_df` again for every file.
verified_pairs = set(zip(verified_df['symbol'], verified_df['currency']))
file_list = os.listdir(data_path)
for file in tqdm(file_list, total=len(file_list)):
    if not file.endswith('.csv'):
        continue
    
    parts = os.path.splitext(file)[0].split('-')
    if len(parts) != 3:
        # Not a '<symbol>-<exchange>-<currency>.csv' name; skip it instead of
        # failing with IndexError or mis-assigning the parts
        continue
    symbol, exchange, currency = parts
    if (symbol, currency) not in verified_pairs:
        continue
    
    # Load and clean raw series
    # try:
    df = load(data_path + file)
    df = ensure_series_types(df, price_col)
    df = validate_raw_prices(df, price_col)
    df = handle_stale_periods(df, price_col)
    # df = adjust_for_splits(df, price_col)

    df['pct_change'] = df[price_col].pct_change()
    # Track the most recent date seen across all series
    if df['date'].max() > latest:
        latest = df['date'].max()

    meta.append({
        'symbol': symbol,
        'currency': currency,
        'exchange_api': exchange,
        'df': df[['date', price_col, 'volume', 'pct_change']],
    })
    # except Exception as e:
    #     print(f"ERROR {file}: {e}")
        
meta = pd.DataFrame(meta)
# Keep an untouched snapshot (including copies of the inner frames) for the RESET cell
copied = meta.copy()
copied['df'] = copied['df'].apply(lambda x: x.copy()) 

In [None]:
# RESET
# Rebuild `meta` from the snapshot, copying every inner frame so later cells
# can modify them without touching `copied`.
meta = copied.assign(df=copied['df'].apply(lambda frame: frame.copy()))

In [None]:
# Calculate series gap stats
oldest = latest - pd.Timedelta(days=365 * 6)
business_days = pd.date_range(start=oldest, end=latest, freq='B')

# Align every series to the common business-day calendar and measure its holes
for idx, row in tqdm(meta.iterrows(), total=len(meta)):
    df = row['df']
    merged = pd.DataFrame({'date': business_days}).merge(df, on='date', how='left')

    present = merged[price_col].notna()
    present_idx = np.flatnonzero(present)
    length = len(merged)

    # Sentinels at -1 and `length` make the leading and trailing runs of missing
    # rows ordinary differences; a series with no data yields one gap of `length`.
    boundaries = np.concatenate(([-1], present_idx, [length]))
    gaps = (np.diff(boundaries) - 1).astype(int)
    gaps = gaps[gaps > 0]

    has_gaps = gaps.size > 0
    max_gap = float(gaps.max()) if has_gaps else 0.0
    std_gap = float(gaps.std()) if has_gaps else 0.0
    missing = length - present.sum()
    pct_missing = missing / length

    # Store the calendar-aligned frame together with its gap statistics
    meta.at[idx, 'df'] = merged
    meta.at[idx, 'max_gap'] = max_gap
    meta.at[idx, 'missing'] = missing
    meta.at[idx, 'pct_missing'] = pct_missing

print(f'Latest: {latest}')
print(f'Oldest: {oldest}')

In [None]:
def detect_and_remove_global_outliers(meta_df, price_col, z_threshold=120.0):
    """Blank out price rows behind extreme returns, judged against all series at once.

    The ``pct_change`` columns of every frame in ``meta_df['df']`` are pooled
    (NaN, infinite and zero returns excluded) to obtain a global median and a
    global median absolute deviation (MAD). Within each frame, rows whose
    ``|pct_change - median| / MAD`` exceeds ``z_threshold`` are candidates.

    For each candidate row, the price of the row before it is compared with the
    mean/std of up to 10 prices preceding that row:

    * If it lies more than ``z_threshold / 10`` standard deviations away, the
      previous row is blanked. The candidate row is blanked as well when it is
      the last candidate of the frame, or when its own price deviates by the
      same margin from the prices following it.
    * Otherwise the candidate row itself is blanked.

    "Blanked" means price, volume, high, low and pct_change (those present) are
    set to NaN; no rows are dropped. Frames are modified in place and
    ``pct_change`` is recomputed for every frame that had candidates.

    Args:
        meta_df: Frame with a ``symbol`` column and a ``df`` column holding the
            per-series frames. Neighbouring rows are addressed as ``label - 1``,
            so the inner frames are assumed to carry a consecutive integer index.
        price_col: Name of the price column inside the per-series frames.
        z_threshold: Modified z-score above which a return is a candidate.

    Returns:
        dict mapping symbol to the ``describe()`` summary of its candidate
        z-scores, extended with a ``new_length`` entry (the frame length).
        Empty when there is nothing to evaluate.
    """
    if len(meta_df) == 0:
        # pd.concat refuses an empty list; there is nothing to check anyway
        return {}

    all_pct_changes = pd.concat(
        [row['df']['pct_change'] for _, row in meta_df.iterrows()],
        ignore_index=True
    )
    all_pct_changes = all_pct_changes.dropna()
    all_pct_changes = all_pct_changes[~np.isinf(all_pct_changes) & (all_pct_changes != 0)]

    global_median_return = all_pct_changes.median()
    global_mad = (all_pct_changes - global_median_return).abs().median()
    if not global_mad > 0:
        # NaN or zero MAD: z-scores would be NaN/infinite, so nothing can be judged
        return {}

    window = 10
    outlier_series = {}
    for idx, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
        df = row['df']
        if df['pct_change'].isnull().all():
            continue

        absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
        outlier_mask = absolute_modified_z > z_threshold
        
        if outlier_mask.any():
            data_dict = absolute_modified_z[outlier_mask].describe()

            cols_to_null = [price_col, 'volume', 'high', 'low', 'pct_change']
            cols_to_null = [c for c in cols_to_null if c in df.columns]

            candidate_indices = df.index[outlier_mask]
            last_candidate_idx = candidate_indices.max()
            for df_idx in candidate_indices:
                price_to_check_idx = df_idx - 1
                if price_to_check_idx not in df.index:
                    # Candidate on the first row: there is no previous price to
                    # inspect. Its return is reset by the pct_change recomputation below.
                    continue
                price_to_check = df.loc[price_to_check_idx, price_col]

                # Prices strictly before the previous row (label slices are inclusive)
                local_window_start = max(0, price_to_check_idx - window)
                local_window = df.loc[local_window_start : price_to_check_idx - 1, price_col].dropna()
                local_mean = local_window.mean()
                local_std = local_window.std()
                if local_std == 0: 
                    continue

                price_z_score = abs(price_to_check - local_mean) / local_std
                if price_z_score > z_threshold / 10:
                    # The previous price itself looks wrong
                    df.loc[price_to_check_idx, cols_to_null] = np.nan

                    price_to_check = df.loc[df_idx, price_col]

                    local_window_end = min(df_idx + window, last_candidate_idx)
                    if df_idx == last_candidate_idx:
                        df.loc[df_idx, cols_to_null] = np.nan
                    else:
                        local_window = df.loc[df_idx + 1: local_window_end, price_col].dropna()
                        local_mean = local_window.mean()
                        local_std = local_window.std()
                        if local_std == 0:
                            continue
                        price_z_score = abs(price_to_check - local_mean) / local_std
                        if price_z_score > z_threshold / 10:
                            df.loc[df_idx, cols_to_null] = np.nan
                else:
                    # Previous price is in line with its history (or cannot be
                    # scored), so the candidate row is the one removed
                    df.loc[df_idx, cols_to_null] = np.nan

            data_dict['new_length'] = len(df)
            outlier_series[row['symbol']] = data_dict
            
            df['pct_change'] = df[price_col].pct_change(fill_method=None)
            
            meta_df.at[idx, 'df'] = df

    return outlier_series

z_threshold = 120

# Pass the threshold explicitly so the value set in this cell is the one applied
modified_series_info = detect_and_remove_global_outliers(meta, price_col=price_col, z_threshold=z_threshold)
display(pd.DataFrame(modified_series_info).T)

# if modified_series_info:
#     modified_symbols_df = pd.DataFrame(modified_series_info).T
#     print(f"Modified {len(modified_symbols_df)} series by removing global outliers.")
# else:
#     print("No global outliers were found or removed.")

In [None]:
# Leftover debugging expression, disabled: `outlier_mask` is not defined at module
# level until a later cell runs, so this raised NameError on Restart & Run All.
# df.index[outlier_mask].max()

In [None]:
def global_return_filter(meta_df, z_threshold=120.0):
    """Find the series whose returns are extreme relative to all series pooled.

    The ``pct_change`` columns of all frames in ``meta_df['df']`` are pooled
    (NaN, infinite and zero values excluded) to compute a global median and
    median absolute deviation (MAD). A series is reported when the largest
    ``|pct_change - median| / MAD`` in its frame exceeds ``z_threshold``.

    Returns:
        Tuple ``(outlier_series, global_mad, global_median_return)`` where
        ``outlier_series`` maps symbol to the ``describe()`` summary of that
        series' z-scores.
    """
    pooled_returns = pd.concat(
        [frame['pct_change'] for frame in meta_df['df']],
        ignore_index=True
    ).dropna()
    finite_nonzero = ~np.isinf(pooled_returns) & (pooled_returns != 0)
    pooled_returns = pooled_returns[finite_nonzero]

    global_median_return = pooled_returns.median()
    global_mad = (pooled_returns - global_median_return).abs().median()

    outlier_series = {}
    symbol_frames = zip(meta_df['symbol'], meta_df['df'])
    for series_symbol, frame in tqdm(symbol_frames, total=len(meta_df)):
        z_scores = (frame['pct_change'] - global_median_return).abs() / global_mad
        if z_scores.max() > z_threshold:
            outlier_series[series_symbol] = z_scores.describe()

    return outlier_series, global_mad, global_median_return


z_threshold = 120

globally_defective_symbols, global_mad, global_median_return = global_return_filter(meta, z_threshold=z_threshold)
globally_defective_symbols = pd.DataFrame(globally_defective_symbols)
# display(meta[meta['symbol'].isin(globally_defective_symbols)])

meta_indexed = meta.set_index('symbol')
if globally_defective_symbols.empty:
    # Nothing flagged: the summary has no 'max' row to sort by, so there is nothing to plot
    symbols_to_plot = []
else:
    symbols_to_plot = globally_defective_symbols.T.sort_values(by='max', ascending=True).index.tolist()

window = 10
for symbol in symbols_to_plot:
    df = meta_indexed.loc[symbol, 'df'].copy()
    df = df.reset_index(drop=True)

    absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
    outlier_mask = absolute_modified_z > z_threshold

    # Decide, per extreme return, whether the previous price, this price or both
    # are the bad observations (same rules as the removal function, but only marking)
    corrected_outlier_mask = pd.Series(False, index=df.index)
    for df_idx in df.index[outlier_mask]:
        price_to_check_idx = df_idx - 1
        if price_to_check_idx not in df.index:
            # Extreme return on the first row: no previous price to inspect
            continue

        price_to_check = df.at[price_to_check_idx, price_col]

        local_window_start = max(0, price_to_check_idx - window)
        local_window = df.loc[local_window_start : price_to_check_idx - 1, price_col].dropna()
        local_mean = local_window.mean()
        local_std = local_window.std()
        if local_std == 0:
            continue
        price_z_score = abs(price_to_check - local_mean) / local_std

        if price_z_score > z_threshold / 10:
            corrected_outlier_mask.at[price_to_check_idx] = True

            price_to_check = df.at[df_idx, price_col]

            local_window_end = min(df_idx + window, df.index[outlier_mask].max())
            if df_idx == df.index[outlier_mask].max():
                corrected_outlier_mask.at[df_idx] = True
            else:
                local_window = df.loc[df_idx + 1: local_window_end, price_col].dropna()
                local_mean = local_window.mean()
                local_std = local_window.std()
                if local_std == 0:
                    continue
                price_z_score = abs(price_to_check - local_mean) / local_std
                if price_z_score > z_threshold / 10:
                    corrected_outlier_mask.at[df_idx] = True
        else:
            corrected_outlier_mask.at[df_idx] = True

    if corrected_outlier_mask.any():
        # Plotting
        plt.figure(figsize=(10, 6))
        plt.plot(df['date'], df[price_col], marker='o', label='Normal')
        plt.scatter(df.loc[corrected_outlier_mask, 'date'],
                    df.loc[corrected_outlier_mask, price_col],
                    color='red', label='Outlier', zorder=5)

        plt.title(f"Symbol: {symbol}")
        plt.xlabel("Date")
        plt.ylabel(price_col)
        plt.legend()
        plt.tight_layout()
        plt.show()


In [None]:
# Remove large gap series and merge with fund
# Exclusion score = share of missing days + longest gap (log-scaled, normalised to [0, 1])
meta['max_gap_log'] = np.log1p(meta['max_gap'])
meta['max_gap_log'] = meta['max_gap_log'] / meta['max_gap_log'].max()
meta['exclusion_score'] = meta['pct_missing'] + meta['max_gap_log']

# Keep only series that are better than average on both criteria
condition = ((meta['max_gap_log'] < meta['max_gap_log'].mean()) & 
             (meta['pct_missing'] < meta['pct_missing'].mean()))
filtered = meta[condition].sort_values(by='exclusion_score', ascending=False).copy()

# Interpolate/extrapolate price column
for idx, row in tqdm(filtered.iterrows(), total=len(filtered)):
    # `filtered` shares its inner frames with `meta`; copy so the raw series in
    # `meta` are not overwritten with interpolated prices
    df = row['df'].copy()
    df[price_col] = df[price_col].interpolate(method='akima', limit_direction='both')
    if df[price_col].isna().any():
        df[price_col] = df[price_col].ffill()
        df[price_col] = df[price_col].bfill()
    
    # Gaps are filled explicitly above, so pct_change needs no implicit padding
    df['pct_change'] = df[price_col].pct_change(fill_method=None)
    filtered.at[idx, 'df'] = df

filtered = pd.merge(filtered, fund_df, on=['symbol', 'currency'], how='inner').drop(['max_gap', 'missing', 'pct_missing', 'exclusion_score', 'max_gap_log'], axis=1)

In [None]:
# Manual plot series
# df = meta.iloc[0]['df'].copy()

# con = 252505259.0
# con = failed[16]
symbol = 'EL4A'

# df = filtered[filtered['conId'] == con]['df'].iloc[0]
# Copy so the 'outlier' column added below does not end up in the frame stored in `meta`
df = meta[meta['symbol'] == symbol]['df'].iloc[0].copy()
# df = copied[copied['symbol'] == symbol]['df'].iloc[0]
# display(df)

# Step 5: Forward fill missing values (optional, adjust as needed)
# df[price_col] = df[price_col].fillna(0)

# Plot inline: the `plot_df` helper only exists as commented-out code in this notebook
plt.figure(figsize=(10, 6))
plt.plot(df['date'], df[price_col], marker='o')
plt.xlim(df['date'].min(), df['date'].max())
plt.ylim(0, df[price_col].max()*1.1)
plt.show()

# Flag returns that are extreme relative to the global median/MAD
df['outlier'] = ((df['pct_change'] - global_median_return).abs() / global_mad) > z_threshold
df

In [None]:
# # Delete duplicates
# duplicates = meta[meta.duplicated(subset=['symbol', 'currency'], keep=False)].copy()
# duplicates['not_smart'] = duplicates['exchange_api'] != 'SMART'

# sorted_duplicates = duplicates.sort_values(
#     by=['symbol', 'currency', 'length', 'not_smart'],
#     ascending=[True, True, False, False]
# )

# rows_to_keep = sorted_duplicates.groupby(['symbol', 'currency']).head(1)
# rows_to_delete = duplicates[~duplicates.index.isin(rows_to_keep.index)]
# for idx, row in rows_to_delete.iterrows():
#     file_name = f"{row['symbol']}-{row['exchange_api']}-{row['currency']}.csv"
#     file_path = os.path.join(data_path, file_name)
#     if os.path.exists(file_path):
#         os.remove(file_path)
#         print(f"Deleted {file_path}")
#     else:
#         print(f"File not found: {file_path}")

# del duplicates, sorted_duplicates, rows_to_keep, rows_to_delete

# Plot asset class portfolios

In [None]:
# Risk-free series calculation
import pandas as pd
import pandas_datareader.data as web

# 3-month bill/interest rate tickers (FRED/OECD) for each country
tickers = {
    'US': 'DTB3',
    'Canada': 'IR3TIB01CAM156N',
    'Germany': 'IR3TIB01DEM156N',
    'UK': 'IR3TIB01GBM156N',
    'France': 'IR3TIB01FRA156N',

    # 'Australia': 'IR3TIB01AUS156N',
    # 'Denmark': 'IR3TIB01DNK156N',
    # 'Netherlands': 'IR3TIB01NLD156N',
    # 'Norway': 'IR3TIB01NOR156N',
    # 'Singapore': 'IR3TIB01SGP156N',
    # 'Sweden': 'IR3TIB01SWE156N',
    # 'Switzerland': 'IR3TIB01CHE156N',
    # 'New Zealand': 'IR3TIB01NZL156N',
    # 'Belgium': 'IR3TIB01BEL156N',
    # 'Finland': 'IR3TIB01FIN156N',
    # 'Austria': 'IR3TIB01AUT156N',
    # 'Ireland': 'IR3TIB01IRL156N',
    # 'Hong Kong': 'IR3TIB01HKG156N',
    # 'South Korea': 'IR3TIB01KOR156N',
    # 'Qatar': 'IR3TIB01QAT156N',
    # 'UAE': 'IR3TIB01ARE156N',
    # 'Taiwan': 'IR3TIB01TWN156N',
    # 'Czech Republic': 'IR3TIB01CZE156N',
}

# Fetch each series and convert from percentage to decimal
bonds = {}
failed = []
for country, ticker in tickers.items():
    try:
        series = web.DataReader(ticker, 'fred', oldest, latest)
        bonds[country] = series / 100.0
    except Exception:
        # FRED failed; retry the same ticker against the OECD reader
        try:
            series = web.DataReader(ticker, 'oecd', oldest, latest)
            bonds[country] = series / 100.0
        except Exception as oecd_err:
            failed.append(country)
            # Report the failure instead of dropping the country silently
            print(f"Could not fetch {country} ({ticker}): {oecd_err}")

if not bonds:
    raise RuntimeError("No interest-rate series could be downloaded; cannot build the risk-free rate")

# Combine into a single DataFrame
df_bonds = pd.concat(bonds, axis=1)
df_bonds.columns = [c for c in tickers if c not in failed]
df_bonds = df_bonds.interpolate(method='akima').bfill().ffill()

# The FRED API key is read from the FRED_API_KEY environment variable so that no
# credential is stored in the notebook.
fred = Fred(api_key=os.environ.get('FRED_API_KEY'))
cpi_data = fred.get_series('CPIAUCSL', oldest, latest)
# Nominal rate = cross-country average of the fetched short rates
risk_free_df = pd.concat([df_bonds.mean(axis=1).rename('nominal_rate'), cpi_data.rename('cpi')], axis=1)

# Match with the other price series
risk_free_df = risk_free_df.reindex(business_days, copy=False)
risk_free_df = risk_free_df.interpolate(method='akima').bfill().ffill()

# Per-business-day inflation from the interpolated CPI, annual nominal rate
# spread over 252 trading days, and the resulting real rate
risk_free_df['inflation_rate'] = risk_free_df['cpi'].pct_change()
risk_free_df['daily_nominal_rate'] = risk_free_df['nominal_rate'] / 252
risk_free_df['real_rate'] = (1 + risk_free_df['daily_nominal_rate']) / (1 + risk_free_df['inflation_rate']) - 1

In [None]:
# Add pct_change cols to dfs + remove uninformative cols for market portfolios 
cols_to_exclude = ['conId']
numerical_cols = [
    col for col in filtered.columns
    if col not in cols_to_exclude and filtered[col].dtype in [np.int64, np.float64]
]

# One return column per instrument (named by conId), indexed by date
return_columns = [
    frame.set_index('date')['pct_change'].rename(con_id)
    for con_id, frame in zip(filtered['conId'], filtered['df'])
]
pct_changes = pd.concat(return_columns, axis=1)

# Numeric columns holding at most one distinct value carry no information
uninformative_cols = [col for col in numerical_cols if filtered[col].nunique(dropna=True) <= 1]
filtered = filtered.drop(columns=uninformative_cols).dropna(axis=1, how='all')

In [None]:
# Add rate of change fundamentals
def calculate_slope(value1, value2, time1, time2):
    """Return the slope of the line through (time1, value1) and (time2, value2)."""
    rise = value1 - value2
    run = time1 - time2
    return rise / run


# Each tuple lists the same fundamental at increasing horizons (1yr, 3yr[, 5yr])
rate_fundamentals = [('EPSGrowth-1yr', 'EPS_growth_3yr', 'EPS_growth_5yr'),
                     ('ReturnonAssets1Yr', 'ReturnonAssets3Yr'),
                     ('ReturnonCapital', 'ReturnonCapital3Yr'),
                     ('ReturnonEquity1Yr', 'ReturnonEquity3Yr'),
                     ('ReturnonInvestment1Yr', 'ReturnonInvestment3Yr')]

for cols in rate_fundamentals:
    base_name = cols[0].replace('-1yr', '').replace('1Yr', '')
    slope_col = f'fundamentals_{base_name}_slope'

    # The slope always spans the shortest and the longest horizon available
    if len(cols) == 3:
        far_horizon = 5
    elif len(cols) == 2:
        far_horizon = 3
    else:
        continue

    filtered[slope_col] = calculate_slope(
        filtered[f'fundamentals_{cols[0]}'],
        filtered[f'fundamentals_{cols[-1]}'],
        1, far_horizon
    )

    if len(cols) == 3 and 'EPS' in base_name:
        # Change of slope between the 1yr-3yr and the 3yr-5yr segments
        near_segment_slope = calculate_slope(
            filtered[f'fundamentals_{cols[0]}'],
            filtered[f'fundamentals_{cols[1]}'],
            1, 3
        )
        far_segment_slope = calculate_slope(
            filtered[f'fundamentals_{cols[1]}'],
            filtered[f'fundamentals_{cols[2]}'],
            3, 5
        )
        filtered[f'fundamentals_{base_name}_second_deriv'] = calculate_slope(
            near_segment_slope,
            far_segment_slope,
            1, 3
        )

# Add new cols to numericals
numerical_cols = [
    col for col in filtered.columns
    if col not in cols_to_exclude and filtered[col].dtype in [np.int64, np.float64]
]

In [None]:
def get_return_stats(price_df: pd.DataFrame) -> pd.Series:
    """Summarise one price frame: mean return, volatility, Sharpe ratio, mean volume.

    Args:
        price_df: Frame with a ``pct_change`` column and, optionally, ``date`` and
            ``volume`` columns. When ``date`` is absent the frame's own index is
            used as the date axis.

    Returns:
        Series indexed ``['er', 'std', 'sharpe', 'avg_volume']``. All four are NaN
        when ``price_df`` is None or has no ``pct_change`` column; ``avg_volume``
        is NaN when there is no ``volume`` column. The Sharpe ratio uses returns
        in excess of ``risk_free_df['real_rate']`` matched by date.
    """
    stat_names = ['er', 'std', 'sharpe', 'avg_volume']
    if price_df is None or 'pct_change' not in price_df.columns:
        return pd.Series([np.nan] * len(stat_names), index=stat_names)

    # Put returns on a date axis so they can be matched with the risk-free series
    if 'date' in price_df.columns:
        returns = price_df.set_index('date')['pct_change']
    else:
        returns = price_df['pct_change']

    # `risk_free_df` may carry its dates either as a 'date' column or as its index
    if 'date' in risk_free_df.columns:
        rf_by_date = risk_free_df.set_index('date')['real_rate']
    else:
        rf_by_date = risk_free_df['real_rate']
    rf_series = rf_by_date.reindex(returns.index)

    excess_returns = returns - rf_series
    excess_std = excess_returns.std()
    sharpe = excess_returns.mean() / excess_std if excess_std != 0 else np.nan

    er = returns.mean()
    std = returns.std()
    avg_volume = price_df['volume'].mean() if 'volume' in price_df.columns else np.nan

    return pd.Series([er, std, sharpe, avg_volume], index=stat_names)
    
# Compute the return statistics of every series and store them as columns of `filtered`
filtered[['er', 'std', 'sharpe', 'avg_volume']] = filtered['df'].apply(get_return_stats)

In [None]:
# Create all asset type portfolios
import matplotlib.pyplot as plt

holding_cols = [col for col in filtered.columns if col.startswith('holding_') and col != 'holding_types_variety'] + ['total']#, 'risk-free']
portfolio_dfs = {}

for holding_col in holding_cols:
    name = holding_col.split('_')[-1]
    if holding_col == 'total':
        weight = filtered['profile_cap_usd']
    # elif holding_col == 'risk-free':
    #     weight = (filtered['profile_cap_usd'] * 
    #               filtered['holding_types_bond'] * 
    #               filtered[['debt_type_%Quality/AAA', 'debt_type_%Quality/AA']].sum(axis=1) * 
    #               filtered['maturity_%MaturityLessthan1Year'] * 
    #               np.log1p(filtered['avg_volume']))
    else:
        # Market cap scaled by the share held in this asset type
        weight = (filtered['profile_cap_usd'] * filtered[holding_col])
 
    total_market_cap = (weight).sum()

    # Normalised weights keyed by conId. Instruments with a missing cap or holding
    # share get weight 0; otherwise a single NaN would turn every portfolio return
    # into NaN. No temporary column is added to `filtered`.
    weights = pd.Series((weight / total_market_cap).to_numpy(), index=filtered['conId']).fillna(0)
    portfolio_return = pct_changes.dot(weights)
    initial_price = 1
    portfolio_price = initial_price * (1 + portfolio_return.fillna(0)).cumprod()

    portfolio_df = pd.DataFrame({
        'date': portfolio_price.index,
        price_col: portfolio_price.values,
        'pct_change': portfolio_return.values
    }).set_index('date')

    portfolio_dfs[name] = portfolio_df

    print(f"{holding_col}: {format(total_market_cap, ',.2f')}")
    plt.figure(figsize=(10, 6))
    plt.title(f'{name} Portfolio')
    plt.plot(portfolio_df.index, portfolio_df[price_col], marker='o')
    plt.show()

In [None]:
# Manual plot
symbol_test = 'SHV'

# Alternative series to inspect. They were previously evaluated and then
# overwritten; kept as options (uncomment one x/y pair instead of the active one):
# x = filtered[filtered['symbol'] == symbol_test]['df'].iloc[0]['date']
# y = filtered[filtered['symbol'] == symbol_test]['df'].iloc[0][price_col]#.pct_change()
# y = risk_free_df['daily_nominal_rate']
# x = risk_free_df.index
# y = df_bonds['UK']
# x = df_bonds.index

# Active series: equity portfolio return in excess of the daily nominal rate
y = portfolio_dfs['equity']['pct_change'] - risk_free_df['daily_nominal_rate']
x = portfolio_dfs['equity'].index

plt.figure(figsize=(10, 6))
plt.plot(x, y, marker='o')
# plt.xlim(market_portfolio_df['date'].min(), market_portfolio_df['date'].max())
plt.show()

In [None]:
# Avoid dummy trap
# Subcategories per column group that act as the catch-all / reference level.
empty_subcategories = {
    'holding_types': ['other'],
    'countries': ['Unidentified'],
    'currencies': ['<NoCurrency>'],
    'industries': ['NonClassifiedEquity', 'NotClassified-NonEquity'],
    'top10': [
        'OtherAssets', 'AccountsPayable', 'AccountsReceivable', 'AccountsReceivable&Pay',
        'AdministrationFees', 'CustodyFees', 'ManagementFees', 'OtherAssetsandLiabilities',
        'OtherAssetslessLiabilities', 'OtherFees', 'OtherLiabilities', 'Tax', 'Tax--ManagementFees',
    ],
    'debtors': ['OTHER'],
    'maturity': ['%MaturityOther'],
    'debt_type': ['%QualityNotAvailable', '%QualityNotRated'],
    'manual': ['other'],
}

# Column names are '<group>_<subcategory>', except 'manual' entries which are used verbatim
dummy_trap_cols = [
    subcategory if group == 'manual' else f'{group}_{subcategory}'
    for group, subcategories in empty_subcategories.items()
    for subcategory in subcategories
]
    
# Drop the columns named in `dummy_trap_cols`; names not present in `filtered` are ignored
filtered = filtered.drop(columns=dummy_trap_cols, axis=1, errors='ignore')

# Regression analysis

In [None]:
# Factor construction function
import statsmodels.api as sm

def _cap_weighted_leg_returns(full_meta_df, returns_df, leg_ids):
    """Return the cap-weighted return series of one leg of a long/short factor.

    Rows of ``full_meta_df`` whose ``conId`` is in ``leg_ids`` supply the
    ``profile_cap_usd`` weights; instruments that are columns of ``returns_df``
    but not part of the leg get weight 0. If the weights do not sum to a
    positive number the leg contributes a constant 0 series.
    """
    leg_df = full_meta_df[full_meta_df['conId'].isin(leg_ids)]
    # Align the weights with the return columns so that .dot() matches labels.
    weights = leg_df.set_index('conId')['profile_cap_usd'].reindex(returns_df.columns).fillna(0)
    total_weight = weights.sum()
    if total_weight > 0:
        return returns_df.dot(weights / total_weight)
    return pd.Series(0, index=returns_df.index)


def construct_long_short_factor_returns(full_meta_df, returns_df, long_symbols, short_symbols):
    """Build a long-minus-short factor return series.

    Parameters
    ----------
    full_meta_df : DataFrame with at least the columns ``conId`` and ``profile_cap_usd``.
    returns_df : DataFrame of returns whose columns are ``conId`` values.
    long_symbols, short_symbols : iterables of ``conId`` values forming each leg.

    Returns
    -------
    Series indexed like ``returns_df``: cap-weighted long-leg return minus
    cap-weighted short-leg return. A leg without positive total weight counts as 0.
    """
    long_returns = _cap_weighted_leg_returns(full_meta_df, returns_df, long_symbols)
    short_returns = _cap_weighted_leg_returns(full_meta_df, returns_df, short_symbols)
    return long_returns - short_returns


factors = {}

# SMB_ETF: long the funds flagged 'small', short the funds flagged 'large'
small_symbols = filtered[filtered['small'] == 1]['conId'].tolist()
large_symbols = filtered[filtered['large'] == 1]['conId'].tolist()

# Funds flagged on both sides are removed from both legs
intersection = set(small_symbols) & set(large_symbols)
small_symbols = [s for s in small_symbols if s not in intersection]
large_symbols = [s for s in large_symbols if s not in intersection]
smb_etf = construct_long_short_factor_returns(filtered, pct_changes, small_symbols, large_symbols)
factors['small'] = smb_etf

# HML_ETF: long funds with any non-zero 'style_*value' column, short funds with
# any non-zero 'style_*growth' column
value_cols = [col for col in filtered.columns if col.startswith('style_') and col.endswith('value')]
growth_cols = [col for col in filtered.columns if col.startswith('style_') and col.endswith('growth')]

value_symbols = filtered[filtered[value_cols].ne(0).any(axis=1)]['conId'].tolist()
growth_symbols = filtered[filtered[growth_cols].ne(0).any(axis=1)]['conId'].tolist()

intersection = set(value_symbols) & set(growth_symbols)
value_symbols = [s for s in value_symbols if s not in intersection]
growth_symbols = [s for s in growth_symbols if s not in intersection]
hml_etf = construct_long_short_factor_returns(filtered, pct_changes, value_symbols, growth_symbols)
factors['value'] = hml_etf

# variety_columns: split at the median -- the long leg holds the funds BELOW the
# median of the column, the short leg the funds ABOVE it
variety_columns = [c for c in filtered.columns if c.endswith('variety')]
for variety_col in variety_columns:
    large_var_symbols = filtered[filtered[variety_col] < filtered[variety_col].median()]['conId'].tolist()
    small_var_symbols = filtered[filtered[variety_col] > filtered[variety_col].median()]['conId'].tolist()

    var_etf = construct_long_short_factor_returns(filtered, pct_changes, large_var_symbols, small_var_symbols)
    factors[variety_col] = var_etf

# non-variety columns: same construction, but split at the mean of the column
non_variety = [c for c in numerical_cols if c not in variety_columns]
for non_var_col in non_variety:
    large_var_symbols = filtered[filtered[non_var_col] < filtered[non_var_col].mean()]['conId'].tolist()
    small_var_symbols = filtered[filtered[non_var_col] > filtered[non_var_col].mean()]['conId'].tolist()

    var_etf = construct_long_short_factor_returns(filtered, pct_changes, large_var_symbols, small_var_symbols)
    factors[non_var_col] = var_etf


# market risk premium series and combine
factors['market'] = (portfolio_dfs['equity']['pct_change'] - risk_free_df['daily_nominal_rate'])
factors_df = pd.DataFrame(factors)

# Run regressions: one OLS per instrument, excess return on all factor series
results = []
for symbol in tqdm(pct_changes.columns, desc="Running Regressions"):
    # The dependent variable is the return in excess of the risk-free rate, to
    # match the market factor (also an excess return). Previously the excess
    # series was computed but the raw return was regressed instead.
    etf_excess = (pct_changes[symbol] - risk_free_df['daily_nominal_rate']).rename(symbol)
    data = pd.concat([etf_excess, factors_df], axis=1).dropna()

    Y = data.iloc[:, 0]
    X = sm.add_constant(data.iloc[:, 1:])
    model = sm.OLS(Y, X).fit()
    result = {
        'conId': symbol,
        'r_squared': model.rsquared,
        'alpha': model.params['const'],
        'alpha_pval': model.pvalues['const'],
    }
    for factor in factors_df.columns:
        result[f'beta_{factor}'] = model.params[factor]
        result[f'pval_beta_{factor}'] = model.pvalues[factor]
    results.append(result)

results_df = pd.DataFrame(results)
results_df

In [None]:
# Ad-hoc inspection: median of the 'tradable' column of `filtered`.
filtered['tradable'].median()

In [None]:
# Columns of `filtered` that are not listed in `numerical_cols`
print([name for name in filtered.columns if name not in numerical_cols])
# Numerical columns that are not '*variety' columns, displayed as a table
non_variety = [name for name in numerical_cols if name not in variety_columns]
filtered[non_variety]

In [None]:
# Manual plot series: show the metadata row(s) and the stored price history of
# one contract id
con_id = 118971226.0
selected_rows = filtered[filtered['conId'] == con_id]
df = selected_rows['df'].iloc[0].copy()
display(selected_rows)

# Optional (disabled): replace missing prices with 0 before plotting
# df[price_col] = df[price_col].fillna(0)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['date'], df[price_col], marker='o')
ax.set_xlim(df['date'].min(), df['date'].max())
# Start the y axis at 0 and leave 10% headroom above the highest price
ax.set_ylim(0, df[price_col].max()*1.1)
plt.show()

## Old Brownian bridge interpolation

In [None]:
# import numpy as np

# def brownian_bridge(t, t0, t1, x0, x1, sigma):
#     """Generate points using Brownian bridge between (t0, x0) and (t1, x1)."""
#     dt = t1 - t0
#     mu = x0 + (x1 - x0) * (t - t0) / dt  # Linear interpolation for mean
#     variance = sigma**2 * (t1 - t) * (t - t0) / dt
#     return mu + np.random.normal(0, np.sqrt(variance))

# # Example
# t = np.linspace(0, 10, 11)  # Original time points
# prices = np.random.normal(100, 5, len(t))  # Simulated price series
# sigma = np.std(prices)  # Standard deviation of the series

# # Interpolate to finer grid
# t_new = np.linspace(0, 10, 21)  # New time points
# prices_new = np.zeros(len(t_new))

# # Copy original points and interpolate gaps
# for i in range(len(t) - 1):
#     idx = np.where((t_new >= t[i]) & (t_new <= t[i+1]))[0]
#     for j in idx:
#         prices_new[j] = brownian_bridge(t_new[j], t[i], t[i+1], prices[i], prices[i+1], sigma)

# # Verify variance
# print("Original variance:", np.var(prices))
# print("Interpolated variance:", np.var(prices_new))

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt
# from scipy.interpolate import CubicSpline

# def brownian_bridge(t, t0, t1, x0, x1, sigma):
#     """Generate points using Brownian bridge between (t0, x0) and (t1, x1)."""
#     dt = t1 - t0
#     mu = x0 + (x1 - x0) * (t - t0) / dt  # Linear interpolation for mean
#     variance = sigma**2 * (t1 - t) * (t - t0) / dt
#     return mu + np.random.normal(0, np.sqrt(variance))

# # Generate original price series
# t = np.linspace(0, 10, 11)  # Original time points
# prices = np.random.normal(100, 5, len(t))  # Simulated price series
# sigma = np.std(prices)  # Standard deviation for Brownian bridge

# # Interpolate to finer grid
# t_new = np.linspace(0, 10, 21)  # New time points
# prices_new = np.zeros(len(t_new))  # Brownian bridge interpolation
# prices_lin = np.zeros(len(t_new))  # Linear interpolation
# prices_spl = np.zeros(len(t_new))  # Spline interpolation

# # Brownian bridge interpolation
# for i in range(len(t) - 1):
#     idx = np.where((t_new >= t[i]) & (t_new <= t[i+1]))[0]
#     for j in idx:
#         prices_new[j] = brownian_bridge(t_new[j], t[i], t[i+1], prices[i], prices[i+1], sigma)

# # Linear interpolation
# prices_lin = np.interp(t_new, t, prices)

# # Spline interpolation
# spline = CubicSpline(t, prices)
# prices_spl = spline(t_new)

# # Verify variances
# print("Original variance:", np.var(prices))
# print("Brownian bridge variance:", np.var(prices_new))
# print("Linear interpolation variance:", np.var(prices_lin))
# print("Spline interpolation variance:", np.var(prices_spl))

# # Plotting
# plt.figure(figsize=(10, 6))
# plt.plot(t, prices, 'o-', label='Original Prices', markersize=8)
# plt.plot(t_new, prices_new, 'x-', label='Brownian Bridge Interpolation')
# plt.plot(t_new, prices_lin, 's-', label='Linear Interpolation')
# plt.plot(t_new, prices_spl, 'd-', label='Spline Interpolation')
# plt.title('Price Series Interpolation Comparison')
# plt.xlabel('Time')
# plt.ylabel('Price')
# plt.grid(True)
# plt.legend()

# plt.show()

In [None]:
# # Graph correlations
# import seaborn as sns
# import matplotlib.pyplot as plt

# numerical_cols = [col for col in filtered.columns if filtered[col].dtype in [np.int64, np.float64] and col not in cols_to_exclude]

# # drop columns with missing values
# corr_df = filtered[numerical_cols].corr()
# # corr_df.dropna(axis=1, how='all', inplace=True)
# # corr_df.dropna(axis=0, how='all', inplace=True)

# plt.figure(figsize=(50, 50))
# sns.heatmap(corr_df, cmap='coolwarm')
# plt.show()

---
### Prep historical data
---

In [None]:
# # Test day gap
# dfs = {}
# for file in os.listdir(data_path):
#     symbol = os.path.splitext(file)[0].split('-')[0]
#     if symbol in verified_files:
#         dfs[symbol] = pd.read_csv(data_path + file)

# days, nums, lens, firsts = [], [], [], []
# for day in range(5,30):
#     days.append(day)

#     melted_dfs = []
#     expected_returns = {}
#     for symbol, df in tqdm(dfs.items(), total=len(dfs), desc=f'{day}'):
#         df = melt(df)
#         df['date'] = pd.to_datetime(df['date'])

#         latest_date = df['date'].iloc[-1]
#         earliest_date = df['date'].iloc[0]
#         length_required = pd.to_datetime('2020-02-01')
#         month_ago = datetime.today() - timedelta(days=30)

#         dates = df['date'].unique()
#         date_diffs = dates[1:] - dates[:-1]
        
#         if latest_date >= month_ago and earliest_date <= length_required and not (date_diffs > pd.Timedelta(days=day)).any():
#             df['symbol'] = symbol
#             df['pct_change'] = df['value'].pct_change()
#             expected_returns[symbol] = df['pct_change'].mean()
#             melted_dfs.append(df)
#     # print(f'Loaded {len(melted_dfs)} out of {len(file_list)} series ({round(len(melted_dfs)/len(file_list)*100, 4)}%)')

#     # Concatenate and pivot data
#     returns_df = pd.concat(melted_dfs, ignore_index=True)
#     returns_df = returns_df.pivot(index=['date', 'kind'], columns='symbol', values='pct_change')
#     returns_df = returns_df.sort_values(by=['date', 'kind'], ascending=[True, False]).reset_index().dropna()
#     lens.append(len(returns_df))
#     nums.append(len(returns_df.columns))
#     firsts.append(returns_df.date.iloc[0])

# gap_data_df = pd.DataFrame({
#     'day_gap': days,
#     'num_etfs': nums,
#     'period_length': lens,
#     'first_day':firsts})

# gap_data_df

In [None]:
# Load and prepare historical training data
# def melt(data_df, value_columns=None):
#     if not value_columns:
#         value_columns = ['open', 'close']
#     id_columns = [col for col in data_df.columns.to_list() if col not in value_columns]
#     melted_df = data_df.melt(id_vars=id_columns, value_vars=value_columns, var_name='kind', value_name='value')
#     return melted_df.sort_values(by=['date'], ascending=[True, False]).reset_index(drop=True)

# Load historical data and merge them all into one df
dfs = {}
file_list = os.listdir(data_path)
for file in file_list:
    # Only series files are read; the verification cell skips non-CSV entries the same way.
    if not file.endswith('.csv'):
        continue
    # File names look like '<symbol>-<exchange>-<currency>.csv'; only the symbol is used as key.
    symbol = os.path.splitext(file)[0].split('-')[0]
    if symbol in verified_files:
        dfs[symbol] = pd.read_csv(data_path + file)


# Melt dfs, filters, and calc pct_change. ASSUMES that dfs are sorted chronologically
training_start_date = pd.to_datetime('2020-02-01')
month_ago = datetime.today() - timedelta(days=31)

day_gap = 6 # SET ACCEPTABLE DAY GAP

melted_dfs, expected_returns = [], {}
for symbol, df in tqdm(dfs.items(), total=len(dfs), desc=f'Filtering {kind} dfs'):
    # An empty file has no first/last date to inspect; skip it instead of failing on .iloc.
    if df.empty:
        continue
    df['date'] = pd.to_datetime(df['date'])

    latest_date = df['date'].iloc[-1]
    earliest_date = df['date'].iloc[0]
    dates = df['date'].unique()
    date_gaps = dates[1:] - dates[:-1]

    # Keep a series when it is an index, or when it is recent (last bar within the
    # past 31 days), starts on/before the training start date and never has a gap
    # longer than `day_gap` days between consecutive dates.
    if (kind == 'indices') or (latest_date >= month_ago and earliest_date <= training_start_date and (date_gaps <= pd.Timedelta(days=day_gap)).all()):
        df['symbol'] = symbol
        # Use the price column configured for the selected `kind` ('close' for
        # midpoint data, 'average' otherwise) instead of a hard-coded 'average'.
        df['pct_change'] = df[price_col].pct_change()
        expected_returns[symbol] = df['pct_change'].mean()
        melted_dfs.append(df)
print(f'Loaded {len(melted_dfs)} out of {len(file_list)} series ({round(len(melted_dfs)/len(file_list)*100, 4)}%)')

# Concatenate and pivot data: one row per date, one column per symbol
returns_df = pd.concat(melted_dfs, ignore_index=True)
returns_df = returns_df.pivot(index=['date'], columns='symbol', values='pct_change')
returns_df = returns_df.sort_values(by=['date'], ascending=[True]).reset_index()

# Define training boundaries: everything up to one year before today; rows with
# any missing return are dropped from the training matrix
training_cutoff_date = datetime.today() - timedelta(days=365)
training_df = returns_df[returns_df['date'] <= training_cutoff_date]
training_matrix = training_df.drop(['date'], axis=1).dropna().copy()

In [None]:
# Calculate risk-free-rate for training window
# The window is the 365 days that end at the training cutoff date.
rf_window_start = training_cutoff_date - timedelta(days=365)

# Nominal rate: mean of the FRED series DGS10 over the window, divided by 100 to
# turn the quoted percentage into a fraction. The column is selected explicitly
# so the mean is a plain scalar rather than a one-element Series.
treasury_rate = web.DataReader('DGS10', 'fred', rf_window_start, training_cutoff_date)
nominal_rf_rate = treasury_rate['DGS10'].mean() / 100

# The FRED API key is read from the environment so that the credential is not
# stored in the notebook.
fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError("Set the FRED_API_KEY environment variable to query FRED.")
fred = Fred(api_key=fred_api_key)
cpi_data = fred.get_series('CPIAUCSL', rf_window_start, training_cutoff_date) # CPI
# Inflation over the window: relative change between the first and last CPI observation
inflation_rate = (cpi_data.iloc[-1] - cpi_data.iloc[0]) / cpi_data.iloc[0]

# Real rate from nominal rate and inflation
# NOTE(review): this rate looks like a yearly figure while the expected returns
# used with it are means of per-bar pct changes -- confirm the units match.
real_rf_rate = (1 + nominal_rf_rate) / (1 + inflation_rate) - 1

In [None]:
# Calculate corr and cov for historical training data
training_array = training_matrix.to_numpy()  # plain array for fast column access
symbol_list = training_matrix.columns.tolist()
num_symbols = len(symbol_list)
corr_matrix = np.zeros((num_symbols, num_symbols))  # filled symmetrically below
cov_matrix = np.zeros((num_symbols, num_symbols))   # filled symmetrically below

# Both statistics are symmetric, so each pair (i, j) with i <= j is evaluated
# once and mirrored into the lower triangle.
# NOTE(review): the progress label says "sqr" while dcor.distance_stats is the
# function being called -- confirm which variant is intended.
for i in tqdm(range(num_symbols), total=num_symbols, desc="Calculating distance stats sqr"):
    for j in range(i, num_symbols):
        stats = dcor.distance_stats(training_array[:, i], training_array[:, j])
        corr_matrix[i, j] = corr_matrix[j, i] = stats.correlation_xy
        cov_matrix[i, j] = cov_matrix[j, i] = stats.covariance_xy

# Back to labelled frames for output
corr_df = pd.DataFrame(corr_matrix, index=symbol_list, columns=symbol_list)
cov_df = pd.DataFrame(cov_matrix, index=symbol_list, columns=symbol_list)

# The row labels are not written; only the header row carries the symbols.
corr_df.to_csv(f'{root}corr_df.csv', index=False)
cov_df.to_csv(f'{root}cov_df.csv', index=False)

---
### Compute ETF combinations based on the optimal number of clusters (k)
---

In [None]:
# Load corr and cov
corr_df = pd.read_csv(f'{root}corr_df.csv')
cov_df = pd.read_csv(f'{root}cov_df.csv')
symbol_list = corr_df.columns

# Two-way lookup between the symbol in the header and the row label at the
# same position of the loaded frame.
symbol2index = {sym: pos for sym, pos in zip(corr_df.columns, corr_df.index)}
index2symbol = {pos: sym for pos, sym in zip(corr_df.index, corr_df.columns)}

# Relabel the columns with the row labels so rows and columns share one key space.
corr_df = corr_df.rename(columns=symbol2index)
cov_df = cov_df.rename(columns=symbol2index)

# Distance = 1 - correlation, with the diagonal forced to exactly 0.
distance_matrix = 1 - corr_df.to_numpy()
np.fill_diagonal(distance_matrix, 0)

In [None]:
# Thresholds / cluster_num graphs
# The first list documents the linkage methods that can be tried; the second
# assignment selects the one(s) actually plotted.
methods = ['single', 'ward', 'average', 'complete', 'weighted', 'centroid', 'median']
methods = ['ward']
for method in methods:
    # linkage() takes the condensed form of the square distance matrix
    linked = sch.linkage(squareform(distance_matrix), method=method)

    # Column 2 of the linkage matrix holds the merge distances; they are drawn
    # against a cluster count running from len(corr_df) down to 2.
    num_clusters = range(len(corr_df), 1, -1)
    thresholds = linked[:, 2]

    # inertias = []
    # for n_clusters in num_clusters:
    #     cluster_labels = fcluster(linked, t=n_clusters, criterion='maxclust')
    #     inertia = 0
    #     for cluster in np.unique(cluster_labels):
    #         members = distance_matrix.values[cluster_labels == cluster]
    #         centroid = members.mean(axis=0)  # Cluster centroid
    #         inertia += np.sum((members - centroid) ** 2)
    #     inertias.append(inertia)

    # plt.figure(figsize=(12, 6))
    # plt.plot(num_clusters, inertias, marker='o', label=f"Method {method}")
    # plt.title(f"Inertia/Num ({method})")
    # plt.xlabel('Number of Clusters')
    # plt.ylabel('Inertia (Sum of Squared Distances)')
    # plt.grid(True)
    # plt.legend()
    # plt.show()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(num_clusters, thresholds, marker='o')
    ax.set_title(f"Threshold/Num ({method})")
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Threshold (Distance)')
    ax.grid(True)
    plt.show()


In [None]:
# Silhouettes and dendrograms
def product(row):
    """Return the product of all values of the mapping ``row`` (1 when it is empty)."""
    return math.prod(row.values())

# For every candidate cluster count k, cluster the assets, score the clustering
# with the silhouette coefficient and record how many assets fall in each cluster.
ks = []
scores = []
counts = []
for k in range(2, min(len(distance_matrix), 9)):
    # NOTE(review): no precomputed metric is passed here, so the rows of
    # distance_matrix are handed to the clusterer as plain input rows, while the
    # silhouette score below treats the same matrix as precomputed distances --
    # confirm this combination is intended.
    clusters = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(distance_matrix)
    score = silhouette_score(distance_matrix, clusters, metric='precomputed')
    ks.append(k)
    scores.append(score)
    # Map each cluster label to its member count
    unique_clusters, label_counts = np.unique(clusters, return_counts=True)
    label_counts_dict = dict(zip(unique_clusters, label_counts))
    counts.append(label_counts_dict)

silhouettes = pd.DataFrame({
    'k': ks,
    'score': scores,
    'counts': counts
})
# Product of the cluster sizes, i.e. the number of ways to pick one asset from
# each cluster
silhouettes['combitions'] = silhouettes['counts'].apply(product)
# The k with the highest silhouette score becomes best_k
silhouettes = silhouettes.sort_values(by='score', ascending=False)
best_k = silhouettes.k.iloc[0]

# Manual override of the selected k (disabled)
# best_k = 3

display(silhouettes)
# The first list documents the available linkage methods; the second assignment
# selects the one(s) actually drawn.
methods = ['single', 'ward', 'average', 'complete', 'weighted', 'centroid', 'median']
methods = ['ward']
for method in methods:
    # Now compute the linkage using a condensed distance matrix
    linked = sch.linkage(squareform(distance_matrix), method=method)
    plt.figure(figsize=(20, 10))
    # Leaves are labelled with the row labels of corr_df
    sch.dendrogram(linked, labels=corr_df.index, leaf_rotation=90)
    plt.title(f"Method {method}")
    plt.show()

---
### Calculate Minimum Variance Portfolios
---

In [None]:
# Portfolio Optimization Functions
def portfolio_variance(weights, cov_matrix):
    """Return the quadratic form ``weights.T @ cov_matrix @ weights``."""
    weighted_rows = weights.T @ cov_matrix
    return weighted_rows @ weights

def portfolio_expected_return(weights, expected_returns_arr):
    """Return the weighted sum ``weights @ expected_returns_arr``."""
    weighted_return = weights @ expected_returns_arr
    return weighted_return

def minimize_portfolio_variance(cov_matrix, expected_returns_arr):
    """Find the minimum-variance weights for the assets in ``cov_matrix``.

    The search starts from equal weights, keeps every weight within [0, 1] and
    forces the weights to sum to 1 (SLSQP).

    Returns
    -------
    tuple ``(weights, std, expected_return)``; three NaNs when the optimiser
    does not report success.
    """
    num_assets = len(cov_matrix)
    equal_weights = np.full(num_assets, 1 / num_assets)
    weight_bounds = ((0, 1),) * num_assets
    budget_constraint = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}

    outcome = minimize(
        portfolio_variance,
        equal_weights,
        args=(cov_matrix,),
        method='SLSQP',
        bounds=weight_bounds,
        constraints=budget_constraint,
    )

    if not outcome.success:
        return (np.nan, np.nan, np.nan)

    # The objective value at the optimum is the portfolio variance itself.
    best_weights = outcome.x
    best_std = np.sqrt(outcome.fun)
    best_er = portfolio_expected_return(best_weights, expected_returns_arr)
    return (best_weights, best_std, best_er)

In [None]:
# Portfolio Optimization Functions
def compute_distance_sum(combination, distance_matrix):
    """Add up ``distance_matrix[i, j]`` over every unordered pair of ``combination``.

    Returns 0 when ``combination`` has fewer than two members.
    """
    total = 0
    for pair in itertools.combinations(combination, 2):
        # `pair` is an (i, j) tuple, which indexes the 2-D matrix directly
        total = total + distance_matrix[pair]
    return total

def portfolio_variance(weights, cov_matrix):
    """Return the portfolio variance ``weights.T @ cov_matrix @ weights``."""
    left_product = weights.T @ cov_matrix
    return left_product @ weights

def portfolio_expected_return(weights, expected_returns_arr):
    """Return the portfolio's expected return, ``weights @ expected_returns_arr``."""
    portfolio_er = weights @ expected_returns_arr
    return portfolio_er

def sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate):
    """Return (expected return - ``risk_free_rate``) divided by the portfolio standard deviation."""
    excess_return = portfolio_expected_return(weights, expected_returns_arr) - risk_free_rate
    volatility = np.sqrt(portfolio_variance(weights, cov_matrix))
    return excess_return / volatility

def negative_sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate):
    """Return the sign-flipped Sharpe ratio, so a minimiser can be used to maximise it."""
    ratio = sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate)
    return -ratio

def find_tangency_portfolio(cov_matrix, expected_returns_arr, risk_free_rate):
    """Find the weights that maximise the Sharpe ratio for the given assets.

    The search starts from equal weights, keeps every weight within [0, 1] and
    forces the weights to sum to 1 (SLSQP on the negated Sharpe ratio).

    Returns
    -------
    tuple ``(weights, std, expected_return, sharpe_ratio)``; four NaNs when the
    optimiser does not report success.
    """
    num_assets = len(cov_matrix)
    equal_weights = np.full(num_assets, 1 / num_assets)
    weight_bounds = ((0, 1),) * num_assets
    budget_constraint = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}

    outcome = minimize(
        negative_sharpe_ratio,
        equal_weights,
        args=(expected_returns_arr, cov_matrix, risk_free_rate),
        method='SLSQP',
        bounds=weight_bounds,
        constraints=budget_constraint,
    )

    if not outcome.success:
        return (np.nan, np.nan, np.nan, np.nan)

    best_weights = outcome.x
    best_std = np.sqrt(portfolio_variance(best_weights, cov_matrix))
    best_er = portfolio_expected_return(best_weights, expected_returns_arr)
    # The objective is the negated ratio, so flip the sign back.
    best_sharpe = -(outcome.fun)
    return (best_weights, best_std, best_er, best_sharpe)
    
def sort_top_combinations(array, sort_index):
    """Sort, in place, the rows of ``array`` that have a value in column ``sort_index``.

    Rows whose ``sort_index`` entry is NaN keep their position; the remaining
    rows are rearranged among their own positions in descending order of that
    column. The same array object is returned.
    """
    has_value = ~np.isnan(array[:, sort_index])
    candidates = array[has_value]
    if candidates.size == 0:
        return array
    descending = np.argsort(candidates[:, sort_index])[::-1]
    array[has_value] = candidates[descending]
    return array

In [None]:
num_symbols = len(corr_df.index)
num_metrics = best_k*2 + 4
num_combinations = math.comb(num_symbols, best_k)
combination_array = np.empty((num_combinations, num_metrics), dtype='float32')

# Row layout (k = best_k):
#   [0, k)    asset indices of the combination
#   [k, 2k)   tangency-portfolio weights
#   2k        standard deviation
#   2k + 1    expected return
#   2k + 2    Sharpe ratio
#   2k + 3    sum of pairwise distances
weights_end = best_k * 2
index_indicator = best_k + best_k + 3

# Calculate distance sums and populate the NumPy array
for i, combination in tqdm(enumerate(itertools.combinations(range(0,num_symbols), best_k)), total=num_combinations, desc="Calculating distance sums"):
    combination_array[i, :best_k] = combination
    combination_cov_df = cov_df.loc[list(combination), list(combination)]
    combination_expected_returns = np.array([expected_returns[index2symbol[index]] for index in combination])

    # find_tangency_portfolio returns (weights, std, er, sharpe) where weights is
    # an array (or a single NaN on failure). Assigning that ragged tuple to one
    # slice raises, so each part is written to its own segment; a NaN weight
    # broadcasts over the whole weight segment.
    weights, std, er, sharpe = find_tangency_portfolio(combination_cov_df, combination_expected_returns, real_rf_rate)
    combination_array[i, best_k:weights_end] = weights
    combination_array[i, weights_end:index_indicator] = (std, er, sharpe)
    combination_array[i, index_indicator] = compute_distance_sum(combination, distance_matrix)

    # population growth rate



# TODO - not to be sorted by best_k
# Column `best_k` is the first weight column; rows are currently ordered by it, descending.
sorted_indices = np.argsort(combination_array[:, best_k], kind='mergesort')[::-1]
combination_array = combination_array[sorted_indices]
del sorted_indices
combination_array, len(combination_array)

In [None]:
num_symbols = len(corr_df.index)
num_metrics = best_k*2 + 4
num_combinations_possible = math.comb(num_symbols, best_k)

top_n = 5000 # Define how many top combinations to keep

# Fixed-size buffer of the best rows seen so far. Row layout (k = best_k):
# [0, k) asset indices | [k, 2k) weights | std | expected return | Sharpe | rating.
# Unused rows stay NaN.
top_combinations_array = np.empty((top_n, num_metrics), dtype='float32')
top_combinations_array[:] = np.nan
rows_filled = 0


for combination_tuple in tqdm(itertools.combinations(range(0,num_symbols), best_k), total=num_combinations_possible, desc="Calculating Tangency Portfolios"):
    combination_cov_df = cov_df.loc[combination_tuple, combination_tuple]
    combination_expected_returns = np.array([expected_returns[index2symbol[index]] for index in combination_tuple])
    weights, std, er, sharpe = find_tangency_portfolio(combination_cov_df, combination_expected_returns, real_rf_rate)
    # Rating = Sharpe ratio scaled by the summed pairwise distance of the assets
    rating = sharpe * compute_distance_sum(combination_tuple, distance_matrix)

    # A failed optimisation yields NaN. Storing such a row would leave a NaN
    # rating in the buffer; once it sits in the last row, the
    # `rating > last rating` test below is always False and nothing gets replaced.
    if np.isnan(rating):
        continue

    if rows_filled < top_n:
        top_combinations_array[rows_filled, :best_k] = combination_tuple
        top_combinations_array[rows_filled, best_k:best_k*2] = weights
        top_combinations_array[rows_filled, best_k*2: num_metrics] = [std, er, sharpe, rating]
        rows_filled += 1
        if rows_filled == top_n:
            # Buffer just became full: order it so the last row is the weakest
            top_combinations_array = sort_top_combinations(top_combinations_array, -1)

    elif rating > top_combinations_array[-1, -1]:
        # Better than the current weakest row: overwrite it and restore the order
        top_combinations_array[rows_filled-1, :best_k] = combination_tuple
        top_combinations_array[rows_filled-1, best_k:best_k*2] = weights
        top_combinations_array[rows_filled-1, best_k*2: num_metrics] = [std, er, sharpe, rating]
        top_combinations_array = sort_top_combinations(top_combinations_array, -1)

# With fewer than top_n valid combinations the buffer never became full and was
# therefore never sorted inside the loop.
if 0 < rows_filled < top_n:
    top_combinations_array = sort_top_combinations(top_combinations_array, -1)

In [None]:
# Remove NaN rows before further processing (rows whose first weight is NaN)
top_combinations_array_cleaned = top_combinations_array[~np.isnan(top_combinations_array[:, best_k])]

# The buffer is selected and ordered by its last column (the rating), so the
# header names that metric and reports the number of rows actually available.
print("Top", len(top_combinations_array_cleaned), "Combinations by Rating:")
for row in top_combinations_array_cleaned:
    combination_indices = row[:best_k].astype(int)
    asset_symbols = [index2symbol[index] for index in combination_indices]
    # asset_symbols = [index for index in combination_indices]
    # Row layout: indices | weights | std | expected return | Sharpe | rating
    weights = row[best_k:best_k*2]
    std, er, sharpe = row[best_k*2], row[best_k*2 + 1], row[best_k*2 + 2]
    rating = row[-1]
    print(f"Assets: {asset_symbols}, Weights: {weights}, Std Dev: {std:.4f}, Expected Return: {er:.4f}, Sharpe Ratio: {sharpe:.4f}, Rating: {rating:.4f}")