In [None]:
import os
from ib_async import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
import dcor
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict
import itertools
from time import sleep
import csv
from scipy.optimize import minimize
from fredapi import Fred
import pandas_datareader.data as web
import math
import re
import ast
import traceback


# Notebook display settings: do not truncate cell text or the column list,
# and cap printed frames at 50 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

In [None]:
# Prep functions
def evaluate_literal(val):
    """Parse ``val`` as a Python literal, returning it unchanged when it is not one.

    Args:
        val: A CSV cell value. Only strings are parsed; any other type
            (float, int, NaN, ...) is returned as-is.

    Returns:
        The evaluated literal (list, dict, number, ...) or the original value
        when it cannot be evaluated.
    """
    # Non-strings can never be parsed as source text; skip the parser entirely.
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval also raises TypeError (e.g. an unhashable dict key such
        # as "{[1]: 2}") and Memory/RecursionError on pathological input; all
        # of them just mean "not a usable literal" here.
        return val
    
def load(path):
    """Read the CSV at ``path`` and run every cell through ``evaluate_literal``.

    Returns the resulting DataFrame; cells that are not Python literals keep
    the value ``pd.read_csv`` produced.
    """
    frame = pd.read_csv(path)
    for column_name in frame.columns:
        parsed_column = frame[column_name].apply(evaluate_literal)
        frame[column_name] = parsed_column
    return frame

In [None]:
# Select which dataset the notebook works on.
# kind = 'midpoint'
kind = 'trades'
# kind = 'indices'

# Root folder for each supported dataset kind.
dataset_roots = {
    'midpoint': 'data/daily-midpoint/',
    'trades': 'data/daily-trades/',
    'indices': 'data/indices/',
}

# Fail immediately on a typo instead of hitting a NameError on `root` later.
if kind not in dataset_roots:
    raise ValueError(f"Unknown kind {kind!r}; expected one of {sorted(dataset_roots)}")
root = dataset_roots[kind]

data_path = root + 'series/'
verified_path = root + 'verified_files.csv'

# Trades and indices use the 'average' column as price; midpoint uses 'close'.
price_col = 'average' if kind in ('trades', 'indices') else 'close'

In [None]:
# Verify files
# Reuse the cached list of verified (symbol, currency) pairs when it exists;
# otherwise rebuild it by checking every series file against the fundamentals
# table and the ISIN reported by the IB API, skipping leveraged products.
fund_df = load('data/fundamentals.csv')

try:
    verified_df = pd.read_csv(verified_path)
except FileNotFoundError:
    util.startLoop()
    ib = IB()
    ib.connect('127.0.0.1', 7497, clientId=2)

    verified_files = []
    try:
        file_list = os.listdir(data_path)
        for file_name in tqdm(file_list, total=len(file_list), desc="Verifying files"):
            if not file_name.endswith('.csv'):
                continue
            try:
                symbol, exchange, currency = file_name.replace('.csv', '').split('-')
                symbol_data = fund_df[(fund_df['symbol'] == symbol) & (fund_df['currency'] == currency)]
                if symbol_data.empty:
                    continue

                contract_details = ib.reqContractDetails(Stock(symbol, exchange, currency))
                if not contract_details:
                    continue
                isin = contract_details[0].secIdList[0].value

                # The file is only trusted when the API's ISIN matches fundamentals.
                if symbol_data['isin'].iloc[0] != isin:
                    continue

                # A word like "2X"/"3X" (multiplier > 1) or one starting with
                # "lv"/"lev" in the long name marks a leveraged product.
                instrument_name = symbol_data['longName'].iloc[0].replace('-', '').replace('+', '')
                leveraged = any(
                    re.fullmatch(r'\d+X', word) and int(word[:-1]) > 1 or word.lower().startswith(('lv', 'lev'))
                    for word in instrument_name.split()
                )
                if leveraged:
                    continue

                verified_files.append({'symbol': symbol, 'currency': currency})
            except ValueError as e:
                print(f"Invalid filename format {file_name}: {e}")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    finally:
        # Always release the API session, even if listing or the loop is interrupted.
        ib.disconnect()

    # Explicit columns keep the frame (and the cached CSV header) well-formed
    # even when no file passed verification.
    verified_df = pd.DataFrame(verified_files, columns=['symbol', 'currency'])
    verified_df.to_csv(verified_path, index=False)

### Merge historical series with fundamentals

In [None]:
def ensure_series_types(df, price_col):
    """Return a date-sorted copy of ``df`` with typed date, volume and price columns.

    Args:
        df: Frame with a 'date' column, a 'volume' column and ``price_col``.
        price_col: Name of the price column to convert.

    Returns:
        A new DataFrame sorted by 'date' with a fresh 0..n-1 index; 'date' is
        datetime and 'volume'/``price_col`` are numeric, with unparseable
        entries coerced to NaN. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    typed = df.copy()
    typed['date'] = pd.to_datetime(typed['date'])
    typed = typed.sort_values('date').reset_index(drop=True)
    for col in ['volume', price_col]:
        typed[col] = pd.to_numeric(typed[col], errors='coerce')
    return typed

def validate_raw_prices(df, price_col):
    """Drop rows whose price is not positive or whose low exceeds its high.

    The low/high check only applies when both columns are present. Returns a
    filtered copy; the input frame is not modified.
    """
    bad_rows = df[price_col] <= 0
    if {'low', 'high'}.issubset(df.columns):
        bad_rows = bad_rows | (df['low'] > df['high'])
    return df.loc[~bad_rows].copy()

def handle_stale_periods(df, price_col, max_stale_days=5):
    """Thin out long runs of an unchanged price.

    Consecutive rows with the same price form a run. For runs longer than
    ``max_stale_days`` only the first and last rows are kept; shorter runs are
    left alone. Returns ``df`` itself when it has no rows, otherwise a copy.
    """
    # A new run starts wherever the price differs from the previous row.
    run_ids = df[price_col].diff().ne(0).cumsum()
    if len(run_ids) == 0:
        return df

    run_sizes = run_ids.map(run_ids.value_counts())

    # Run ids are contiguous, so an interior row is one whose neighbours on
    # both sides carry the same run id.
    interior = run_ids.eq(run_ids.shift(1)) & run_ids.eq(run_ids.shift(-1))

    drop_rows = interior & (run_sizes > max_stale_days)
    return df.loc[~drop_rows].copy()

In [None]:
# Load historical series
# Runs on the first execution (no `copied` cache yet) or when the user asks for
# a reload. Builds `meta` (one row per verified series file) and keeps an
# untouched snapshot in `copied` for the RESET cell.
if 'copied' not in globals() or input('reload csvs? (y/n)').lower() == 'y':
    # Lower bound for `latest`; raised to the newest date seen in any series.
    latest = (datetime.now() - timedelta(days=365 * 6))
    meta = []
    file_list = os.listdir(data_path)
    for file in tqdm(file_list, total=len(file_list)):
        if not file.endswith('.csv'):
            continue

        # Same '<symbol>-<exchange>-<currency>.csv' contract as the verification
        # cell; a stray file must not abort the whole load with an IndexError.
        parts = os.path.splitext(file)[0].split('-')
        if len(parts) != 3:
            print(f"ERROR {file}: expected '<symbol>-<exchange>-<currency>.csv'")
            continue
        symbol, exchange, currency = parts
        if not ((verified_df['symbol'] == symbol) & (verified_df['currency'] == currency)).any():
            continue

        # Load and clean raw series
        try:
            df = load(data_path + file)
            df = ensure_series_types(df, price_col)
            df = validate_raw_prices(df, price_col)
            df = handle_stale_periods(df, price_col)
            # df = adjust_for_splits(df, price_col)

            df['pct_change'] = df[price_col].pct_change()
            if df['date'].max() > latest:
                latest = df['date'].max()

            meta.append({
                'symbol': symbol,
                'currency': currency,
                'exchange_api': exchange,
                'df': df[['date', price_col, 'volume', 'pct_change']],
            })
        except Exception as e:
            print(f"ERROR {file}: {e}")

    # Explicit columns so the 'df' column exists even when nothing was loaded.
    meta = pd.DataFrame(meta, columns=['symbol', 'currency', 'exchange_api', 'df'])
    copied = meta.copy()
    # DataFrame.copy() does not copy the nested frames; copy each one explicitly.
    copied['df'] = copied['df'].apply(lambda x: x.copy())

In [None]:
# RESET
# Rebuild the working table from the cached snapshot; each nested series frame
# is copied so later in-place edits do not reach `copied`.
meta = copied.assign(df=copied['df'].apply(lambda frame: frame.copy()))

In [None]:
def _is_local_price_outlier(price, neighbours, z_limit):
    """Return True when ``price`` lies more than ``z_limit`` sample standard
    deviations from the mean of the non-NaN ``neighbours``."""
    neighbours = neighbours.dropna()
    local_std = neighbours.std()
    # Fewer than two neighbours (NaN std) or a flat window gives no usable scale.
    if pd.isna(price) or pd.isna(local_std) or local_std == 0:
        return False
    return abs(price - neighbours.mean()) / local_std > z_limit


def detect_and_nullify_global_outliers(meta_df, price_col, z_threshold=120.0, window=5):
    """Blank out prices behind extreme returns, judged against all series pooled.

    A return is a candidate when its modified z-score (distance from the pooled
    median return, in units of the pooled MAD) exceeds ``z_threshold``. For each
    candidate at row ``i`` two prices are examined:

    * the price at ``i - 1`` against the up-to-``window`` prices before it;
    * the price at ``i`` against the up-to-``window`` prices after it.

    A price more than ``z_threshold / 10`` local standard deviations from its
    window mean has its row set to NaN in the price, 'volume', 'high', 'low'
    and 'pct_change' columns (those that exist).

    Args:
        meta_df: Frame with a 'symbol' column and a 'df' column of per-series
            frames containing ``price_col`` and 'pct_change'. Modified in
            place: series with candidates get an updated frame (index reset,
            'pct_change' recomputed with ``fill_method=None``).
        price_col: Name of the price column in each series frame.
        z_threshold: Modified z-score above which a return is a candidate.
        window: Number of neighbouring rows used for the local price check.

    Returns:
        dict mapping symbol to the ``describe()`` of that series' candidate
        z-scores plus a 'new_length' entry (row count of the series frame).
    """
    all_pct_changes = pd.concat(
        [series_df['pct_change'] for series_df in meta_df['df']],
        ignore_index=True
    ).dropna()
    all_pct_changes = all_pct_changes[~np.isinf(all_pct_changes) & (all_pct_changes != 0)]

    global_median_return = all_pct_changes.median()
    global_mad = (all_pct_changes - global_median_return).abs().median()

    outlier_series = {}
    # With no usable returns (NaN MAD) or a zero MAD the z-score is undefined;
    # dividing anyway would flag every non-median return.
    if pd.isna(global_mad) or global_mad == 0:
        return outlier_series

    price_z_limit = z_threshold / 10
    for idx, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
        df = row['df'].reset_index(drop=True)
        if df['pct_change'].isnull().all():
            continue
        cols_to_null = [c for c in (price_col, 'volume', 'high', 'low', 'pct_change') if c in df.columns]

        absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
        outlier_mask = absolute_modified_z > z_threshold
        if not outlier_mask.any():
            continue

        data_dict = absolute_modified_z[outlier_mask].describe()

        # NaN cannot be stored in integer columns; make them float first.
        df[cols_to_null] = df[cols_to_null].astype(float)

        last_idx = df.index.max()
        for df_idx in df.index[outlier_mask]:
            # Price before the jump, judged against the rows preceding it.
            prev_idx = df_idx - 1
            if prev_idx >= 0:
                before = df.loc[max(0, prev_idx - window): prev_idx - 1, price_col]
                if _is_local_price_outlier(df.loc[prev_idx, price_col], before, price_z_limit):
                    df.loc[prev_idx, cols_to_null] = np.nan

            # Price after the jump, judged against the rows following it. The
            # window is bounded by the last row of the series, not by the last
            # candidate, so the final candidate still gets a look-ahead window.
            after = df.loc[df_idx + 1: min(df_idx + window, last_idx), price_col]
            if _is_local_price_outlier(df.loc[df_idx, price_col], after, price_z_limit):
                df.loc[df_idx, cols_to_null] = np.nan

        data_dict['new_length'] = len(df)
        outlier_series[row['symbol']] = data_dict

        # Recompute returns without forward-filling across the blanked rows.
        df['pct_change'] = df[price_col].pct_change(fill_method=None)

        meta_df.at[idx, 'df'] = df

    return outlier_series

# Run the pooled outlier pass over every loaded series; `meta` is updated in
# place and the per-symbol summary is kept for inspection.
window = 5
z_threshold = 50
modified_series_info = detect_and_nullify_global_outliers(
    meta,
    price_col=price_col,
    z_threshold=z_threshold,
    window=window,
)


In [None]:
# # Check global outliers ### NATURALLY PLOTS A FEW STRAGGLERS
# def global_return_filter(meta_df, z_threshold=120.0):
#     all_pct_changes = pd.concat(
#         [row['df']['pct_change'] for _, row in meta_df.iterrows()],
#         ignore_index=True
#     )
#     all_pct_changes.dropna(inplace=True)
#     all_pct_changes = all_pct_changes[~np.isinf(all_pct_changes) & (all_pct_changes != 0)]

#     global_median_return = all_pct_changes.median()
#     global_mad = (all_pct_changes - global_median_return).abs().median()

#     outlier_series = {}
#     for _, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
#         df = row['df']        
#         absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
#         if absolute_modified_z.max() > z_threshold:
#             outlier_series[row['symbol']] = absolute_modified_z.describe()

#     return outlier_series, global_mad, global_median_return

# z_threshold = 50
# globally_defective_symbols, global_mad, global_median_return = global_return_filter(meta, z_threshold=z_threshold)
# globally_defective_symbols = pd.DataFrame(globally_defective_symbols)

# meta_indexed = meta.set_index('symbol')
# for symbol in globally_defective_symbols.T.sort_values(by='max', ascending=True).index.tolist():
#     df = meta_indexed.loc[symbol, 'df'].copy()
#     df = df.reset_index(drop=True)

#     absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
#     outlier_mask = absolute_modified_z > z_threshold

#     corrected_outlier_mask = pd.Series(False, index=df.index)
#     for df_idx in df.index[outlier_mask]:
#         # Check data points before
#         price_to_check_idx = df_idx - 1
#         price_to_check = df.at[price_to_check_idx, price_col]
#         local_window_start = max(0, price_to_check_idx - window)
#         local_window = df.loc[local_window_start : price_to_check_idx - 1, price_col].dropna()
#         local_mean = local_window.mean()
#         local_std = local_window.std()
#         if local_std != 0:
#             price_z_score = abs(price_to_check - local_mean) / local_std
#             if price_z_score > z_threshold / 10:
#                 corrected_outlier_mask.at[price_to_check_idx] = True

#         # Check data points after
#         price_to_check = df.at[df_idx, price_col]
#         local_window_end = min(df_idx + window, df.index[outlier_mask].max())
#         local_window = df.loc[df_idx + 1: local_window_end, price_col].dropna()
#         local_mean = local_window.mean()
#         local_std = local_window.std()
#         if local_std != 0:
#             price_z_score = abs(price_to_check - local_mean) / local_std
#             if price_z_score > z_threshold / 10:
#                 corrected_outlier_mask.at[df_idx] = True

#     if corrected_outlier_mask.any():
#         # Plotting
#         plt.figure(figsize=(10, 6))
#         plt.plot(df['date'], df[price_col], marker='o', label='Normal')
#         plt.scatter(df.loc[corrected_outlier_mask, 'date'],
#                     df.loc[corrected_outlier_mask, price_col],
#                     color='red', label='Outlier', zorder=5)

#         plt.title(f"Symbol: {symbol}")
#         plt.xlabel("Date")
#         plt.ylabel(price_col)
#         plt.legend()
#         plt.tight_layout()
#         plt.show()

In [None]:
# Calculate series gap stats
# Every series is aligned to a common business-day calendar covering the six
# years (6 * 365 days) before the newest observation, then scored on how much
# of that calendar it is missing.
oldest = latest - pd.Timedelta(days=365 * 6)
business_days = pd.date_range(start=oldest, end=latest, freq='B')

# Calculate statistics for each DataFrame in meta
for idx, row in tqdm(meta.iterrows(), total=len(meta)):
    df = row['df']
    merged = pd.merge(pd.DataFrame({'date': business_days}), df, on='date', how='left')

    # Calculate gaps
    present = merged[price_col].notna()
    present_idx = np.flatnonzero(present)
    length = len(merged)

    # Pad with virtual observations just outside the calendar so the leading
    # gap, the internal gaps and the trailing gap all come out of one diff.
    # With no observation at all this yields a single gap of `length`.
    boundaries = np.concatenate(([-1], present_idx, [length]))
    gaps = np.diff(boundaries) - 1
    max_gap = float(gaps.max()) if (gaps > 0).any() else 0.0

    missing = length - present.sum()
    pct_missing = missing / length

    # Update meta with statistics
    meta.at[idx, 'df'] = merged
    meta.at[idx, 'max_gap'] = max_gap
    meta.at[idx, 'missing'] = missing
    meta.at[idx, 'pct_missing'] = pct_missing

print(f'Latest: {latest}')
print(f'Oldest: {oldest}')

In [None]:
# Remove series with large day gaps
# The longest gap is put on a log scale and normalised by its maximum; the
# exclusion score adds that to the share of missing days.
gap_log = np.log1p(meta['max_gap'])
meta['max_gap_log'] = gap_log / gap_log.max()
meta['exclusion_score'] = meta['pct_missing'] + meta['max_gap_log']

# Keep only series that are below the mean on both measures.
gap_below_mean = meta['max_gap_log'].lt(meta['max_gap_log'].mean())
missing_below_mean = meta['pct_missing'].lt(meta['pct_missing'].mean())
condition = gap_below_mean & missing_below_mean
filtered = meta.loc[condition].sort_values(by='exclusion_score', ascending=False).copy()

In [None]:
# Interpolate/extrapolate price column and merge with fund
for idx, row in tqdm(filtered.iterrows(), total=len(filtered)):
    df = row['df']

    # Akima interpolation first; whatever is still missing afterwards is
    # forward-filled and then back-filled.
    prices = df[price_col].interpolate(method='akima', limit_direction='both')
    if prices.isna().any():
        prices = prices.ffill().bfill()
    df[price_col] = prices

    df['pct_change'] = df[price_col].pct_change()
    filtered.at[idx, 'df'] = df.set_index('date')

# Attach fundamentals and drop the gap-screening helper columns.
filtered = pd.merge(filtered, fund_df, on=['symbol', 'currency'], how='inner').drop(
    columns=['max_gap', 'missing', 'pct_missing', 'exclusion_score', 'max_gap_log']
)

del meta, fund_df

In [None]:
# # Delete duplicates ETF files
# duplicates = meta[meta.duplicated(subset=['symbol', 'currency'], keep=False)].copy()
# duplicates['not_smart'] = duplicates['exchange_api'] != 'SMART'

# sorted_duplicates = duplicates.sort_values(
#     by=['symbol', 'currency', 'length', 'not_smart'],
#     ascending=[True, True, False, False]
# )

# rows_to_keep = sorted_duplicates.groupby(['symbol', 'currency']).head(1)
# rows_to_delete = duplicates[~duplicates.index.isin(rows_to_keep.index)]
# for idx, row in rows_to_delete.iterrows():
#     file_name = f"{row['symbol']}-{row['exchange_api']}-{row['currency']}.csv"
#     file_path = os.path.join(data_path, file_name)
#     if os.path.exists(file_path):
#         os.remove(file_path)
#         print(f"Deleted {file_path}")
#     else:
#         print(f"File not found: {file_path}")

# del duplicates, sorted_duplicates, rows_to_keep, rows_to_delete

# Plot asset class portfolios

In [None]:
# Risk-free series calculation
# (pandas and pandas_datareader are already imported in the notebook's first cell.)

# 3-month bill/interest rate tickers (FRED/OECD) for each country
tickers = {
    'US': 'DTB3',
    'Canada': 'IR3TIB01CAM156N',
    'Germany': 'IR3TIB01DEM156N',
    'UK': 'IR3TIB01GBM156N',
    'France': 'IR3TIB01FRA156N',
}

# Fetch each series and convert from percentage to decimal
bonds = {}
failed = []
for country, ticker in tickers.items():
    # Try FRED first, then OECD; a country is dropped only when both fail.
    for source in ('fred', 'oecd'):
        try:
            bonds[country] = web.DataReader(ticker, source, oldest, latest) / 100.0
            break
        except Exception:
            continue
    else:
        failed.append(country)

if failed:
    print(f'Could not download short-term rates for: {failed}')
if not bonds:
    raise RuntimeError('No short-term rate series could be downloaded from FRED or OECD.')

# Combine into a single DataFrame
df_bonds = pd.concat(bonds, axis=1)
df_bonds.columns = list(bonds)
df_bonds = df_bonds.interpolate(method='akima').bfill().ffill()

# The API key is read from the environment; credentials must not live in the notebook.
fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError('Set the FRED_API_KEY environment variable to download CPI data from FRED.')
fred = Fred(api_key=fred_api_key)
cpi_data = fred.get_series('CPIAUCSL', oldest, latest)
risk_free_df = pd.concat([df_bonds.mean(axis=1).rename('nominal_rate'), cpi_data.rename('cpi')], axis=1)

# Match with the other price series
risk_free_df = risk_free_df.reindex(business_days)
risk_free_df = risk_free_df.interpolate(method='akima').bfill().ffill()

risk_free_df['inflation_rate'] = risk_free_df['cpi'].pct_change()
# Rate per trading day: the nominal rate divided by 252.
risk_free_df['daily_nominal_rate'] = risk_free_df['nominal_rate'] / 252
# risk_free_df['real_rate'] = (1 + risk_free_df['daily_nominal_rate']) / (1 + risk_free_df['inflation_rate']) - 1

print(f'Short-term bonds used from: {df_bonds.columns.to_list()}')

In [None]:
# Add pct_change cols to dfs and create pct_changes
cols_to_exclude = ['conId']
numerical_cols = [
    col for col in filtered.columns
    if col not in cols_to_exclude and filtered[col].dtype in (np.int64, np.float64)
]

# One return column per instrument, labelled by its conId.
pct_changes = pd.concat(
    [frame['pct_change'].rename(con_id) for con_id, frame in zip(filtered['conId'], filtered['df'])],
    axis=1,
)

# Remove uninformative cols for market portfolios
# (numeric columns with at most one distinct non-null value, then all-NaN columns)
is_constant = filtered[numerical_cols].nunique(dropna=True) <= 1
uninformative_cols = is_constant[is_constant].index.tolist()
filtered = filtered.drop(columns=uninformative_cols).dropna(axis=1, how='all')

In [None]:
# Add rate of change fundamentals
def calculate_slope(value1, value2, time1, time2):
    """Return the slope of the line through (time1, value1) and (time2, value2)."""
    rise = value1 - value2
    run = time1 - time2
    return rise / run


# Each tuple lists the same fundamental measured over increasing horizons
# (1yr, 3yr and, for EPS growth, 5yr).
rate_fundamentals = [('EPSGrowth-1yr', 'EPS_growth_3yr', 'EPS_growth_5yr'),
                     ('ReturnonAssets1Yr', 'ReturnonAssets3Yr'),
                     ('ReturnonCapital', 'ReturnonCapital3Yr'),
                     ('ReturnonEquity1Yr', 'ReturnonEquity3Yr'),
                     ('ReturnonInvestment1Yr', 'ReturnonInvestment3Yr')]

for cols in rate_fundamentals:
    base_name = cols[0].replace('-1yr', '').replace('1Yr', '')
    slope_col = f'fundamentals_{base_name}_slope'

    # Two horizons: a single 1yr-to-3yr slope.
    if len(cols) == 2:
        col_1yr, col_3yr = cols
        filtered[slope_col] = calculate_slope(
            filtered[f'fundamentals_{col_1yr}'],
            filtered[f'fundamentals_{col_3yr}'],
            1, 3
        )
        continue

    if len(cols) != 3:
        continue

    # Three horizons: the overall slope spans 1yr to 5yr.
    col_1yr, col_3yr, col_5yr = cols
    filtered[slope_col] = calculate_slope(
        filtered[f'fundamentals_{col_1yr}'],
        filtered[f'fundamentals_{col_5yr}'],
        1, 5
    )

    if 'EPS' not in base_name:
        continue

    # For EPS the change between the two partial slopes is stored as a
    # second-derivative column.
    slope_1yr_3yr = calculate_slope(
        filtered[f'fundamentals_{col_1yr}'],
        filtered[f'fundamentals_{col_3yr}'],
        1, 3
    )
    slope_3yr_5yr = calculate_slope(
        filtered[f'fundamentals_{col_3yr}'],
        filtered[f'fundamentals_{col_5yr}'],
        3, 5
    )

    second_deriv_col = f'fundamentals_{base_name}_second_deriv'
    filtered[second_deriv_col] = calculate_slope(
        slope_1yr_3yr,
        slope_3yr_5yr,
        1, 3
    )

# Add new cols to numericals
numerical_cols = [
    col for col in filtered.columns
    if filtered[col].dtype in (np.int64, np.float64) and col not in cols_to_exclude
]

In [None]:
# ETF return stats and split training and tests sets
def get_return_stats(df, training_cutoff, momentum_cutoffs, risk_free_df):
    """Summarise one series' returns over the rows before ``training_cutoff``.

    Args:
        df: Date-indexed frame with 'pct_change' and 'volume' columns.
        training_cutoff: Only rows with an index strictly before this are used.
        momentum_cutoffs: Mapping with keys '3mo', '6mo', '1y', '3y' giving the
            start of each momentum window.
        risk_free_df: Date-indexed frame with a 'daily_nominal_rate' column.

    Returns:
        pd.Series with the mean return since each momentum cutoff
        ('momentum_*'), the mean return ('stats_er'), mean over standard
        deviation of the excess return ('stats_sharpe'), return standard
        deviation ('stats_std') and mean volume ('stats_avg_volume').
    """
    in_sample = df.loc[df.index < training_cutoff]
    rf_in_sample = risk_free_df.loc[risk_free_df.index < training_cutoff]

    returns = in_sample['pct_change']
    excess = returns - rf_in_sample['daily_nominal_rate']

    stats = {
        f'momentum_{horizon}': returns[in_sample.index >= momentum_cutoffs[horizon]].mean()
        for horizon in ('3mo', '6mo', '1y', '3y')
    }
    stats['stats_er'] = returns.mean()
    stats['stats_sharpe'] = excess.mean() / excess.std()
    stats['stats_std'] = returns.std()
    stats['stats_avg_volume'] = in_sample['volume'].mean()
    return pd.Series(stats)

# The final 365 days are held out; momentum windows are measured backwards
# from the training cutoff.
training_cutoff = latest - pd.Timedelta(days=365)
momentum_cutoffs = {
    label: training_cutoff - pd.Timedelta(days=days_back)
    for label, days_back in (
        ('3y', 365 * 3),
        ('1y', 365),
        ('6mo', 365 // 2),
        ('3mo', 365 // 4),
    )
}

# Apply to each row
filtered[[
    'momentum_3mo', 'momentum_6mo', 'momentum_1y', 'momentum_3y',
    'stats_er', 'stats_sharpe', 'stats_std', 'stats_avg_volume',
]] = filtered['df'].apply(
    get_return_stats,
    args=(training_cutoff, momentum_cutoffs, risk_free_df),
)


In [None]:
# Create all asset type indices/portfolios
# (matplotlib.pyplot is already imported as plt in the notebook's first cell.)
# One cap-weighted portfolio per 'holding_*' column plus a 'total' portfolio
# weighted by market cap alone.
holding_cols = [col for col in filtered.columns if col.startswith('holding_') and col != 'holding_types_variety'] + ['total']
portfolio_dfs = {}
initial_price = 1

for holding_col in holding_cols:
    name = holding_col.split('_')[-1]
    if holding_col == 'total':
        weight = filtered['profile_cap_usd']
    else:
        weight = filtered['profile_cap_usd'] * filtered[holding_col]

    total_market_cap = weight.sum()

    # Weights are keyed by conId to line up with the columns of pct_changes.
    # A missing cap or holding share counts as zero weight; a NaN weight would
    # otherwise turn every portfolio return into NaN in the dot product.
    weights = pd.Series(
        (weight / total_market_cap).to_numpy(),
        index=filtered['conId'],
        name='weight',
    ).fillna(0)

    portfolio_return = pct_changes.dot(weights)
    portfolio_price = initial_price * (1 + portfolio_return.fillna(0)).cumprod()

    portfolio_df = pd.DataFrame({
        'date': portfolio_price.index,
        price_col: portfolio_price.values,
        'pct_change': portfolio_return.values
    }).set_index('date')

    portfolio_dfs[name] = portfolio_df

    plt.figure(figsize=(10, 6))
    # Format spec inside the f-string: reusing the f-string's own quote
    # character in a nested expression is a SyntaxError before Python 3.12.
    plt.title(f'{name.capitalize()} portfolio  -  ${total_market_cap:,.0f}')
    plt.plot(portfolio_df.index, portfolio_df[price_col], marker='o')
    plt.show()

# No temporary column is written any more; drop one left over from an earlier run.
filtered = filtered.drop(columns='weight', errors='ignore')

In [None]:
# # Manual plot
# symbol_test = 'SHV'
# x = filtered[filtered['symbol'] == symbol_test].df.iloc[0].index
# y = filtered[filtered['symbol'] == symbol_test].df.iloc[0]['average']#.pct_change()

# y = risk_free_df['daily_nominal_rate']
# x = risk_free_df.index
# # y = df_bonds['UK']
# # x = df_bonds.index
# # y = portfolio_dfs['equity']['pct_change'] - risk_free_df['daily_nominal_rate']
# # x = portfolio_dfs['equity'].index

# plt.figure(figsize=(10, 6))
# plt.plot(x, y, marker='o')
# # plt.xlim(market_portfolio_df['date'].min(), market_portfolio_df['date'].max())
# plt.show()

In [None]:
# Avoid dummy trap
# For each category, the listed catch-all sub-categories are dropped.
empty_subcategories = {
    'holding_types': ['other'],
    'countries': ['Unidentified'],
    'currencies': ['<NoCurrency>'],
    'industries': ['NonClassifiedEquity', 'NotClassified-NonEquity'],
    'top10': ['OtherAssets', 'AccountsPayable','AccountsReceivable','AccountsReceivable&Pay','AdministrationFees','CustodyFees','ManagementFees','OtherAssetsandLiabilities','OtherAssetslessLiabilities', 'OtherFees','OtherLiabilities','Tax','Tax--ManagementFees'],
    'debtors': ['OTHER'],
    'maturity': ['%MaturityOther'],
    'debt_type': ['%QualityNotAvailable', '%QualityNotRated'],
    'manual': ['other']
}

# Column names are '<category>_<sub-category>', except entries under 'manual',
# which are already full column names.
dummy_trap_cols = [
    sub if group == 'manual' else f'{group}_{sub}'
    for group, subs in empty_subcategories.items()
    for sub in subs
]

filtered = filtered.drop(columns=dummy_trap_cols, errors='ignore')
numerical_cols = [
    col for col in filtered.columns
    if filtered[col].dtype in (np.int64, np.float64) and col not in cols_to_exclude
]

In [None]:
# Reorganize columns
columns_to_move = ['bond', 'equity', 'cash', 'other', 'tradable']
categories = ['holding_types', 'stats', 'momentum', 'profile', 'top10', 'countries', 'fundamentals', 'industries', 'currencies', 'debtors', 'maturity', 'debt_type', 'lipper', 'dividends', 'marketcap', 'style', 'domicile', 'asset']

metadata = [col for col in numerical_cols if col not in ['conId']]
non_numerical = [col for col in filtered.columns if col not in metadata]

# Group numeric columns by category prefix, in the order of `categories`.
# The stable sort on one "does not match" flag per category keeps columns in
# their current relative order within a category and puts columns matching no
# category last.
metadata = sorted(
    metadata,
    key=lambda col: tuple(not col.startswith(category) for category in categories),
)

new_column_order = non_numerical + metadata
filtered = filtered[new_column_order]

# Regression analysis

In [None]:
def _leg_weights(full_meta_df, returns_df, con_ids, factor_column, label, favour_low):
    """Normalised weights, aligned to ``returns_df.columns``, for one leg.

    Members start with their 'profile_cap_usd'. With a ``factor_column`` the
    weight is multiplied by the member's position in the column's min-max
    range, measured from the max when ``favour_low`` is true and from the min
    otherwise; the tilt is skipped when it would sum to zero.
    """
    leg_df = full_meta_df[full_meta_df['conId'].isin(con_ids)].set_index('conId')
    weights = leg_df['profile_cap_usd'].reindex(returns_df.columns).fillna(0)
    if weights.mean() == 0:
        # Diagnostic output for a leg that carries no market cap at all.
        print(f'{label} {factor_column}')
        print(leg_df.index)
        print()
    if factor_column:
        col_max = full_meta_df[factor_column].max()
        col_min = full_meta_df[factor_column].min()
        if favour_low:
            tilt = (col_max - leg_df[factor_column]) / (col_max - col_min)
        else:
            tilt = (leg_df[factor_column] - col_min) / (col_max - col_min)
        tilt = tilt.reindex(returns_df.columns).fillna(0)
        if tilt.sum() != 0:
            weights *= tilt

    weights /= weights.sum()
    return weights


def construct_long_short_factor_returns(full_meta_df, returns_df, long_symbols, short_symbols, factor_column=None):
    """Return the long-leg minus short-leg return series.

    Args:
        full_meta_df: Frame with 'conId', 'profile_cap_usd' and, when used,
            ``factor_column``.
        returns_df: Returns with one column per conId.
        long_symbols: conIds forming the long leg.
        short_symbols: conIds forming the short leg.
        factor_column: Optional column used to tilt the cap weights; the long
            leg favours low values, the short leg high values.

    Returns:
        Series of weighted long returns minus weighted short returns.
    """
    long_weights = _leg_weights(full_meta_df, returns_df, long_symbols, factor_column, 'Long', True)
    long_returns = returns_df.dot(long_weights)

    short_weights = _leg_weights(full_meta_df, returns_df, short_symbols, factor_column, 'Short', False)
    short_returns = returns_df.dot(short_weights)

    return long_returns - short_returns

In [None]:
# Construct all factors from metadata + a few custom factors

# Absolute difference between the leg sizes of every factor built; appended to
# on each construct_factors call.
differences = []


def _without_overlap(first, second):
    """Return both lists with the ids that appear in both removed."""
    overlap = set(first) & set(second)
    return ([s for s in first if s not in overlap],
            [s for s in second if s not in overlap])


def construct_factors(filtered, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=0.5):
    """Build one long-short return series per factor.

    Factors produced:
        * 'market_premium': equity portfolio return minus the daily nominal rate.
        * 'smb': long 'marketcap_small' == 1, short 'marketcap_large' == 1.
        * 'hml': long any non-zero 'style_*value' column, short any non-zero
          'style_*growth' column.
        * one factor per numeric metadata column: long the instruments at or
          below mean - scaling_factor * std, short those at or above
          mean + scaling_factor * std (bounds clipped to the column's min/max).

    Instruments that fall in both legs of 'smb' or 'hml' are removed from both.

    Reads the module-level ``numerical_cols`` and appends the leg-size
    difference of every factor to the module-level ``differences``.

    Args:
        filtered: Instrument table with 'conId', 'profile_cap_usd' and the
            factor columns.
        pct_changes: Returns with one column per conId.
        portfolio_dfs: Mapping that contains an 'equity' frame with 'pct_change'.
        risk_free_df: Frame with a 'daily_nominal_rate' column.
        scaling_factor: Number of standard deviations defining the legs.

    Returns:
        DataFrame with one column per factor.
    """
    factors = {}
    # Market risk premium
    factors['market_premium'] = (portfolio_dfs['equity']['pct_change'] - risk_free_df['daily_nominal_rate'])

    # SMB_ETF
    small_symbols, large_symbols = _without_overlap(
        filtered[filtered['marketcap_small'] == 1]['conId'].tolist(),
        filtered[filtered['marketcap_large'] == 1]['conId'].tolist(),
    )
    factors['smb'] = construct_long_short_factor_returns(filtered, pct_changes, small_symbols, large_symbols)
    differences.append(np.abs(len(small_symbols) - len(large_symbols)))

    # HML_ETF
    value_cols = [col for col in filtered.columns if col.startswith('style_') and col.endswith('value')]
    growth_cols = [col for col in filtered.columns if col.startswith('style_') and col.endswith('growth')]
    value_symbols, growth_symbols = _without_overlap(
        filtered[filtered[value_cols].ne(0).any(axis=1)]['conId'].tolist(),
        filtered[filtered[growth_cols].ne(0).any(axis=1)]['conId'].tolist(),
    )
    factors['hml'] = construct_long_short_factor_returns(filtered, pct_changes, value_symbols, growth_symbols)
    differences.append(np.abs(len(value_symbols) - len(growth_symbols)))

    # Metadata
    # Iterate the filtered list so the identifier column can never become a
    # factor, whatever numerical_cols currently contains.
    metadata = [col for col in numerical_cols if col not in ['conId']]
    for col in metadata:
        std = filtered[col].std()
        mean = filtered[col].mean()

        upper_boundary = min(filtered[col].max(), mean + (scaling_factor * std))
        lower_boundary = max(filtered[col].min(), mean - (scaling_factor * std))

        low_factor_symbols = filtered[filtered[col] <= lower_boundary]['conId'].tolist()
        high_factor_symbols = filtered[filtered[col] >= upper_boundary]['conId'].tolist()

        factors[col] = construct_long_short_factor_returns(
            filtered, pct_changes, low_factor_symbols, high_factor_symbols, factor_column=col
        )
        differences.append(np.abs(len(low_factor_symbols) - len(high_factor_symbols)))

    return pd.DataFrame(factors)#.fillna(0)

factors_df = construct_factors(filtered, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=0.5) # Optimized

In [None]:
def prescreen_factors(factors_df, correlation_threshold=0.80):
    """Drop factors that are almost collinear with an earlier factor.

    Pairs are visited from the highest absolute correlation downwards until it
    falls below ``correlation_threshold``. For each pair in which neither
    column has been dropped yet, the column that appears later in
    ``factors_df`` is dropped and recorded under the earlier one.

    Args:
        factors_df: Non-empty DataFrame with one column per factor.
        correlation_threshold: Minimum absolute correlation for a pair to count
            as redundant.

    Returns:
        (reduced_df, drop_map): a copy of ``factors_df`` without the dropped
        columns, and a dict mapping each surviving keeper to the sorted list of
        columns dropped because of it, directly or through a keeper that was
        itself dropped later.

    Raises:
        ValueError: If ``factors_df`` is None, empty or has no columns.
    """
    if factors_df is None or factors_df.empty or factors_df.shape[1] == 0:
        raise ValueError("factors_df must be a non-empty DataFrame with at least one column.")
    temp_factors_df = factors_df.copy()

    corr_matrix = temp_factors_df.corr().abs()
    upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    # Drop NaN explicitly: the masked lower triangle and undefined correlations
    # must never reach the loop, and stack() only removes them implicitly in
    # its legacy implementation.
    corr_pairs = (
        corr_matrix.where(upper_triangle)
        .stack()
        .dropna()
        .sort_values(ascending=False)
    )

    # First position of each column, used to decide which one of a pair is kept.
    position = {}
    for i, col in enumerate(temp_factors_df.columns):
        position.setdefault(col, i)

    drop_map = {}
    cols_to_drop = set()
    for (col1, col2), corr_val in corr_pairs.items():
        if corr_val < correlation_threshold:
            break
        if col1 in cols_to_drop or col2 in cols_to_drop:
            continue

        if position[col1] < position[col2]:
            keeper, to_drop = col1, col2
        else:
            keeper, to_drop = col2, col1

        drop_map.setdefault(keeper, []).append(to_drop)
        cols_to_drop.add(to_drop)

    # Merge drop results: a keeper that was dropped later hands its own drops
    # over to the keeper that survived.
    final_drop_map = {}
    for keeper, direct_drops in drop_map.items():
        if keeper in cols_to_drop:
            continue
        pending = list(direct_drops)
        all_related_drops = set(direct_drops)
        while pending:
            col = pending.pop()
            for inherited in drop_map.get(col, []):
                if inherited not in all_related_drops:
                    all_related_drops.add(inherited)
                    pending.append(inherited)
        final_drop_map[keeper] = sorted(all_related_drops)

    temp_factors_df = temp_factors_df.drop(columns=list(cols_to_drop))
    return temp_factors_df, final_drop_map

# Only factors that are practically duplicates (|corr| >= 0.99) are removed here.
distilled_factors, col_map = prescreen_factors(
    factors_df,
    correlation_threshold=0.99,
)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df):
    """Return a (feature, VIF) table for the columns of ``df``, highest VIF first.

    The variance inflation factor of each column is computed with
    ``variance_inflation_factor`` on ``df.values``.
    """
    scores = [variance_inflation_factor(df.values, position) for position in range(df.shape[1])]
    table = pd.DataFrame({"feature": df.columns, "VIF": scores})
    return table.sort_values(by='VIF', ascending=False)

    
# max_vif_threshold = 99999
# while True:
#     vif_df = calculate_vif(distilled_factors.fillna(0))
#     highest_vif = vif_df['VIF'].iloc[0]
#     if highest_vif > max_vif_threshold and distilled_factors.shape[1] > 2:
#         feature_to_drop = vif_df['feature'].iloc[0]
#         distilled_factors.drop(columns=[feature_to_drop], inplace=True)
#         cols_dropped.add(feature_to_drop)
#         print(f'{feature_to_drop} - {highest_vif}')
#     else:
#         break

In [None]:
# Sweep the factor scaling_factor from 0 to 1.95 in steps of 0.05 and record,
# for each value, the largest VIF among the prescreened factors.
# NOTE: the loop rebinds the notebook-level `factors_df` and `distilled_factors`,
# so after this cell they hold the result for the LAST z value, not the earlier ones.
scales = {}
z_range = np.arange(0,2,0.05)
for z in tqdm(z_range, total=len(z_range)):
    factors_df = construct_factors(filtered, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=z)
    distilled_factors, _ = prescreen_factors(factors_df, correlation_threshold=0.99)

    # calculate_vif sorts by VIF descending, so row 0 is the worst factor.
    vif_df = calculate_vif(distilled_factors.fillna(0))
    scales[z] = vif_df['VIF'].iloc[0]

# x: scaling factors, y: corresponding maximum VIF
x = [z for z in scales]
y = [vif for vif in scales.values()]

plt.figure(figsize=(10, 6))
plt.plot(x, y, marker='o')
plt.show()

In [None]:
# OLS Regression function
import statsmodels.api as sm

def run_regressions(distilled_factors):
    """Fit one OLS regression of excess return on the factors per column of ``pct_changes``.

    Reads the notebook-level ``pct_changes`` and ``risk_free_df``. For each
    column the daily risk-free rate is subtracted, rows with any missing value
    are discarded, and the excess return is regressed on a constant plus
    ``distilled_factors``.

    Returns a DataFrame with one row per column: fit diagnostics, the
    intercept statistics (``alpha*``) and, per factor, ``beta_``, ``pval_beta_``,
    ``tval_beta_`` and ``bse_beta_`` columns.
    """
    factor_names = distilled_factors.columns
    rows = []
    for symbol in tqdm(pct_changes.columns, desc="Running Regressions"):
        excess = (pct_changes[symbol] - risk_free_df['daily_nominal_rate']).rename('etf_excess')
        frame = pd.concat([excess, distilled_factors], axis=1).dropna()

        # Column 0 is the target; everything after it is a regressor.
        target = frame['etf_excess']
        design = sm.add_constant(frame.iloc[:, 1:])
        fit = sm.OLS(target, design).fit()

        row = dict(
            conId=symbol,
            nobs=fit.nobs,
            r_squared=fit.rsquared,
            r_squared_adj=fit.rsquared_adj,
            f_statistic=fit.fvalue,
            f_pvalue=fit.f_pvalue,
            aic=fit.aic,
            bic=fit.bic,
            condition_number=fit.condition_number,
            alpha=fit.params['const'],
            alpha_pval=fit.pvalues['const'],
            alpha_tval=fit.tvalues['const'],
            alpha_bse=fit.bse['const'],
        )
        for factor in factor_names:
            row.update({
                f'beta_{factor}': fit.params[factor],
                f'pval_beta_{factor}': fit.pvalues[factor],
                f'tval_beta_{factor}': fit.tvalues[factor],
                f'bse_beta_{factor}': fit.bse[factor],
            })
        rows.append(row)

    return pd.DataFrame(rows)
    # del X, Y, model, data, etf_excess, result, results

In [None]:
# PCR
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from tqdm.auto import tqdm

def run_pcr(factors_df,
            pct_changes,
            risk_free_df,
            cv=5,
            random_state=42):
    """Principal-component regression of each asset's excess return on the factors.

    For every column of ``pct_changes`` a scaler -> PCA -> linear-regression
    pipeline is grid-searched over the number of components (scored by
    cross-validated MSE), and the winning model is mapped back to
    coefficients on the original, unscaled factors.

    Parameters
    ----------
    factors_df : pandas.DataFrame
        Factor series (regressors), one column per factor.
    pct_changes : pandas.DataFrame
        Asset returns, one column per asset, indexed like ``factors_df``.
    risk_free_df : pandas.DataFrame
        Must contain a ``daily_nominal_rate`` column, indexed like ``factors_df``.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed passed to PCA.

    Returns
    -------
    (pandas.DataFrame, dict)
        ``summary_df`` with one row per successfully fitted asset
        (``conId``, ``intercept``, ``best_n_components``, ``cv_mse_mean`` and
        one ``coef_<factor>`` column per factor), and a dict mapping each asset
        to its Series of mean CV MSE indexed by number of components.
        Assets whose fit raises are reported and skipped.
    """
    factor_cols = list(factors_df.columns)

    # Align factors, asset returns and the risk-free rate on a common index.
    # The targets must live in `data`: the factor frame alone has no asset
    # columns, and the raw risk-free series may not match its rows.
    data = (
        factors_df.copy()
        .join(pct_changes, how='inner')
        .join(risk_free_df[['daily_nominal_rate']], how='inner')
        .fillna(0)
    )
    X = data[factor_cols]
    risk_free = data['daily_nominal_rate'].values
    n_samples, n_factors = X.shape
    # PCA cannot extract more components than the smallest training fold allows.
    search_limit = min(n_factors, int(n_samples * (1 - 1/cv)) - 1)

    summary_rows = []
    pcr_cv_scores = {}
    neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False) # GridSearchCV maximizes the score, so we use negative MSE
    for etf in tqdm(pct_changes.columns, desc="PCR Regression"):
        Y = data[etf].values - risk_free

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(random_state=random_state)),
            ('reg', LinearRegression())
        ])

        param_grid = {
            'pca__n_components': list(range(1, search_limit + 1))
        }

        grid_search = GridSearchCV(estimator=pipeline,
                                   param_grid=param_grid,
                                   scoring=neg_mse_scorer,
                                   cv=cv,
                                   n_jobs=-1)

        try:
            grid_search.fit(X.values, Y)

            best_pipeline = grid_search.best_estimator_
            best_n_components = grid_search.best_params_['pca__n_components']
            scaler_step = best_pipeline.named_steps['scaler']
            pca_step = best_pipeline.named_steps['pca']
            reg_step = best_pipeline.named_steps['reg']

            # Calculate effective coefficients on the original factors:
            # y = b0 + ((x - mu) / s - m) @ C.T @ g, with C the PCA loadings,
            # g the regression coefficients, mu/s the scaler mean/scale and
            # m the PCA mean.
            effective_betas_scaled = pca_step.components_.T @ reg_step.coef_
            effective_betas_unscaled = effective_betas_scaled / scaler_step.scale_

            # Express the intercept in original factor units as well, so that
            # intercept + sum(coef * factor) reproduces the pipeline's prediction.
            intercept = (
                reg_step.intercept_
                - effective_betas_scaled @ pca_step.mean_
                - effective_betas_unscaled @ scaler_step.mean_
            )

            # Store CV results for this asset
            cv_results_df = pd.DataFrame(grid_search.cv_results_)
            mean_mse_scores = -cv_results_df.set_index(cv_results_df['param_pca__n_components'])['mean_test_score']
            pcr_cv_scores[etf] = mean_mse_scores
            best_cv_mse = mean_mse_scores.loc[best_n_components]

            row = {
                'conId': etf,
                'intercept': intercept,
                'best_n_components': best_n_components,
                'cv_mse_mean': best_cv_mse
            }

            for coef, fname in zip(effective_betas_unscaled, factor_cols):
                row[f'coef_{fname}'] = coef

            summary_rows.append(row)

        except Exception as e:
            # Best effort: one failing asset must not abort the whole sweep.
            print(f"Skipping {etf} due to fitting error: {e}")
            continue

    summary_df = pd.DataFrame(summary_rows)
    return summary_df, pcr_cv_scores

# results_pcr, pcr_cv_stats = run_pcr(
#     factors_df=distilled_factors,
#     pct_changes=pct_changes,
#     risk_free_df=risk_free_df,
#     cv=4
# )

In [None]:
# NOTE(review): ad-hoc inspection of the first ElasticNet row. `metrics` is only
# created in the ElasticNet cell further down, so this cell fails on a fresh
# kernel when the notebook is run top to bottom.
metrics[0]['l1_ratio']

In [None]:
# ElasticNet regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Rows produced by the most recent run_elastic_net call; kept at notebook level
# so partial results can be inspected if a run is interrupted.
metrics = []

def run_elastic_net(factors_df,
                    pct_changes,
                    risk_free_df,
                    training_cutoff,
                    alphas=np.logspace(-4, 1, 50),
                    l1_ratio=[.1, .5, .9],
                    cv=5,
                    random_state=42):
    """Fit a cross-validated elastic net of each asset's excess return on the factors.

    Parameters
    ----------
    factors_df : pandas.DataFrame
        Factor series (regressors), one column per factor.
    pct_changes : pandas.DataFrame
        Asset returns, one column per asset, indexed like ``factors_df``.
    risk_free_df : pandas.DataFrame
        Must contain a ``daily_nominal_rate`` column.
    training_cutoff
        Rows with index < cutoff are used for training, the rest for testing.
    alphas, l1_ratio, cv, random_state
        Forwarded to ``ElasticNetCV``.

    Returns
    -------
    pandas.DataFrame
        One row per fitted asset, indexed by ``conId``: unscaled intercept
        (``alpha``), selected hyper-parameters, CV / train / test errors and
        one ``beta_<factor>`` column per factor. Assets whose fit raises
        ``ValueError`` are reported and skipped.

    Raises
    ------
    ValueError
        If the cutoff leaves the training or the test window empty.

    Side effects
    ------------
    Rebinds the notebook-level ``metrics`` list to the rows of this call.
    """
    global metrics
    # Start from a fresh list: appending to the previous run's rows would make
    # every re-run return duplicated conIds.
    metrics = []

    data = (
        factors_df.copy()
        .join(pct_changes, how='inner')
        .join(risk_free_df[['daily_nominal_rate']], how='inner')
        .fillna(0)
    )

    train = data[data.index < training_cutoff]
    test = data[data.index >= training_cutoff]
    if train.empty or test.empty:
        raise ValueError(
            f"training_cutoff={training_cutoff!r} leaves {len(train)} training rows "
            f"and {len(test)} test rows; both windows must be non-empty."
        )

    X_train = train[factors_df.columns].values
    X_test = test[factors_df.columns].values

    for etf in tqdm(pct_changes.columns, desc="Elastic Net Regression"):
        Y_train = train[etf].values - train['daily_nominal_rate'].values
        Y_test = test[etf].values - test['daily_nominal_rate'].values

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('enet', ElasticNetCV(alphas=alphas,
                             l1_ratio=l1_ratio,
                             cv=cv,
                             random_state=random_state,
                             max_iter=499999,
                            #  tol=5e-5,
                             fit_intercept=True,
                             n_jobs=-1)),
        ])

        try:
            pipeline.fit(X_train, Y_train)
        except ValueError as e:
            print(f"Skipping {etf} due to error: {e}")
            continue

        # Unscale coefficients and intercept so they apply to the raw factors
        enet = pipeline.named_steps['enet']
        scaler = pipeline.named_steps['scaler']
        betas_train = enet.coef_ / scaler.scale_
        intercept = enet.intercept_ - np.dot(betas_train, scaler.mean_)

        # out-of-sample stats
        er_test = pipeline.predict(X_test)

        # in-sample stats
        er_train = pipeline.predict(X_train)

        # The fold axis is always the last one; axis=-1 also covers a scalar
        # l1_ratio, where mse_path_ has no leading l1_ratio axis.
        cv_mse = enet.mse_path_.mean(axis=-1)

        row = {
            'conId': etf,
            'alpha': intercept,
            'enet_alpha': enet.alpha_,
            'l1_ratio': enet.l1_ratio_,
            'n_iter': enet.n_iter_,
            'dual_gap': enet.dual_gap_,
            'n_nonzero': np.sum(np.abs(betas_train) > 1e-6),
            'mse_path_grid': enet.mse_path_,
            'cv_mse_best': np.min(cv_mse),
            'cv_mse_average': np.mean(cv_mse),
            'cv_mse_worst': np.max(cv_mse),
            'mse_test' : mean_squared_error(Y_test, er_test),
            'mse_train' : mean_squared_error(Y_train, er_train),
            'r2_test' : r2_score(Y_test, er_test),
            'r2_train' : r2_score(Y_train, er_train),
        }

        # Map back coefficients to factor names
        for coef, fname in zip(betas_train, factors_df.columns):
            row[f'beta_{fname}'] = coef

        metrics.append(row)

    if not metrics:
        # Every asset was skipped: return an empty frame with the usual index name
        # instead of failing on a missing 'conId' column.
        return pd.DataFrame(index=pd.Index([], name='conId'))

    results_df = pd.DataFrame(metrics).set_index('conId')
    return results_df

# Fit the elastic nets on the prescreened factors. The l1_ratio grid passed here
# replaces the function default; the returned frame is indexed by conId.
results_df = run_elastic_net(
    factors_df=distilled_factors,
    pct_changes=pct_changes,
    risk_free_df=risk_free_df,
    training_cutoff=training_cutoff,
    alphas=np.logspace(-4, 1, 50),
    l1_ratio=[0.3, 0.5, 0.7, 0.9, 0.95, 1],
    cv=5
)

In [None]:
# Compute factor premia and expected returns
# Factor premium = mean of each prescreened factor series.
factor_premia = distilled_factors.mean()

# NOTE(review): `results_enet` is not assigned in the cells shown here. The
# 'coef_' prefix, the 'intercept' column and the un-indexed 'conId' column
# match what run_pcr returns; run_elastic_net returns 'beta_*' / 'alpha'
# columns on a frame already indexed by conId. Confirm which result is meant.
betas = (
    results_enet
    # results_pcr
    .set_index('conId')
    .filter(like='coef_')
    .rename(columns=lambda c: c.replace('coef_', ''))
)
# Put the beta columns in the same order as the factor premia.
betas = betas[distilled_factors.columns]

# Expected excess return per asset = sum(beta * premium) + intercept.
expected_excess = betas.dot(factor_premia) + results_enet.set_index('conId')['intercept']
expected_excess.name = 'expected_excess_return'

expected_returns_df = pd.concat([results_enet.set_index('conId'), expected_excess], axis=1)

In [None]:
# For correlation thresholds 0.90, 0.92, 0.94, 0.96: prescreen the factors,
# prune by VIF, run the OLS regressions and keep (factors, results, seconds).
corrs = {}
for i in range(90,97,2):
    start = datetime.now()
    # prescreen_factors takes only the factor frame and returns
    # (frame, drop_map); unpack it before filling missing values.
    distilled_factors, _dropped_map = prescreen_factors(factors_df, correlation_threshold=i/100)
    distilled_factors = distilled_factors.fillna(0)

    # Repeatedly drop the factor with the highest VIF until every VIF is at or
    # below the threshold, or only two factors remain.
    max_vif_threshold = 2
    while True:
        vif_df = calculate_vif(distilled_factors)
        highest_vif = vif_df['VIF'].iloc[0]
        if highest_vif > max_vif_threshold and distilled_factors.shape[1] > 2:
            feature_to_drop = vif_df['feature'].iloc[0]
            distilled_factors = distilled_factors.drop(columns=[feature_to_drop])
        else:
            break

    results_df = run_regressions(distilled_factors)
    # total_seconds() so that runs longer than a day are not wrapped around.
    elapsed_seconds = int((datetime.now() - start).total_seconds())
    corrs[i] = (distilled_factors, results_df, elapsed_seconds)

In [None]:
import pandas as pd

# Summarise each threshold run as one row: run time, factor count, and the
# cross-asset mean of every regression diagnostic.
stats = []

# NOTE(review): `copy` is not assigned in the cells shown here; its values are
# unpacked exactly like the (factors, results, seconds) tuples stored in
# `corrs`, so it is presumably a snapshot of that dict -- confirm.
for k, v in copy.items():
    factors, results, time = v
    entry = {
        # Turns an integer key such as 90 into 0.90; only valid for two-digit keys.
        'k': float(f"0.{k}"),
        'time': time,
        'factor_count': factors.shape[-1],
        'r_squared': results.r_squared.mean(),
        'r_squared_adj': results.r_squared_adj.mean(),
        'f_statistic': results.f_statistic.mean(),
        'f_pvalue': results.f_pvalue.mean(),
        'aic': results.aic.mean(),
        'bic': results.bic.mean(),
        'condition_number': results.condition_number.mean(),
        'alpha': results.alpha.mean(),
        'alpha_pval': results.alpha_pval.mean(),
        'alpha_tval': results.alpha_tval.mean(),
    }

    # Per-factor mean p-value across assets, then summary statistics across factors.
    pval_cols = [col for col in results.columns if col.startswith('pval')]
    entry['beta_pvals_mean'] = results[pval_cols].mean().mean()
    entry['beta_pvals_std'] = results[pval_cols].mean().std()
    entry['beta_pvals_min'] = results[pval_cols].mean().min()
    entry['beta_pvals_median'] = results[pval_cols].mean().median()
    entry['beta_pvals_max'] = results[pval_cols].mean().max()

    # Same for t-values, using absolute values so opposite signs do not cancel.
    tval_cols = [col for col in results.columns if col.startswith('tval')]
    tvals_abs = results[tval_cols].abs().mean()
    entry['beta_tvals_mean'] = tvals_abs.mean()
    entry['beta_tvals_std'] = tvals_abs.std()
    entry['beta_tvals_min'] = tvals_abs.min()
    entry['beta_tvals_median'] = tvals_abs.median()
    entry['beta_tvals_max'] = tvals_abs.max()

    stats.append(entry)

df_stats = pd.DataFrame(stats).sort_values('k')

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid', context='notebook')

metrics_to_plot = [
    'time', 'factor_count', 'r_squared', 'r_squared_adj', 'f_statistic',
    'f_pvalue', 'aic', 'bic', 'condition_number', 'alpha', 'alpha_pval', 'alpha_tval',
    'beta_pvals_mean', 'beta_pvals_std', 'beta_pvals_min', 'beta_pvals_median', 'beta_pvals_max',
    'beta_tvals_mean', 'beta_tvals_std', 'beta_tvals_min', 'beta_tvals_median', 'beta_tvals_max'
]

# Create subplots: one panel per metric, three panels per row
ncols = 3
nrows = -(-len(metrics_to_plot) // ncols)  # Ceiling division
fig, axes = plt.subplots(nrows, ncols, figsize=(18, 5 * nrows))
axes = axes.flatten()

# zip stops at the shorter sequence, so any surplus axes stay empty.
for ax, metric in zip(axes, metrics_to_plot):
    sns.lineplot(data=df_stats, x='k', y=metric, marker='o', ax=ax)
    ax.set_title(metric)
    ax.set_xlabel('Correlation threshold (k)')
    ax.set_ylabel(metric)

plt.tight_layout()
plt.show()

In [None]:
'''
high r2 => large proportion of return variance explained by factors
low factor pval => high stat significance
beta == 1 => equal movement/sensitivity to benchmark
beta > 1 => higher sensitivity
beta < 1 => lower sensitivity
beta < 0 => inverse sensitivity
aic lower => better overall fit
bic lower => better overall fit
condition number high => indicates presence of multicollinearity
'''

In [None]:
# Annualise daily factor premia and the daily risk-free rate (252 trading days).
factor_premia = factors_df.mean() * 252
annual_risk_free_rate = risk_free_df['daily_nominal_rate'].mean() * 252

# Index by conId only when it is still a column. This keeps the cell re-runnable
# without a catch-all `except` that would also hide genuine errors.
if 'conId' in results_df.columns:
    results_df = results_df.set_index('conId')

# Only factors that actually have an estimated beta can contribute; factors
# removed before the regression have no 'beta_<factor>' column.
has_beta = np.array([f'beta_{factor}' in results_df.columns for factor in factor_premia.index], dtype=bool)
if not has_beta.all():
    print(f"Ignoring {int((~has_beta).sum())} factor(s) without a beta column in results_df")
factor_names = factor_premia.index[has_beta]

# NOTE(review): this cell reads 'alpha_pval', which run_regressions produces but
# run_elastic_net does not -- confirm results_df holds the OLS results here.

# Calculate expected returns for each ETF
expected_returns = {}
for symbol in results_df.index:
    # The regression was on excess returns, so the intercept (alpha) is already excess of Rf
    # E[Ri - Rf] = alpha + sum(beta * lambda) => E[Ri] = Rf + alpha + sum(beta * lambda)

    annual_alpha = results_df.loc[symbol, 'alpha'] * 252
    if results_df.loc[symbol, 'alpha_pval'] > 0.05:
        # If alpha is not statistically significant, it's safer to treat it as zero
        annual_alpha = 0

    # factor_premia is already annualized
    factor_component = 0
    for factor in factor_names:
        factor_component += results_df.loc[symbol, f'beta_{factor}'] * factor_premia[factor]

    expected_returns[symbol] = annual_risk_free_rate + annual_alpha + factor_component

# Convert to a pandas Series
expected_returns_series = pd.Series(expected_returns, name='expected_annual_return')

print(expected_returns_series.head())

In [None]:
# Manual plot series
# Contract id of the single series to inspect; pick the first matching row's frame.
con_id = 265109869.0
df = filtered[filtered['conId'] == con_id]['df'].iloc[0].copy()
display(filtered[filtered['conId'] == con_id])

# Optional (disabled): replace missing prices with 0 before plotting
# df[price_col] = df[price_col].fillna(0)

# Plot the price series with the y-axis starting at 0 and 10% headroom above the max.
plt.figure(figsize=(10, 6))
plt.plot(df.index, df[price_col], marker='o')
plt.ylim(0, df[price_col].max()*1.1)
plt.show()

## Old Brownian interpolation

In [None]:
# import numpy as np

# def brownian_bridge(t, t0, t1, x0, x1, sigma):
#     """Generate points using Brownian bridge between (t0, x0) and (t1, x1)."""
#     dt = t1 - t0
#     mu = x0 + (x1 - x0) * (t - t0) / dt  # Linear interpolation for mean
#     variance = sigma**2 * (t1 - t) * (t - t0) / dt
#     return mu + np.random.normal(0, np.sqrt(variance))

# # Example
# t = np.linspace(0, 10, 11)  # Original time points
# prices = np.random.normal(100, 5, len(t))  # Simulated price series
# sigma = np.std(prices)  # Variance of the series

# # Interpolate to finer grid
# t_new = np.linspace(0, 10, 21)  # New time points
# prices_new = np.zeros(len(t_new))

# # Copy original points and interpolate gaps
# for i in range(len(t) - 1):
#     idx = np.where((t_new >= t[i]) & (t_new <= t[i+1]))[0]
#     for j in idx:
#         prices_new[j] = brownian_bridge(t_new[j], t[i], t[i+1], prices[i], prices[i+1], sigma)

# # Verify variance
# print("Original variance:", np.var(prices))
# print("Interpolated variance:", np.var(prices_new))

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt
# from scipy.interpolate import CubicSpline

# def brownian_bridge(t, t0, t1, x0, x1, sigma):
#     """Generate points using Brownian bridge between (t0, x0) and (t1, x1)."""
#     dt = t1 - t0
#     mu = x0 + (x1 - x0) * (t - t0) / dt  # Linear interpolation for mean
#     variance = sigma**2 * (t1 - t) * (t - t0) / dt
#     return mu + np.random.normal(0, np.sqrt(variance))

# # Generate original price series
# t = np.linspace(0, 10, 11)  # Original time points
# prices = np.random.normal(100, 5, len(t))  # Simulated price series
# sigma = np.std(prices)  # Standard deviation for Brownian bridge

# # Interpolate to finer grid
# t_new = np.linspace(0, 10, 21)  # New time points
# prices_new = np.zeros(len(t_new))  # Brownian bridge interpolation
# prices_lin = np.zeros(len(t_new))  # Linear interpolation
# prices_spl = np.zeros(len(t_new))  # Spline interpolation

# # Brownian bridge interpolation
# for i in range(len(t) - 1):
#     idx = np.where((t_new >= t[i]) & (t_new <= t[i+1]))[0]
#     for j in idx:
#         prices_new[j] = brownian_bridge(t_new[j], t[i], t[i+1], prices[i], prices[i+1], sigma)

# # Linear interpolation
# prices_lin = np.interp(t_new, t, prices)

# # Spline interpolation
# spline = CubicSpline(t, prices)
# prices_spl = spline(t_new)

# # Verify variances
# print("Original variance:", np.var(prices))
# print("Brownian bridge variance:", np.var(prices_new))
# print("Linear interpolation variance:", np.var(prices_lin))
# print("Spline interpolation variance:", np.var(prices_spl))

# # Plotting
# plt.figure(figsize=(10, 6))
# plt.plot(t, prices, 'o-', label='Original Prices', markersize=8)
# plt.plot(t_new, prices_new, 'x-', label='Brownian Bridge Interpolation')
# plt.plot(t_new, prices_lin, 's-', label='Linear Interpolation')
# plt.plot(t_new, prices_spl, 'd-', label='Spline Interpolation')
# plt.title('Price Series Interpolation Comparison')
# plt.xlabel('Time')
# plt.ylabel('Price')
# plt.grid(True)
# plt.legend()

# plt.show()

In [None]:
# # Graph correlations
# import seaborn as sns
# import matplotlib.pyplot as plt

# numerical_cols = [col for col in filtered.columns if filtered[col].dtype in [np.int64, np.float64] and col not in cols_to_exclude]

# # drop columns with missing values
# corr_df = filtered[numerical_cols].corr()
# # corr_df.dropna(axis=1, how='all', inplace=True)
# # corr_df.dropna(axis=0, how='all', inplace=True)

# plt.figure(figsize=(50, 50))
# sns.heatmap(corr_df, cmap='coolwarm')
# plt.show()

---
### Prep historical data
---

In [None]:
# NOTE(review): bare name with no assignment in the cells shown here; presumably
# left in on purpose so "Run All" halts with a NameError before the cells below.
stop
# # Test day gap
# dfs = {}
# for file in os.listdir(data_path):
#     symbol = os.path.splitext(file)[0].split('-')[0]
#     if symbol in verified_files:
#         dfs[symbol] = pd.read_csv(data_path + file)

# days, nums, lens, firsts = [], [], [], []
# for day in range(5,30):
#     days.append(day)

#     melted_dfs = []
#     expected_returns = {}
#     for symbol, df in tqdm(dfs.items(), total=len(dfs), desc=f'{day}'):
#         df = melt(df)
#         df['date'] = pd.to_datetime(df['date'])

#         latest_date = df['date'].iloc[-1]
#         earliest_date = df['date'].iloc[0]
#         length_required = pd.to_datetime('2020-02-01')
#         month_ago = datetime.today() - timedelta(days=30)

#         dates = df['date'].unique()
#         date_diffs = dates[1:] - dates[:-1]
        
#         if latest_date >= month_ago and earliest_date <= length_required and not (date_diffs > pd.Timedelta(days=day)).any():
#             df['symbol'] = symbol
#             df['pct_change'] = df['value'].pct_change()
#             expected_returns[symbol] = df['pct_change'].mean()
#             melted_dfs.append(df)
#     # print(f'Loaded {len(melted_dfs)} out of {len(file_list)} series ({round(len(melted_dfs)/len(file_list)*100, 4)}%)')

#     # Concatenate and pivot data
#     returns_df = pd.concat(melted_dfs, ignore_index=True)
#     returns_df = returns_df.pivot(index=['date', 'kind'], columns='symbol', values='pct_change')
#     returns_df = returns_df.sort_values(by=['date', 'kind'], ascending=[True, False]).reset_index().dropna()
#     lens.append(len(returns_df))
#     nums.append(len(returns_df.columns))
#     firsts.append(returns_df.date.iloc[0])

# gap_data_df = pd.DataFrame({
#     'day_gap': days,
#     'num_etfs': nums,
#     'period_length': lens,
#     'first_day':firsts})

# gap_data_df

In [None]:
# Load and prepare historical training data
# def melt(data_df, value_columns=None):
#     if not value_columns:
#         value_columns = ['open', 'close']
#     id_columns = [col for col in data_df.columns.to_list() if col not in value_columns]
#     melted_df = data_df.melt(id_vars=id_columns, value_vars=value_columns, var_name='kind', value_name='value')
#     return melted_df.sort_values(by=['date'], ascending=[True, False]).reset_index(drop=True)

# Load historical data and merge them all into one df
# Read every verified series into a dict keyed by symbol (the part of the file
# name before the first '-').
dfs = {}
file_list = os.listdir(data_path)
for file in file_list:
    symbol = os.path.splitext(file)[0].split('-')[0]
    if symbol in verified_files:
        dfs[symbol] = pd.read_csv(data_path + file)


# Filter series and calc pct_change. ASSUMES that dfs are sorted chronologically
training_start_date = pd.to_datetime('2020-02-01')
month_ago = datetime.today() - timedelta(days=31)

day_gap = 6 # SET ACCEPTABLE DAY GAP

melted_dfs, expected_returns = [], {}
for symbol, df in tqdm(dfs.items(), total=len(dfs), desc=f'Filtering {kind} dfs'):
    df['date'] = pd.to_datetime(df['date'])

    latest_date = df['date'].iloc[-1]
    earliest_date = df['date'].iloc[0]
    dates = df['date'].unique()
    date_gaps = dates[1:] - dates[:-1]

    # Indices are always kept. Other series must reach back to the training
    # start, be updated within the last month, and have no gap above day_gap.
    if (kind == 'indices') or (latest_date >= month_ago and earliest_date <= training_start_date and (date_gaps <= pd.Timedelta(days=day_gap)).all()):
        df['symbol'] = symbol
        # Use the configured price column: 'average' exists for trades/indices,
        # while midpoint data is configured to use 'close'.
        df['pct_change'] = df[price_col].pct_change()
        expected_returns[symbol] = df['pct_change'].mean()
        melted_dfs.append(df)
print(f'Loaded {len(melted_dfs)} out of {len(file_list)} series ({round(len(melted_dfs)/len(file_list)*100, 4)}%)')

# Concatenate and pivot data: one row per date, one return column per symbol
returns_df = pd.concat(melted_dfs, ignore_index=True)
returns_df = returns_df.pivot(index=['date'], columns='symbol', values='pct_change')
returns_df = returns_df.sort_values(by=['date'], ascending=[True]).reset_index()

# Define training boundaries: everything up to one year ago, complete rows only
training_cutoff_date = datetime.today() - timedelta(days=365)
training_df = returns_df[returns_df['date'] <= training_cutoff_date]
training_matrix = training_df.drop(['date'], axis=1).dropna().copy()

In [None]:
# Calculate risk-free-rate for training window
# Nominal risk-free rate: mean 10-year Treasury yield (percent -> fraction) over
# the year before the training cutoff. Select the column so the mean is a
# scalar rather than a one-element Series.
treasury_rate = web.DataReader('DGS10', 'fred', training_cutoff_date-timedelta(days=365), training_cutoff_date)
nominal_rf_rate = treasury_rate['DGS10'].mean() / 100

# The FRED API key is a credential: read it from the environment instead of
# storing it in the notebook.
fred_api_key = os.environ.get('FRED_API_KEY')
if not fred_api_key:
    raise RuntimeError("Set the FRED_API_KEY environment variable to query FRED.")
fred = Fred(api_key=fred_api_key)
cpi_data = fred.get_series('CPIAUCSL', training_cutoff_date-timedelta(days=365), training_cutoff_date) # CPI
# Inflation over the window = relative change between first and last CPI reading.
inflation_rate = (cpi_data.iloc[-1] - cpi_data.iloc[0]) / cpi_data.iloc[0]

# Fisher relation: real rate from nominal rate and inflation.
real_rf_rate = (1 + nominal_rf_rate) / (1 + inflation_rate) - 1

In [None]:
# Calculate corr and cov for historical training data
# Build symmetric symbol-by-symbol matrices from dcor.distance_stats: the
# `correlation_xy` and `covariance_xy` fields of each pair of return columns.
training_array = training_matrix.values # Convert training matrix to numpy array
symbol_list = training_matrix.columns.tolist()
num_symbols = len(symbol_list)
corr_matrix = np.zeros((num_symbols, num_symbols)) # Pre-allocate numpy array for correlation
cov_matrix = np.zeros((num_symbols, num_symbols))  # Pre-allocate numpy array for covariance

for i, sym_i in tqdm(enumerate(symbol_list), total=num_symbols, desc=f"Calculating distance stats sqr"):
    for j, sym_j in enumerate(symbol_list):
        if i <= j:  # Compute only for upper triangle (including diagonal)
            stats = dcor.distance_stats(training_array[:, i], training_array[:, j])
            corr_value = stats.correlation_xy
            cov_value = stats.covariance_xy

            corr_matrix[i, j] = corr_value
            corr_matrix[j, i] = corr_value  # Fill symmetric value

            cov_matrix[i, j] = cov_value
            cov_matrix[j, i] = cov_value  # Fill symmetric value

corr_df = pd.DataFrame(corr_matrix, index=symbol_list, columns=symbol_list) # Convert numpy array back to df for output
cov_df = pd.DataFrame(cov_matrix, index=symbol_list, columns=symbol_list)   # Convert numpy array back to df for output

# index=False: only the column header carries the symbols; row labels are not written.
corr_df.to_csv(f'{root}corr_df.csv', index=False)
cov_df.to_csv(f'{root}cov_df.csv', index=False)

---
### Compute ETF combinations based on optimal k_clusters
---

In [None]:
# Load corr and cov
# read_csv without index_col gives the frames a default integer row index,
# while the columns still carry the symbols.
corr_df = pd.read_csv(f'{root}corr_df.csv')
cov_df = pd.read_csv(f'{root}cov_df.csv')
symbol_list = corr_df.columns

# Pair each column label with the row label at the same position, then relabel
# the columns so rows and columns share the same labels.
symbol2index = dict(zip(corr_df.columns, corr_df.index))
index2symbol = dict(zip(corr_df.index, corr_df.columns))
corr_df.rename(columns=symbol2index, inplace=True)
cov_df.rename(columns=symbol2index, inplace=True)

# Distance = 1 - correlation, with the diagonal forced to exactly 0.
distance_matrix = (1 - corr_df).to_numpy()
np.fill_diagonal(distance_matrix, 0)

In [None]:
# Thresholds / cluster_num graphs
# Full list of linkage methods, immediately overridden: only 'ward' is plotted.
methods = ['single', 'ward', 'average', 'complete', 'weighted', 'centroid', 'median']
methods = ['ward']
for method in methods:
    # squareform converts the square distance matrix to the condensed form linkage expects.
    linked = sch.linkage(squareform(distance_matrix), method=method)
    
    # One x value per merge, counting down from len(corr_df) to 2;
    # column 2 of the linkage matrix is the distance at which each merge happened.
    num_clusters = range(len(corr_df), 1, -1)
    thresholds = linked[:, 2]

    # inertias = []
    # for n_clusters in num_clusters:
    #     cluster_labels = fcluster(linked, t=n_clusters, criterion='maxclust')
    #     inertia = 0
    #     for cluster in np.unique(cluster_labels):
    #         members = distance_matrix.values[cluster_labels == cluster]
    #         centroid = members.mean(axis=0)  # Cluster centroid
    #         inertia += np.sum((members - centroid) ** 2)
    #     inertias.append(inertia)

    # plt.figure(figsize=(12, 6))
    # plt.plot(num_clusters, inertias, marker='o', label=f"Method {method}")
    # plt.title(f"Inertia/Num ({method})")
    # plt.xlabel('Number of Clusters')
    # plt.ylabel('Inertia (Sum of Squared Distances)')
    # plt.grid(True)
    # plt.legend()
    # plt.show()

    plt.figure(figsize=(12, 6))
    plt.plot(num_clusters, thresholds, marker='o')
    plt.title(f"Threshold/Num ({method})")
    plt.xlabel('Number of Clusters')
    plt.ylabel('Threshold (Distance)')
    plt.grid(True)
    plt.show()


In [None]:
# Silhouettes and dendrograms
def product(row):
    """Return the product of all values in the mapping ``row`` (1 when it is empty)."""
    return math.prod(row.values())

# For k = 2 .. min(n, 9) - 1: cluster with Ward linkage, score the labelling
# with the silhouette coefficient and record how many members each cluster has.
ks = []
scores = []
counts = []
for k in range(2, min(len(distance_matrix), 9)):
    # NOTE(review): no metric/affinity is passed, so the rows of distance_matrix
    # are presumably treated as feature vectors here, whereas silhouette_score
    # below reads the same matrix as precomputed distances -- confirm intended.
    clusters = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(distance_matrix)
    score = silhouette_score(distance_matrix, clusters, metric='precomputed')
    ks.append(k)
    scores.append(score)
    unique_clusters, label_counts = np.unique(clusters, return_counts=True)
    label_counts_dict = dict(zip(unique_clusters, label_counts))
    counts.append(label_counts_dict)

silhouettes = pd.DataFrame({
    'k': ks,
    'score': scores,
    'counts': counts
})
# Product of the cluster sizes = number of ways to pick one member from each cluster.
silhouettes['combitions'] = silhouettes['counts'].apply(product)
# Best k = the one with the highest silhouette score.
silhouettes = silhouettes.sort_values(by='score', ascending=False)
best_k = silhouettes.k.iloc[0]

# best_k = 3

display(silhouettes)
# Full list of linkage methods, immediately overridden: only 'ward' is drawn.
methods = ['single', 'ward', 'average', 'complete', 'weighted', 'centroid', 'median']
methods = ['ward']
for method in methods:
    # Now compute the linkage using a condensed distance matrix
    linked = sch.linkage(squareform(distance_matrix), method=method)
    plt.figure(figsize=(20, 10))
    sch.dendrogram(linked, labels=corr_df.index, leaf_rotation=90)
    plt.title(f"Method {method}")
    plt.show()

---
### Calculate Minimum Variance Portfolios
---

In [None]:
# Portfolio Optimization Functions
def portfolio_variance(weights, cov_matrix):
    """Return the quadratic form w' C w for weight vector ``weights`` and matrix ``cov_matrix``."""
    weighted_rows = weights.T @ cov_matrix
    return weighted_rows @ weights

def portfolio_expected_return(weights, expected_returns_arr):
    """Return the weighted sum of ``expected_returns_arr`` under ``weights``."""
    blended_return = weights @ expected_returns_arr
    return blended_return

def minimize_portfolio_variance(cov_matrix, expected_returns_arr):
    """Find the long-only, fully invested portfolio with the smallest variance.

    Runs SLSQP from equal weights, with every weight bounded to [0, 1] and the
    weights constrained to sum to 1.

    Returns ``(weights, std, expected_return)`` on success and
    ``(nan, nan, nan)`` when the optimiser does not report success.
    """
    num_assets = len(cov_matrix)
    start_weights = np.array([1/num_assets] * num_assets)
    weight_bounds = ((0, 1),) * num_assets
    fully_invested = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}

    outcome = minimize(portfolio_variance,
                       start_weights,
                       args=(cov_matrix,),
                       method='SLSQP',
                       bounds=weight_bounds,
                       constraints=fully_invested)

    if not outcome.success:
        return (np.nan, np.nan, np.nan)

    best_weights = outcome.x
    # The objective value at the optimum is the portfolio variance itself.
    port_std = np.sqrt(outcome.fun)
    port_er = portfolio_expected_return(best_weights, expected_returns_arr)
    return (best_weights, port_std, port_er)

In [None]:
# Portfolio Optimization Functions
def compute_distance_sum(combination, distance_matrix):
    """Sum ``distance_matrix[i, j]`` over every unordered pair (i, j) of ``combination``."""
    member_pairs = itertools.combinations(combination, 2)
    return sum(distance_matrix[first, second] for first, second in member_pairs)

def portfolio_variance(weights, cov_matrix):
    """Return the quadratic form w' C w for weight vector ``weights`` and matrix ``cov_matrix``."""
    weighted_rows = weights.T @ cov_matrix
    return weighted_rows @ weights

def portfolio_expected_return(weights, expected_returns_arr):
    """Return the weighted sum of ``expected_returns_arr`` under ``weights``."""
    blended_return = weights @ expected_returns_arr
    return blended_return

def sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate):
    """Return (portfolio expected return - ``risk_free_rate``) / portfolio standard deviation."""
    excess_return = portfolio_expected_return(weights, expected_returns_arr) - risk_free_rate
    volatility = np.sqrt(portfolio_variance(weights, cov_matrix))
    return excess_return / volatility

def negative_sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate):
    """Negated Sharpe ratio, so that a minimiser maximises the Sharpe ratio."""
    ratio = sharpe_ratio(weights, expected_returns_arr, cov_matrix, risk_free_rate)
    return -ratio

def find_tangency_portfolio(cov_matrix, expected_returns_arr, risk_free_rate):
    """Find the long-only, fully invested portfolio with the highest Sharpe ratio.

    Runs SLSQP on the negated Sharpe ratio from equal weights, with every
    weight bounded to [0, 1] and the weights constrained to sum to 1.

    Returns ``(weights, std, expected_return, sharpe)`` on success and
    ``(nan, nan, nan, nan)`` when the optimiser does not report success.
    """
    num_assets = len(cov_matrix)
    start_weights = np.array([1/num_assets] * num_assets)
    weight_bounds = ((0, 1),) * num_assets
    fully_invested = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}

    outcome = minimize(negative_sharpe_ratio,
                       start_weights,
                       args=(expected_returns_arr, cov_matrix, risk_free_rate),
                       method='SLSQP',
                       bounds=weight_bounds,
                       constraints=fully_invested)

    if not outcome.success:
        return (np.nan, np.nan, np.nan, np.nan)

    best_weights = outcome.x
    std = np.sqrt(portfolio_variance(best_weights, cov_matrix))
    er = portfolio_expected_return(best_weights, expected_returns_arr)
    # The objective is the negated Sharpe ratio; flip the sign back.
    best_sharpe = -(outcome.fun)
    return (best_weights, std, er, best_sharpe)
    
def sort_top_combinations(array, sort_index):
    """Reorder the non-NaN rows of ``array`` by column ``sort_index``, largest first.

    Rows whose ``sort_index`` value is NaN stay where they are; the remaining
    rows are rearranged among the positions they already occupy. ``array`` is
    modified in place and also returned.
    """
    keep_mask = np.logical_not(np.isnan(array[:, sort_index]))
    ranked_rows = array[keep_mask]
    if ranked_rows.size == 0:
        return array

    # Ascending argsort reversed gives descending order.
    descending_order = np.argsort(ranked_rows[:, sort_index])[::-1]
    array[keep_mask] = ranked_rows[descending_order]
    return array

In [None]:
num_symbols = len(corr_df.index)
# Row layout: [asset indices (best_k) | weights (best_k) | std | er | sharpe | distance sum]
num_metrics = best_k*2 + 4
num_combinations = math.comb(num_symbols, best_k)
combination_array = np.empty((num_combinations, num_metrics), dtype='float32')

weights_end = best_k * 2
sharpe_col = weights_end + 2
index_indicator = best_k + best_k + 3  # column holding the distance sum

# Compute the tangency portfolio and distance sum for every combination
for i, combination in tqdm(enumerate(itertools.combinations(range(0,num_symbols), best_k)), total=num_combinations, desc="Calculating distance sums"):
    combination_array[i, :best_k] = combination
    combination_cov_df = cov_df.loc[combination, combination]
    combination_expected_returns = np.array([expected_returns[index2symbol[index]] for index in combination])

    # find_tangency_portfolio returns (weights, std, er, sharpe) where weights is
    # an array (or a scalar NaN on failure). That tuple cannot be assigned to the
    # slice in one go, so unpack it and write the pieces separately.
    weights, std, er, sharpe = find_tangency_portfolio(combination_cov_df, combination_expected_returns, real_rf_rate)
    combination_array[i, best_k:weights_end] = weights
    combination_array[i, weights_end:index_indicator] = [std, er, sharpe]
    combination_array[i, index_indicator] = compute_distance_sum(combination, distance_matrix)

    # population growth rate

# Rank by Sharpe ratio, best first. Sorting the negated column keeps the sort
# stable and leaves NaN rows (failed optimisations) at the end.
sorted_indices = np.argsort(-combination_array[:, sharpe_col], kind='mergesort')
combination_array = combination_array[sorted_indices]
del sorted_indices
combination_array, len(combination_array)

In [None]:
num_symbols = len(corr_df.index)
# Row layout: [asset indices (best_k) | weights (best_k) | std | er | sharpe | rating]
num_metrics = best_k*2 + 4
num_combinations_possible = math.comb(num_symbols, best_k)

top_n = 5000 # Define how many top combinations to keep

# NaN marks a slot that has not been filled yet
top_combinations_array = np.full((top_n, num_metrics), np.nan, dtype='float32')
rows_filled = 0


for combination_tuple in tqdm(itertools.combinations(range(0,num_symbols), best_k), total=num_combinations_possible, desc="Calculating Tangency Portfolios"):
    combination_cov_df = cov_df.loc[combination_tuple, combination_tuple]
    combination_expected_returns = np.array([expected_returns[index2symbol[index]] for index in combination_tuple])
    weights, std, er, sharpe = find_tangency_portfolio(combination_cov_df, combination_expected_returns, real_rf_rate)
    rating = sharpe * compute_distance_sum(combination_tuple, distance_matrix)

    # A failed optimisation yields a NaN rating. Storing it would pin a NaN row in
    # the array: sort_top_combinations leaves NaN rows where they are, and any
    # later `rating > NaN` threshold test is always False. Skip it instead.
    if np.isnan(rating):
        continue

    if rows_filled < top_n:
        # Still filling the buffer: take the next free slot.
        target_row = rows_filled
        rows_filled += 1
    elif rating > top_combinations_array[-1, -1]:
        # Buffer is full and sorted descending: replace the current worst row.
        target_row = top_n - 1
    else:
        continue

    top_combinations_array[target_row, :best_k] = combination_tuple
    top_combinations_array[target_row, best_k:best_k*2] = weights
    top_combinations_array[target_row, best_k*2: num_metrics] = [std, er, sharpe, rating]

    # Once the buffer is full, keep it sorted so the last row is always the worst.
    if rows_filled == top_n:
        top_combinations_array = sort_top_combinations(top_combinations_array, -1)

# With fewer than top_n valid combinations the buffer never filled up, so it has
# not been sorted yet; order the rows that were written.
if rows_filled < top_n:
    top_combinations_array = sort_top_combinations(top_combinations_array, -1)


In [None]:
# Remove NaN rows before further processing (a NaN first weight marks an unfilled or failed row)
top_combinations_array_cleaned = top_combinations_array[~np.isnan(top_combinations_array[:, best_k])]

# Row layout: [asset indices (best_k) | weights (best_k) | std | er | sharpe | rating]
weights_end = best_k + best_k

# Rows are selected and ordered by the last column (rating), not by the Sharpe ratio
print("Top", top_n, "Combinations by Rating:")
for row in top_combinations_array_cleaned:
    # Indices were stored as float32; cast back to int for the symbol lookup
    combination_indices = row[:best_k].astype(int)
    asset_symbols = [index2symbol[index] for index in combination_indices]
    weights = row[best_k:weights_end]
    std, er, sharpe = row[weights_end], row[weights_end + 1], row[weights_end + 2]
    rating = row[-1]
    print(f"Assets: {asset_symbols}, Weights: {weights}, Std Dev: {std:.4f}, Expected Return: {er:.4f}, Sharpe Ratio: {sharpe:.4f}, Rating: {rating:.4f}")