In [None]:
import os
from ib_async import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from tqdm.auto import tqdm
from scipy.optimize import minimize
import pandas_datareader.data as web
import re
import ast


# Notebook-wide pandas display options: never truncate cell contents or hide
# columns, and cap printed frames at 50 rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

In [None]:
# Prep functions
def evaluate_literal(val):
    """Decode a CSV cell that may contain a serialized Python literal.

    Args:
        val: Any cell value. Only strings are parsed; every other type is
            returned unchanged.

    Returns:
        The object produced by ``ast.literal_eval`` when ``val`` is a string
        holding a valid literal (list, dict, number, ...), otherwise ``val``
        itself.
    """
    # Non-strings (floats, NaN, ints) cannot hold a serialized literal, so
    # skip the parse attempt entirely.
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, RecursionError):
        # TypeError: syntactically valid but unbuildable literals such as a
        # dict with a list key. RecursionError: pathologically nested input.
        # In every failure case the raw text is kept.
        return val
    
def load(path):
    """Read a CSV file and decode every cell with ``evaluate_literal``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame whose columns have each been passed element-wise through
        ``evaluate_literal``.
    """
    frame = pd.read_csv(path)
    for column in frame.columns:
        decoded = frame[column].apply(evaluate_literal)
        frame[column] = decoded
    return frame

In [None]:
# Data-set selector: uncomment exactly one of the supported kinds.
# kind = 'midpoint'
kind = 'trades'
# kind = 'indices'

# Root folder of each supported data set.
roots_by_kind = {
    'midpoint': 'data/daily-midpoint/',
    'trades': 'data/daily-trades/',
    'indices': 'data/indices/',
}
if kind not in roots_by_kind:
    # Fail loudly instead of leaving `root` unbound, or silently reusing a
    # stale value from an earlier run of this cell.
    raise ValueError(f"Unknown kind {kind!r}; expected one of {sorted(roots_by_kind)}")
root = roots_by_kind[kind]

data_path = root + 'series/'
verified_path = root + 'verified_files.csv'

# Trades and indices files expose an 'average' price; midpoint files use 'close'.
price_col = 'average' if kind in ('trades', 'indices') else 'close'

In [None]:
# Verify files
# Each series file is named '<symbol>-<exchange>-<currency>.csv'. A file is
# "verified" when the fundamentals table has a row for its symbol/currency,
# the broker reports the same ISIN, and the instrument name does not look
# leveraged. The result is cached in `verified_path`.
fund_df = load('data/fundamentals.csv')

try:
    # Reuse the cached verification result when it exists.
    verified_df = pd.read_csv(verified_path)
except FileNotFoundError:
    util.startLoop()
    ib = IB()
    ib.connect('127.0.0.1', 7497, clientId=2)

    verified_files = []
    try:
        file_list = os.listdir(data_path)

        for file_name in tqdm(file_list, total=len(file_list), desc="Verifying files"):
            if not file_name.endswith('.csv'):
                continue
            try:
                # Raises ValueError (handled below) unless there are exactly three parts.
                symbol, exchange, currency = file_name.replace('.csv', '').split('-')
                symbol_data = fund_df[(fund_df['symbol'] == symbol) & (fund_df['currency'] == currency)]
                if symbol_data.empty:
                    continue

                contract_details = ib.reqContractDetails(Stock(symbol, exchange, currency))
                if not contract_details:
                    continue
                # NOTE(review): assumes the first secIdList entry is the ISIN — confirm.
                isin = contract_details[0].secIdList[0].value

                if symbol_data['isin'].iloc[0] != isin:
                    continue

                # Leverage heuristic on the long name: a word like '2X'/'3X'
                # (multiplier > 1) or a word starting with 'lv'/'lev'.
                instrument_name = symbol_data['longName'].iloc[0].replace('-', '').replace('+', '')
                leveraged = any(
                    (re.fullmatch(r'\d+X', word) and int(word[:-1]) > 1) or word.lower().startswith(('lv', 'lev'))
                    for word in instrument_name.split()
                )
                if leveraged:
                    continue

                verified_files.append({'symbol': symbol, 'currency': currency})
            except ValueError as e:
                print(f"Invalid filename format {file_name}: {e}")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    finally:
        # Always release the API connection, even if the scan aborts.
        ib.disconnect()

    # Explicit columns keep the frame usable downstream when nothing verified.
    verified_df = pd.DataFrame(verified_files, columns=['symbol', 'currency'])
    verified_df.to_csv(verified_path, index=False)

### Merge historical series with fundamentals

In [None]:
def ensure_series_types(df, price_col):
    """Return a date-sorted copy of ``df`` with typed date and numeric columns.

    Args:
        df: Raw series frame with at least 'date', 'volume' and ``price_col``.
        price_col: Name of the price column to coerce to numeric.

    Returns:
        A new DataFrame sorted by 'date' with a fresh 0..n-1 index. 'date' is
        datetime; 'volume' and ``price_col`` are numeric, with unparseable
        entries replaced by NaN. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    typed = df.copy()
    typed['date'] = pd.to_datetime(typed['date'])
    typed = typed.sort_values('date').reset_index(drop=True)
    for col in ['volume', price_col]:
        typed[col] = pd.to_numeric(typed[col], errors='coerce')
    return typed

def validate_raw_prices(df, price_col):
    """Drop rows whose price data is locally impossible.

    A row is removed when its ``price_col`` value is zero or negative, or,
    if both 'low' and 'high' columns exist, when low exceeds high.

    Args:
        df: Series frame to validate.
        price_col: Name of the price column.

    Returns:
        A copy of ``df`` without the offending rows (original index kept).
    """
    bad_rows = df[price_col] <= 0
    if {'low', 'high'}.issubset(df.columns):
        bad_rows = bad_rows | (df['low'] > df['high'])
    return df.loc[~bad_rows].copy()

def handle_stale_periods(df, price_col, max_stale_days=5):
    """Thin out long runs of an unchanged price.

    Consecutive rows with an identical ``price_col`` value form a run. For
    runs longer than ``max_stale_days`` rows, only the first and last row of
    the run are kept; shorter runs are left intact.

    Args:
        df: Series frame, already in chronological order.
        price_col: Name of the price column.
        max_stale_days: Longest run (in rows) that is kept in full.

    Returns:
        ``df`` itself when it is empty, otherwise a filtered copy.
    """
    # A new run starts wherever the price differs from the previous row
    # (the first row and NaN differences also count as a change).
    run_ids = df[price_col].diff().ne(0).cumsum()
    if run_ids.empty:
        return df

    run_lengths = df.groupby(run_ids)[price_col].transform('size')

    # Run ids are contiguous, so a row is interior to its run exactly when
    # both neighbours carry the same id.
    same_as_previous = run_ids.eq(run_ids.shift(1))
    same_as_next = run_ids.eq(run_ids.shift(-1))
    interior = same_as_previous & same_as_next

    drop = interior & run_lengths.gt(max_stale_days)
    return df.loc[~drop].copy()

In [None]:
# Load historical series
# Builds `meta` (one row per verified series file, with the cleaned frame in
# the 'df' column) and `copied`, a pristine snapshot. Also tracks the
# earliest (`first_date`) and most recent (`latest`) date seen.
if 'copied' not in globals() or input('reload csvs? (y/n)').lower() == 'y':
    # Starting bounds: `latest` only moves forward, `first_date` only backward.
    latest = (datetime.now() - timedelta(days=365 * 6))
    first_date = (datetime.now())
    meta = []
    file_list = os.listdir(data_path)
    for file in tqdm(file_list, total=len(file_list)):
        if not file.endswith('.csv'):
            continue

        parts = os.path.splitext(file)[0].split('-')
        if len(parts) < 3:
            # A name without symbol-exchange-currency would raise IndexError
            # below and abort the whole load; report and skip it instead.
            print(f"Invalid filename format {file}")
            continue
        symbol, exchange, currency = parts[0], parts[1], parts[2]
        if not ((verified_df['symbol'] == symbol) & (verified_df['currency'] == currency)).any():
            continue

        # Load and clean raw series
        try:
            df = load(data_path + file)
            df = ensure_series_types(df, price_col)
            df = validate_raw_prices(df, price_col)
            df = handle_stale_periods(df, price_col)
            # df = adjust_for_splits(df, price_col)

            df['pct_change'] = df[price_col].pct_change()
            if df['date'].max() > latest:
                latest = df['date'].max()
            if df['date'].min() < first_date:
                first_date = df['date'].min()

            meta.append({
                'symbol': symbol,
                'currency': currency,
                'exchange_api': exchange,
                'df': df[['date', price_col, 'volume', 'pct_change']],
            })
        except Exception as e:
            print(f"ERROR {file}: {e}")

    # Explicit columns keep the 'df' column present when no file was loaded.
    meta = pd.DataFrame(meta, columns=['symbol', 'currency', 'exchange_api', 'df'])
    # Snapshot with independent per-series frames, used by the RESET cell.
    copied = meta.copy()
    copied['df'] = copied['df'].apply(lambda x: x.copy())

In [None]:
# RESET
# Rebuild `meta` from the `copied` snapshot. Each per-series frame is copied
# individually so edits to `meta` cannot reach the frames held by `copied`.
meta = copied.copy()
meta['df'] = copied['df'].apply(lambda x: x.copy())

In [None]:
def _is_local_price_outlier(price, neighbours, limit):
    """Tell whether ``price`` lies more than ``limit`` sample standard deviations from ``neighbours``."""
    local_std = neighbours.std()
    if local_std == 0:
        return False
    # A NaN mean/std (empty or single-value window) or a NaN price gives a
    # NaN score, which compares False: nothing is flagged.
    return abs(price - neighbours.mean()) / local_std > limit


def detect_and_nullify_global_outliers(meta_df, price_col, z_threshold=120.0, window=5):
    """Blank out price points that cause globally extreme returns.

    Returns from every series are pooled (NaN, infinite and zero returns
    excluded) to obtain a global median and median absolute deviation (MAD).
    A row whose return deviates from the median by more than ``z_threshold``
    MADs is a candidate. For each candidate two prices are examined:

    * the price just before the jump, against up to ``window`` prices preceding it;
    * the price at the jump, against up to ``window`` prices following it.

    A price more than ``z_threshold / 10`` local standard deviations away from
    its comparison window has its price, volume, high, low and pct_change
    cells (those that exist) set to NaN. 'pct_change' is then recomputed
    without filling.

    Args:
        meta_df: Frame with a 'symbol' column and a 'df' column holding one
            series frame per row. Modified in place: affected rows get a new,
            re-indexed frame in 'df'.
        price_col: Name of the price column in the series frames.
        z_threshold: Cut-off for the global modified z-score of returns.
        window: Number of neighbouring prices used for the local check.

    Returns:
        None.
    """
    if meta_df.empty:
        # Nothing to pool; pd.concat would raise on an empty list.
        return

    all_pct_changes = pd.concat(
        [row['df']['pct_change'] for _, row in meta_df.iterrows()],
        ignore_index=True
    ).dropna()
    all_pct_changes = all_pct_changes[~np.isinf(all_pct_changes) & (all_pct_changes != 0)]

    global_median_return = all_pct_changes.median()
    global_mad = (all_pct_changes - global_median_return).abs().median()
    local_limit = z_threshold / 10

    # Per-symbol summary of flagged z-scores; kept for inspection/debugging.
    outlier_series = {}
    for idx, row in meta_df.iterrows():
        # Positional 0..n-1 index so "previous/next row" is plain arithmetic.
        df = row['df'].reset_index(drop=True)
        if df['pct_change'].isnull().all():
            continue
        cols_to_null = [c for c in (price_col, 'volume', 'high', 'low', 'pct_change') if c in df.columns]

        absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
        outlier_mask = absolute_modified_z > z_threshold
        if not outlier_mask.any():
            continue

        data_dict = absolute_modified_z[outlier_mask].describe()
        last_row = df.index.max()

        for df_idx in df.index[outlier_mask]:
            # Price before the jump versus the prices that precede it.
            previous_idx = df_idx - 1
            if previous_idx >= 0:
                window_start = max(0, previous_idx - window)
                preceding = df.loc[window_start: previous_idx - 1, price_col].dropna()
                if _is_local_price_outlier(df.loc[previous_idx, price_col], preceding, local_limit):
                    df.loc[previous_idx, cols_to_null] = np.nan

            # Price at the jump versus the prices that follow it. The window
            # is clamped to the end of the series (not to the last outlier
            # row, which left it empty for the final candidate).
            window_end = min(df_idx + window, last_row)
            following = df.loc[df_idx + 1: window_end, price_col].dropna()
            if _is_local_price_outlier(df.loc[df_idx, price_col], following, local_limit):
                df.loc[df_idx, cols_to_null] = np.nan

        data_dict['new_length'] = len(df)
        outlier_series[row['symbol']] = data_dict

        # Returns must reflect the blanked prices; no forward fill.
        df['pct_change'] = df[price_col].pct_change(fill_method=None)

        meta_df.at[idx, 'df'] = df

    # return outlier_series

# Settings for this run; z_threshold=50 replaces the function's default of 120.0.
z_threshold = 50
window = 5
# modified_series_info = detect_and_nullify_global_outliers(meta, price_col=price_col, z_threshold=z_threshold, window=window)
# Cleans the series frames stored in `meta` in place.
detect_and_nullify_global_outliers(meta, price_col=price_col, z_threshold=z_threshold, window=window)


In [None]:
# # Check global outliers ### NATURALLY PLOTS A FEW STRAGGLERS
# def global_return_filter(meta_df, z_threshold=120.0):
#     all_pct_changes = pd.concat(
#         [row['df']['pct_change'] for _, row in meta_df.iterrows()],
#         ignore_index=True
#     )
#     all_pct_changes.dropna(inplace=True)
#     all_pct_changes = all_pct_changes[~np.isinf(all_pct_changes) & (all_pct_changes != 0)]

#     global_median_return = all_pct_changes.median()
#     global_mad = (all_pct_changes - global_median_return).abs().median()

#     outlier_series = {}
#     for _, row in tqdm(meta_df.iterrows(), total=len(meta_df)):
#         df = row['df']        
#         absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
#         if absolute_modified_z.max() > z_threshold:
#             outlier_series[row['symbol']] = absolute_modified_z.describe()

#     return outlier_series, global_mad, global_median_return

# z_threshold = 50
# globally_defective_symbols, global_mad, global_median_return = global_return_filter(meta, z_threshold=z_threshold)
# globally_defective_symbols = pd.DataFrame(globally_defective_symbols)

# meta_indexed = meta.set_index('symbol')
# for symbol in globally_defective_symbols.T.sort_values(by='max', ascending=True).index.tolist():
#     df = meta_indexed.loc[symbol, 'df'].copy()
#     df = df.reset_index(drop=True)

#     absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
#     outlier_mask = absolute_modified_z > z_threshold

#     corrected_outlier_mask = pd.Series(False, index=df.index)
#     for df_idx in df.index[outlier_mask]:
#         # Check data points before
#         price_to_check_idx = df_idx - 1
#         price_to_check = df.at[price_to_check_idx, price_col]
#         local_window_start = max(0, price_to_check_idx - window)
#         local_window = df.loc[local_window_start : price_to_check_idx - 1, price_col].dropna()
#         local_mean = local_window.mean()
#         local_std = local_window.std()
#         if local_std != 0:
#             price_z_score = abs(price_to_check - local_mean) / local_std
#             if price_z_score > z_threshold / 10:
#                 corrected_outlier_mask.at[price_to_check_idx] = True

#         # Check data points after
#         price_to_check = df.at[df_idx, price_col]
#         local_window_end = min(df_idx + window, df.index[outlier_mask].max())
#         local_window = df.loc[df_idx + 1: local_window_end, price_col].dropna()
#         local_mean = local_window.mean()
#         local_std = local_window.std()
#         if local_std != 0:
#             price_z_score = abs(price_to_check - local_mean) / local_std
#             if price_z_score > z_threshold / 10:
#                 corrected_outlier_mask.at[df_idx] = True

#     if corrected_outlier_mask.any():
#         # Plotting
#         plt.figure(figsize=(10, 6))
#         plt.plot(df['date'], df[price_col], marker='o', label='Normal')
#         plt.scatter(df.loc[corrected_outlier_mask, 'date'],
#                     df.loc[corrected_outlier_mask, price_col],
#                     color='red', label='Outlier', zorder=5)

#         plt.title(f"Symbol: {symbol}")
#         plt.xlabel("Date")
#         plt.ylabel(price_col)
#         plt.legend()
#         plt.tight_layout()
#         plt.show()

In [None]:
# # Delete duplicates ETF files
# duplicates = meta[meta.duplicated(subset=['symbol', 'currency'], keep=False)].copy()
# duplicates['not_smart'] = duplicates['exchange_api'] != 'SMART'
# duplicates['length'] = duplicates['df'].apply(len)

# sorted_duplicates = duplicates.sort_values(
#     by=['symbol', 'currency', 'length', 'not_smart'],
#     ascending=[True, True, False, False]
# )

# rows_to_keep = sorted_duplicates.groupby(['symbol', 'currency']).head(1)
# rows_to_delete = duplicates[~duplicates.index.isin(rows_to_keep.index)]
# for idx, row in rows_to_delete.iterrows():
#     file_name = f"{row['symbol']}-{row['exchange_api']}-{row['currency']}.csv"
#     file_path = os.path.join(data_path, file_name)
#     if os.path.exists(file_path):
#         os.remove(file_path)
#         print(f"Deleted {file_path}")
#     else:
#         print(f"File not found: {file_path}")

# del duplicates, sorted_duplicates, rows_to_keep, rows_to_delete

In [None]:
# Calculate series gap stats
# Ask for the analysis window (whole years, at least 2) until a valid integer is entered.
while True:
    try:
        year_range = int(input('Year range (>= 2):'))
    except ValueError:
        # Non-numeric entry: ask again instead of aborting the cell.
        continue
    if year_range >= 2:
        break

oldest = latest - pd.Timedelta(days=365 * year_range)
business_days = pd.date_range(start=oldest, end=latest, freq='B')

# Calculate statistics for each DataFrame in meta
for idx, row in tqdm(meta.iterrows(), total=len(meta)):
    df = row['df']
    # Align the series to the business-day calendar; absent days become NaN rows.
    merged = pd.merge(pd.DataFrame({'date': business_days}), df, on='date', how='left')

    # Calculate gaps
    present = merged[price_col].notna()
    present_idx = np.flatnonzero(present)
    gaps = []
    length = len(merged)

    if present_idx.size > 0:
        # Leading gap before the first observed price.
        if present_idx[0] > 0:
            gaps.append(present_idx[0])
        # Gaps between consecutive observed prices.
        if present_idx.size > 1:
            internal_gaps = np.diff(present_idx) - 1
            gaps.extend(gap for gap in internal_gaps if gap > 0)
        # Trailing gap after the last observed price.
        if present_idx[-1] < length - 1:
            gaps.append(length - 1 - present_idx[-1])
    else:
        # No price at all: the whole window is one gap.
        gaps = [length]

    gaps = np.array(gaps, dtype=int)
    gaps = gaps[gaps > 0]
    max_gap = float(gaps.max()) if gaps.size > 0 else 0.0
    std_gap = float(gaps.std()) if gaps.size > 0 else 0.0
    missing = length - present.sum()
    pct_missing = missing / length

    # Update meta with statistics
    meta.at[idx, 'df'] = merged
    meta.at[idx, 'max_gap'] = max_gap
    meta.at[idx, 'missing'] = missing
    meta.at[idx, 'pct_missing'] = pct_missing

print(f'Latest: {latest}')
print(f'Oldest: {oldest}')

In [None]:
# Remove series with large day gaps
meta['max_gap_log'] = np.log1p(meta['max_gap'])

# HARD-CODED 3y window mean stats
# These fixed limits are the ones actually applied. The adaptive alternative
# (meta['max_gap_log'].mean() and meta['pct_missing'].mean()) was previously
# computed here and immediately overwritten, so it has been removed.
max_gap_log = 3.0415511502218044
max_pct_missing = 0.2915700460994459
condition = ((meta['max_gap_log'] < max_gap_log) &
             (meta['pct_missing'] < max_pct_missing))

filtered = meta[condition].copy()

print(f'{len(filtered)} ETFs included')
print(f'{len(meta) - len(filtered)} dropped')
# `meta` is gone after this cell; re-run the RESET cell to rebuild it.
del meta

In [None]:
# Interpolate/extrapolate price column and merge with fund
for idx, row in tqdm(filtered.iterrows(), total=len(filtered)):
    df = row['df']
    # Fill missing prices with Akima interpolation
    df[price_col] = df[price_col].interpolate(method='akima', limit_direction='both')
    # Anything still missing is filled with the nearest known price:
    # forward fill first, then backward fill.
    if df[price_col].isna().any():
        df[price_col] = df[price_col].ffill()
        df[price_col] = df[price_col].bfill()
    
    # Recompute returns on the gap-free price column and index the frame by date.
    df['pct_change'] = df[price_col].pct_change()
    filtered.at[idx, 'df'] = df.set_index('date')

# Attach fundamentals by (symbol, currency) and drop the gap statistics used only for filtering.
filtered = pd.merge(filtered, fund_df, on=['symbol', 'currency'], how='inner').drop(['max_gap', 'missing', 'pct_missing', 'max_gap_log'], axis=1)

# NOTE(review): `fund_df` is deleted here, so this cell cannot be re-run without reloading it.
del fund_df

In [None]:
# # Spectral preprocessing
# from statsmodels.tsa.stattools import adfuller
# from scipy.signal import welch
# import matplotlib.pyplot as plt

# def test_stationarity(series, signif=0.05, regression='c'):
#     try:
#         series = series.dropna()
#         adf_res = adfuller(series, regression=regression, autolag='AIC')
#         stat, pval, used_lags, nobs, crit_vals, icbest = adf_res
#         return {
#             'test_statistic': stat,
#             'p_value': pval,
#             'used_lags': used_lags,
#             'nobs': nobs,
#             'critical_values': crit_vals,
#             'is_stationary': (pval < signif),
#             'icbest': icbest,
#         }
#     except Exception as e:
#         print(f'{e}')
#         return {
#             'test_statistic': np.nan,
#             'p_value': np.nan,
#             'used_lags': np.nan,
#             'nobs': np.nan,
#             'critical_values': np.nan,
#             'is_stationary': np.nan,
#             'icbest': np.nan,
#         }

# def demean(series):
#     mu = series.mean()
#     return series - mu

# def apply_window(series, window_type='hanning'):
#     x = series.values if hasattr(series, 'values') else np.asarray(series)
#     N = len(x)
    
#     window_map = {
#         'hanning': np.hanning,
#         'hamming': np.hamming,
#         'bartlett':np.bartlett,
#         'blackman':np.blackman,
#     }
    
#     w = window_map[window_type](N)
#     return x * w

# def compute_welch_psd(series, fs=1.0, window='hann', nperseg=256, noverlap=None):
#     if noverlap is None:
#         noverlap = nperseg // 2
#     freqs, psd = welch(
#         series.dropna().values,
#         fs=fs,
#         window=window,
#         nperseg=nperseg,
#         noverlap=noverlap,
#         detrend=False,
#         scaling='density'
#     )
#     return freqs, psd


# def analyze_spectrum_and_extract_features(series, fs=1.0, nperseg=256):
#     series = series.dropna()
#     if len(series) < nperseg:
#         raise

#     stationarity_info = test_stationarity(series)
#     if not stationarity_info['is_stationary']:
#         return {'is_stationary': False}

#     demeaned_series = demean(series)
#     try:
#         freqs, psd = welch(
#             demeaned_series,
#             fs=fs,
#             window='hann',
#             nperseg=nperseg,
#             noverlap=nperseg // 2,
#             detrend=False,
#             scaling='density'
#         )
#     except ValueError as e:
#         print(e)
#         print('welch')
#         return None

#     if len(freqs) < 2:
#         print('small freqs')
#         return None
        
#     non_zero_freq_mask = freqs > 0
#     freqs = freqs[non_zero_freq_mask]
#     psd = psd[non_zero_freq_mask]

#     if len(psd) == 0:
#         print('small psd')
#         return None

#     # Dominant cycle: Frequency with the highest power
#     dominant_freq_idx = np.argmax(psd)
#     dominant_freq = freqs[dominant_freq_idx]
#     dominant_freq_power = psd[dominant_freq_idx]
#     dominant_period_days = 1 / dominant_freq if dominant_freq != 0 else np.inf # Convert frequency to period in days

#     # A high ratio suggests a strong, clear cycle. A low ratio suggests noise.
#     spectral_concentration = dominant_freq_power / np.mean(psd)

#     return {
#         'is_stationary': True,
#         'dominant_freq': dominant_freq,
#         'dominant_period_days': dominant_period_days,
#         'dominant_freq_power': dominant_freq_power,
#         'spectral_concentration': spectral_concentration,
#     }

# spectral_features = []
# for idx, row in tqdm(filtered.iterrows(), total=len(filtered)):
#     df = row['df']
#     returns = df['pct_change'].dropna()
    
#     features = analyze_spectrum_and_extract_features(returns, nperseg=256)
#     if features:
#         features['conId'] = row['conId']
#         spectral_features.append(features)

# spectral_df = pd.DataFrame(spectral_features)
# if not spectral_df.empty:
#     filtered = pd.merge(filtered, spectral_df, on=['conId'], how='left')

# # You can now use these new columns for further filtering.
# # For example, you might want to investigate or exclude series that are:
# # - Non-stationary (is_stationary == False)
# # - Have a very low spectral concentration (indicating they are mostly noise)
# # - Have a dominant period that seems suspicious (e.g., exactly 7 days for a business-day series)


# Plot asset class portfolios

In [None]:
# Risk-free series calculation
import pandas_datareader.data as web

# 3-month bill/interest rate tickers (FRED/OECD) for each country
tickers = {
    'US': 'DTB3',
    'Canada': 'IR3TIB01CAM156N',
    'Germany': 'IR3TIB01DEM156N',
    'UK': 'IR3TIB01GBM156N',
    'France': 'IR3TIB01FRA156N',
}

# Fetch each series and convert from percentage to decimal
bonds = {}
failed = []
for country, ticker in tickers.items():
    try:
        series = web.DataReader(ticker, 'fred', oldest, latest)
        bonds[country] = series / 100.0
    except Exception:
        # FRED failed: try the OECD source before giving up on this country.
        try:
            series = web.DataReader(ticker, 'oecd', oldest, latest)
            bonds[country] = series / 100.0
        except Exception:
            failed.append(country)

if not bonds:
    # pd.concat would otherwise fail with an unrelated-looking error.
    raise RuntimeError(f'No short-term rate series could be downloaded (failed: {failed})')

# Combine into a single DataFrame
df_bonds = pd.concat(bonds, axis=1)
df_bonds.columns = [c for c in tickers if c not in failed]
df_bonds = df_bonds.interpolate(method='akima').bfill().ffill()

# Equal-weight average across the available countries, on the business-day calendar.
risk_free_df = df_bonds.mean(axis=1).rename('nominal_rate')
risk_free_df = risk_free_df.reindex(business_days)

risk_free_df = pd.DataFrame(risk_free_df)
# Nominal rate spread over 252 periods per year.
risk_free_df['daily_nominal_rate'] = risk_free_df['nominal_rate'] / 252

print(f'Short-term bonds used from: {df_bonds.columns.to_list()}')

In [None]:
# Add pct_change cols to dfs and create pct_changes
# Columns with int64/float64 dtype, excluding the 'conId' identifier.
numerical_cols = [col for col in filtered.columns if filtered[col].dtype in [np.int64, np.float64] and col not in ['conId']]

# One column of returns per instrument, labelled by its conId.
pct_changes = pd.concat(
        [row['df']['pct_change'].rename(row['conId']) 
        for _, row in filtered.iterrows()], axis=1
    )

# Remove uninformative cols for market portfolios 
# (numeric columns with at most one distinct non-null value, then columns that are entirely NaN)
uninformative_cols = [col for col in numerical_cols if filtered[col].nunique(dropna=True) <= 1]
filtered = filtered.drop(columns=uninformative_cols)
filtered = filtered.dropna(axis=1, how='all')

In [None]:
# Add rate of change fundamentals
def calculate_slope(value1, value2, time1, time2):
    """Return the slope between the points (time1, value1) and (time2, value2).

    Works element-wise when the values are pandas Series.
    """
    rise = value1 - value2
    run = time1 - time2
    return rise / run


# Groups of fundamentals measured over different horizons (shortest first).
# The matching columns in `filtered` carry a 'fundamentals_' prefix.
rate_fundamentals = [('EPSGrowth-1yr', 'EPS_growth_3yr', 'EPS_growth_5yr'),
                     ('ReturnonAssets1Yr', 'ReturnonAssets3Yr'),
                     ('ReturnonCapital', 'ReturnonCapital3Yr'),
                     ('ReturnonEquity1Yr', 'ReturnonEquity3Yr'),
                     ('ReturnonInvestment1Yr', 'ReturnonInvestment3Yr')]

for cols in rate_fundamentals:
    # Name of the new column, derived from the first entry without its horizon suffix.
    base_name = cols[0].replace('-1yr', '').replace('1Yr', '')
    slope_col = f'fundamentals_{base_name}_slope'
    
    if len(cols) == 3:
        col_1yr, col_3yr, col_5yr = cols

        # Overall slope between the 1-year and 5-year figures.
        filtered[slope_col] = calculate_slope(
            filtered[f'fundamentals_{col_1yr}'],
            filtered[f'fundamentals_{col_5yr}'],
            1, 5
        )

        if 'EPS' in base_name:
            # Slopes of the two sub-intervals, then the slope between them.
            slope_1yr_3yr = calculate_slope(
                filtered[f'fundamentals_{col_1yr}'],
                filtered[f'fundamentals_{col_3yr}'],
                1, 3
            )
            slope_3yr_5yr = calculate_slope(
                filtered[f'fundamentals_{col_3yr}'],
                filtered[f'fundamentals_{col_5yr}'],
                3, 5
            )
            
            second_deriv_col = f'fundamentals_{base_name}_second_deriv'
            filtered[second_deriv_col] = calculate_slope(
                slope_1yr_3yr,
                slope_3yr_5yr,
                1, 3
            )
    elif len(cols) == 2:
        col_1yr, col_3yr = cols
        # Slope between the 1-year and 3-year figures.
        filtered[slope_col] = calculate_slope(
            filtered[f'fundamentals_{col_1yr}'],
            filtered[f'fundamentals_{col_3yr}'],
            1, 3
        )

# Add new cols to numericals
numerical_cols = [col for col in filtered.columns if filtered[col].dtype in [np.int64, np.float64] and col not in ['conId']]

In [None]:
# Return stats and split training and tests sets
def get_return_stats(df, training_cutoff, momentum_cutoffs, risk_free_df):
    """Compute momentum and a Sharpe ratio on the training part of a series.

    Only rows strictly before ``training_cutoff`` are used.

    Args:
        df: Date-indexed frame with a 'pct_change' column.
        training_cutoff: First date that is excluded from the statistics.
        momentum_cutoffs: Mapping with keys '3mo', '6mo' and '1y' giving the
            start date of each momentum window.
        risk_free_df: Date-indexed frame with a 'daily_nominal_rate' column.

    Returns:
        A Series indexed 'momentum_3mo', 'momentum_6mo', 'momentum_1y' and
        'stats_sharpe'. Each momentum is the mean return from its window
        start onward; the Sharpe ratio is mean over standard deviation of the
        returns in excess of the risk-free rate.
    """
    in_sample = df.loc[df.index < training_cutoff]
    rf_in_sample = risk_free_df.loc[risk_free_df.index < training_cutoff]

    returns = in_sample['pct_change']
    excess = returns.sub(rf_in_sample['daily_nominal_rate'])
    sharpe = excess.mean() / excess.std()
    # avg_volume = in_sample['volume'].mean()

    stats = {
        f'momentum_{horizon}': returns[returns.index >= momentum_cutoffs[horizon]].mean()
        for horizon in ('3mo', '6mo', '1y')
    }
    stats['stats_sharpe'] = sharpe  # , 'stats_avg_volume': avg_volume
    return pd.Series(stats)

# The last fifth of the window (in calendar days) is held out; training ends at `training_cutoff`.
final_20_pct = (latest-oldest).days//5
training_cutoff = latest - pd.Timedelta(days=final_20_pct)
# Start dates of the momentum windows, counted back from the training cut-off.
momentum_cutoffs = {
    '1y':  training_cutoff - pd.Timedelta(days=365),
    '6mo': training_cutoff - pd.Timedelta(days=365 // 2),
    '3mo': training_cutoff - pd.Timedelta(days=365 // 4),
}

# Apply to each row
# filtered[['momentum_3mo', 'momentum_6mo', 'momentum_1y', 'stats_sharpe', 'stats_avg_volume']] = filtered['df'].apply(lambda df: get_return_stats(df, training_cutoff, momentum_cutoffs, risk_free_df))
filtered[['momentum_3mo', 'momentum_6mo', 'momentum_1y', 'stats_sharpe']] = filtered['df'].apply(lambda df: get_return_stats(df, training_cutoff, momentum_cutoffs, risk_free_df))


In [None]:
# Create all asset type indices/portfolios
import matplotlib.pyplot as plt

# One portfolio per 'holding_*' column plus a 'total' portfolio weighted by
# market cap alone ('total' is a label handled below, not a column).
holding_cols = [col for col in filtered.columns if col.startswith('holding_') and col != 'holding_types_variety'] + ['total']
portfolio_dfs = {}

for holding_col in holding_cols:
    name = holding_col.split('_')[-1]
    if holding_col == 'total':
        weight = filtered['profile_cap_usd']
    else:
        # Market cap scaled by the holding column's value.
        weight = (filtered['profile_cap_usd'] * filtered[holding_col])

    total_market_cap = (weight).sum()
    filtered['weight'] = weight / total_market_cap

    # Weighted sum of instrument returns; weights are matched to the
    # pct_changes columns through conId.
    weights = filtered.set_index('conId')['weight']
    portfolio_return = pct_changes.dot(weights)
    initial_price = 1
    # Missing portfolio returns count as flat days in the compounded price.
    portfolio_price = initial_price * (1 + portfolio_return.fillna(0)).cumprod()

    portfolio_df = pd.DataFrame({
        'date': portfolio_price.index,
        price_col: portfolio_price.values,
        'pct_change': portfolio_return.values
    }).set_index('date')

    portfolio_dfs[name] = portfolio_df

    plt.figure(figsize=(10, 6))
    # A format spec replaces the nested format(..., ',.0f') call, whose inner
    # single quotes are a syntax error before Python 3.12.
    plt.title(f'{name.capitalize()} portfolio  -  ${total_market_cap:,.0f}')
    plt.plot(portfolio_df.index, portfolio_df[price_col], marker='o')
    plt.show()

# Remove the scratch column used inside the loop.
filtered.drop('weight', axis=1, inplace=True)

In [None]:
# # Manual plot
# symbol_test = 'SHV'
# x = filtered[filtered['symbol'] == symbol_test].df.iloc[0].index
# y = filtered[filtered['symbol'] == symbol_test].df.iloc[0]['average']#.pct_change()

# y = risk_free_df['daily_nominal_rate']
# x = risk_free_df.index
# # y = df_bonds['UK']
# # x = df_bonds.index
# # y = portfolio_dfs['equity']['pct_change'] - risk_free_df['daily_nominal_rate']
# # x = portfolio_dfs['equity'].index

# plt.figure(figsize=(10, 6))
# plt.plot(x, y, marker='o')
# # plt.xlim(market_portfolio_df['date'].min(), market_portfolio_df['date'].max())
# plt.show()

In [None]:
# Avoid dummy trap
# For each category, the catch-all subcategories whose columns are dropped.
empty_subcategories = {
'holding_types': ['other'],
'countries': ['Unidentified'], 
'currencies': ['<NoCurrency>'],
'industries': ['NonClassifiedEquity', 'NotClassified-NonEquity'],
'top10': ['OtherAssets', 'AccountsPayable','AccountsReceivable','AccountsReceivable&Pay','AdministrationFees','CustodyFees','ManagementFees','OtherAssetsandLiabilities','OtherAssetslessLiabilities', 'OtherFees','OtherLiabilities','Tax','Tax--ManagementFees'],
'debtors': ['OTHER'],
'maturity': ['%MaturityOther'],
'debt_type': ['%QualityNotAvailable', '%QualityNotRated'],
'manual': ['asset_other']
}

# Entries under 'manual' are already full column names; every other entry
# is prefixed with its category key.
dummy_trap_cols = [
    label if category == 'manual' else f'{category}_{label}'
    for category, labels in empty_subcategories.items()
    for label in labels
]

# Columns that do not exist are silently skipped.
filtered = filtered.drop(columns=dummy_trap_cols, errors='ignore')
numerical_cols = [
    col for col in filtered.columns
    if filtered[col].dtype in [np.int64, np.float64] and col not in ['conId']
]

In [None]:
# Select asset types to work on
# NOTE(review): the interactive cell that follows repeats this logic with a
# user-supplied exclusion list; confirm which of the two is meant to run.
# Boolean row masks per asset class; 'other' means none of the three flags is set.
asset_conditions = {
    'equity': (filtered['asset_equity'] == 1),
    'cash': (filtered['asset_cash'] == 1),
    'bond': (filtered['asset_bond'] == 1),
    'other': (filtered['asset_equity'] == 0) & (filtered['asset_cash'] == 0) & (filtered['asset_bond'] == 0),
}

# Hard-coded exclusion list for this variant of the cell.
exclude_assets = ['bond']
asset_classes = list(asset_conditions.keys())

# Keep a row when it matches at least one of the included asset classes.
include_assets = [asset for asset in asset_classes if asset not in exclude_assets]
combined_condition = pd.Series(False, index=filtered.index)
for asset in include_assets:
    combined_condition |= asset_conditions[asset]

filtered_df = filtered[combined_condition]
numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]

# Drop numeric columns left with a single distinct value, plus all 'asset*' columns.
single_value_columns = [col for col in filtered_df.columns if col in numerical_cols and filtered_df[col].nunique() == 1]
asset_cols = [col for col in filtered_df if col.startswith('asset')]
filtered_df = filtered_df.drop(columns=single_value_columns + asset_cols)
numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]
if exclude_assets:
    print("Excluding assets:", ', '.join(exclude_assets))
else:
    print("No assets excluded.")

# Restrict the returns matrix to the selected instruments.
# NOTE(review): `pct_changes` is overwritten with its own subset, so a later
# selection that needs a conId removed here will fail on the lookup.
pct_changes = pct_changes[filtered_df['conId']]

In [None]:
# Select asset types to work on
# Interactive variant: the asset classes to exclude are typed in and matched
# fuzzily against the known class names.
from fuzzywuzzy import fuzz
import pandas as pd

# Boolean row masks per asset class; 'other' means none of the three flags is set.
asset_conditions = {
    'equity': (filtered['asset_equity'] == 1),
    'cash': (filtered['asset_cash'] == 1),
    'bond': (filtered['asset_bond'] == 1),
    'other': (filtered['asset_equity'] == 0) & (filtered['asset_cash'] == 0) & (filtered['asset_bond'] == 0),
}

# Commas and whitespace both separate entries.
asset_to_exclude = input("Assets to EXCLUDE (equity, bond, cash, other): ").lower().replace(',', ' ').split()

asset_classes = list(asset_conditions.keys())
exclude_assets = set()
for word in asset_to_exclude:
    # Accept the closest class name when its similarity score is at least 70.
    scores = [(asset, fuzz.ratio(word, asset.lower())) for asset in asset_classes]
    best_asset, best_score = max(scores, key=lambda x: x[1])
    if best_score >= 70:
        exclude_assets.add(best_asset)

# Keep a row when it matches at least one of the included asset classes.
include_assets = [asset for asset in asset_classes if asset not in exclude_assets]
combined_condition = pd.Series(False, index=filtered.index)
for asset in include_assets:
    combined_condition |= asset_conditions[asset]

filtered_df = filtered[combined_condition]
numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]

# Drop numeric columns left with a single distinct value, plus all 'asset*' columns.
single_value_columns = [col for col in filtered_df.columns if col in numerical_cols and filtered_df[col].nunique() == 1]
asset_cols = [col for col in filtered_df if col.startswith('asset')]
filtered_df = filtered_df.drop(columns=single_value_columns + asset_cols)
numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]
if exclude_assets:
    print("Excluding assets:", ', '.join(exclude_assets))
else:
    print("No assets excluded.")

# Restrict the returns matrix to the selected instruments.
# NOTE(review): `pct_changes` is overwritten with its own subset, so running
# this after an earlier, narrower selection fails for conIds already removed.
pct_changes = pct_changes[filtered_df['conId']]

### Country compression

In [None]:
# Rename to standard country strings
import country_converter as coco

cc = coco.CountryConverter()
country_cols = [col for col in filtered_df.columns if col.startswith('countries') and not col.endswith('variety')]

# Explicit marker for failed lookups. With not_found=None country_converter
# returns the *input* name unchanged, so a failed lookup could not be told apart
# from a successful one and the "Could not standardize" branch never ran.
UNMATCHED_COUNTRY = '__unmatched__'

standard_names = set()
rename_map = {}
for col in country_cols:
    if col == 'countries_Unidentified':
        continue

    # Rebuild a human-readable name from the column suffix: remove spaces, then
    # re-insert one space before every upper-case character except the first.
    raw_name = col.replace('countries_', '').replace(' ', '')
    raw_name = ''.join([' ' + char if char.isupper() and i > 0 else char for i, char in enumerate(raw_name)]).strip()

    standard_name = cc.convert(names=raw_name, to='ISO3', not_found=UNMATCHED_COUNTRY)
    # A non-string result (e.g. a list of several candidate matches) is treated
    # as unresolved rather than being added to the set, where it is unhashable.
    if isinstance(standard_name, str) and standard_name != UNMATCHED_COUNTRY:
        standard_names.add(standard_name)
        rename_map[col] = f'countries_{standard_name}'
    else:
        print(f"Could not standardize: '{raw_name}' (from column '{col}')")

# Apply renaming; columns that could not be standardized keep their original name.
filtered_df = filtered_df.rename(columns=rename_map)

In [None]:
# MSCI classifications dict: country name -> market classification label
# ('Developed', 'Emerging', 'Frontier' or 'Standalone').
# NOTE(review): the assignments below are hand-maintained; verify them against
# the current MSCI market classification before relying on them.
msci_map = {
    # Developed Markets
    "Canada": "Developed",
    "USA": "Developed",
    "Austria": "Developed",
    "Belgium": "Developed",
    "Denmark": "Developed",
    "Finland": "Developed",
    "France": "Developed",
    "Germany": "Developed",
    "Ireland": "Developed",
    "Israel": "Developed",
    "Italy": "Developed",
    "Netherlands": "Developed",
    "Norway": "Developed",
    "Portugal": "Developed",
    "Spain": "Developed",
    "Sweden": "Developed",
    "Switzerland": "Developed",
    "UK": "Developed",
    "Australia": "Developed",
    "Hong Kong": "Developed",
    "Japan": "Developed",
    "New Zealand": "Developed",
    "Singapore": "Developed",
    "Luxembourg": "Developed",
    "Slovakia": "Developed",
    "Cyprus": "Developed",

    # Emerging Markets
    "Brazil": "Emerging",
    "Chile": "Emerging",
    "Colombia": "Emerging",
    "Mexico": "Emerging",
    "Peru": "Emerging",
    "Czech Republic": "Emerging",
    "Egypt": "Emerging",
    "Greece": "Emerging",
    "Hungary": "Emerging",
    "Kuwait": "Emerging",
    "Poland": "Emerging",
    "Qatar": "Emerging",
    "Saudi Arabia": "Emerging",
    "South Africa": "Emerging",
    "Turkey": "Emerging",
    "United Arab Emirates": "Emerging",
    "China": "Emerging",
    "India": "Emerging",
    "Indonesia": "Emerging",
    "Korea": "Emerging",
    "Malaysia": "Emerging",
    "Philippines": "Emerging",
    "Taiwan": "Emerging",
    "Thailand": "Emerging",
    "Bahamas": "Emerging",
    "Costa Rica": "Emerging",
    "Dominican Republic": "Emerging",
    "Mongolia": "Emerging",
    "Uruguay": "Emerging",
    "Barbados": "Emerging",

    # Frontier Markets
    "Bahrain": "Frontier",
    "Benin": "Frontier",
    "Burkina Faso": "Frontier",
    "Croatia": "Frontier",
    "Estonia": "Frontier",
    "Guinea-Bissau": "Frontier",
    "Iceland": "Frontier",
    "Ivory Coast": "Frontier",
    "Jordan": "Frontier",
    "Kazakhstan": "Frontier",
    "Kenya": "Frontier",
    "Latvia": "Frontier",
    "Lithuania": "Frontier",
    "Mali": "Frontier",
    "Mauritius": "Frontier",
    "Morocco": "Frontier",
    "Niger": "Frontier",
    "Oman": "Frontier",
    "Romania": "Frontier",
    "Senegal": "Frontier",
    "Serbia": "Frontier",
    "Slovenia": "Frontier",
    "Togo": "Frontier",
    "Tunisia": "Frontier",
    "Bangladesh": "Frontier",
    "Pakistan": "Frontier",
    "Sri Lanka": "Frontier",
    "Vietnam": "Frontier",

    # Standalone Markets
    "Argentina": "Standalone",
    "Jamaica": "Standalone",
    "Panama": "Standalone",
    "Trinidad and Tobago": "Standalone",
    "Bosnia and Herzegovina": "Standalone",
    "Bulgaria": "Standalone",
    "Lebanon": "Standalone",
    "Malta": "Standalone",
    "Nigeria": "Standalone",
    "Palestine": "Standalone",
    "Ukraine": "Standalone",
    "Russia": "Standalone",
    "Zimbabwe": "Standalone",
    "Venezuela": "Standalone",
    "Liechtenstein": "Standalone",
    "British Virgin Islands": "Standalone",
    "Faroe Islands": "Standalone",
    "Guernsey": "Standalone",
    "Cayman Islands": "Standalone",
    "Jersey": "Standalone",
    "Isle of Man": "Standalone",
    "Bermuda": "Standalone",
    "Monaco": "Standalone",
    "Macau": "Standalone",
    "Puerto Rico": "Standalone",
    "United States Virgin Islands": "Standalone",
}

# Re-key the mapping by the ISO3 code that the converter returns for each name,
# so the keys line up with the `countries_<ISO3>` column suffixes.
# NOTE(review): with not_found=None country_converter appears to pass unmatched
# names through unchanged, so a name it cannot resolve would stay as a
# plain-name key instead of becoming None - confirm against the library docs.
# NOTE(review): two names resolving to the same code would silently collapse
# into one entry (the later one wins).
msci_map = {
    cc.convert(names=code, to='ISO3', not_found=None): value
    for code, value in msci_map.items()
}

In [None]:
# Discrete GDP and pop functions
import wbgapi as wb

def create_continent_map(standard_names):
    """Map each country identifier to its continent name.

    Args:
        standard_names: Iterable of country identifiers understood by the
            module-level ``cc`` CountryConverter (the caller passes ISO3 codes).

    Returns:
        dict ``{name: continent}``. Identifiers the converter cannot resolve,
        or resolves to something other than a single string, map to ``'Other'``.
        An empty input yields an empty dict.
    """
    # Materialise once so the names and the converter output share one order
    # (the caller passes a set).
    names = list(standard_names)
    if not names:
        return {}

    # not_found=None makes the converter echo the input back, which left the
    # intended 'Other' fallback unreachable; request 'Other' explicitly instead.
    continents = cc.convert(names=names, to='continent', not_found='Other')

    # For a single input the converter returns a bare value rather than a list;
    # wrap it so zip() does not iterate over the characters of a string.
    if len(names) == 1:
        continents = [continents]

    return {name: (cont if isinstance(cont, str) else 'Other')
            for name, cont in zip(names, continents)}

def create_metric_maps(standard_names, indicators, start_year, end_year, window_size=3):
    """Fetch World Bank indicators and derive level, growth and acceleration.

    The data is requested through ``wb.data.DataFrame`` for the years 2000
    through ``end_year.year`` inclusive. Year columns containing any missing
    value are discarded before the derivatives are computed.

    Args:
        standard_names: Economy identifiers passed to the World Bank query.
        indicators: Iterable of indicator codes.
        start_year: Accepted for interface compatibility but not used; the
            query always starts at 2000.
        end_year: Object with a ``.year`` attribute giving the last year requested.
        window_size: Length of the rolling mean applied to year-over-year changes.

    Returns:
        DataFrame holding, per indicator, ``raw_value`` (latest remaining
        year), ``1st_div`` (smoothed change divided by ``raw_value``) and
        ``2nd_div`` (smoothed change of the smoothed change, divided by
        ``raw_value``). When the unstacked result has MultiIndex columns they
        are ordered as (indicator, metric).
    """
    def smoothed_change(frame):
        # Year-over-year difference along the columns, then a rolling mean over
        # the same axis (done on the transpose because rolling works on rows).
        return frame.diff(axis=1).T.rolling(window=window_size).mean().T

    years = range(2000, end_year.year + 1)
    levels = wb.data.DataFrame(list(indicators), standard_names, time=years, labels=False)
    levels = levels.dropna(axis=1)

    first_div = smoothed_change(levels)
    second_div = smoothed_change(first_div)

    # Latest available column of each table, normalised by the latest level.
    derivatives = levels[levels.columns[-1]].to_frame(name='raw_value')
    derivatives['1st_div'] = first_div[first_div.columns[-1]] / derivatives['raw_value']
    derivatives['2nd_div'] = second_div[second_div.columns[-1]] / derivatives['raw_value']

    reshaped = derivatives.unstack(level='series')
    if not isinstance(reshaped.columns, pd.MultiIndex):
        return reshaped

    # Put the indicator code on the outer column level and sort by it.
    return reshaped.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)

In [None]:
# Create new country columns
# World Bank indicator code -> prefix used for the derived column names.
indicator_name_map = {
    'NY.GDP.PCAP.CD': 'gdp_pcap',
    'SP.POP.TOTL': 'population',
}

continent_map = create_continent_map(standard_names)
# `oldest` and `latest` come from earlier cells.
metric_df = create_metric_maps(standard_names, indicator_name_map.keys(), oldest, latest)

# Column of metric_df -> suffix appended to the indicator prefix.
metric_suffixes = {
    'raw_value': '_value',
    '1st_div': '_growth',
    '2nd_div': '_acceleration'
}

# One entry per country, so continent names can repeat; the loop below just
# re-initialises the same column in that case.
continents = list(continent_map.values())
# msci_groups = list(msci_map.values())

# Initialise every accumulator column to 0.0 before the weighted sums below.
for cont in continents:
    filtered_df[f'continent_{cont}'] = 0.0
# for group in msci_groups:
#     filtered_df[f'msci_{group}'] = 0.0
for ind_code, ind_name in indicator_name_map.items():
    if ind_code in metric_df.columns.get_level_values(0):
        for metric_col, suffix in metric_suffixes.items():
            new_col_name = f'{ind_name}{suffix}'
            filtered_df[new_col_name] = 0.0

# For each country column present in filtered_df, add it to its continent
# accumulator, and add (country column * country metric) to each indicator
# accumulator.
for std_name in standard_names:
    country_weight_col = f'countries_{std_name}'
    if country_weight_col not in filtered_df.columns:
        continue
    if std_name in continent_map:
        continent = continent_map[std_name]
        filtered_df[f'continent_{continent}'] += filtered_df[country_weight_col]        
    # if std_name in msci_map:
    #     market_group = msci_map[std_name]
    #     filtered_df[f'msci_{market_group}'] += filtered_df[country_weight_col]

    # Countries without a row in metric_df contribute nothing to the indicator columns.
    if std_name in metric_df.index:
        for ind_code, ind_name in indicator_name_map.items():
            if ind_code in metric_df.columns.get_level_values(0):
                for metric_col, suffix in metric_suffixes.items():
                    value = metric_df.loc[std_name, (ind_code, metric_col)]
                    target_col = f'{ind_name}{suffix}'
                    filtered_df[target_col] += filtered_df[country_weight_col] * value

# Drop single unique value columns
numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]
single_value_columns = [col for col in numerical_cols if filtered_df[col].nunique() == 1]
filtered_df = filtered_df.drop(columns=single_value_columns, errors='ignore')
# Recompute the numeric column list after the drop.
numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]

### Fundamentals compression

In [None]:
# Fundamentals reduced to factor columns
from sklearn.preprocessing import MinMaxScaler

fundamental_columns = [col for col in filtered_df.columns if col.startswith('fundamentals')]

# Ratios where a lower value scores higher: they are inverted (1 - scaled) below.
value_columns_inverted = [
    'fundamentals_Price/Book',
    'fundamentals_Price/Cash',
    'fundamentals_Price/Earnings',
    'fundamentals_Price/Sales',
]

leverage_columns_inverted = [
    'fundamentals_LTDebt/Shareholders',
    'fundamentals_TotalDebt/TotalCapital',
    'fundamentals_TotalDebt/TotalEquity',
    'fundamentals_TotalAssets/TotalEquity',
]

profitability_columns = [
    'fundamentals_ReturnonAssets1Yr',
    'fundamentals_ReturnonAssets3Yr',
    'fundamentals_ReturnonCapital',
    'fundamentals_ReturnonCapital3Yr',
    'fundamentals_ReturnonEquity1Yr',
    'fundamentals_ReturnonEquity3Yr',
    'fundamentals_ReturnonInvestment1Yr',
    'fundamentals_ReturnonInvestment3Yr',
    # 'fundamentals_SalestoTotalAssets',
    # 'fundamentals_EBITtoInterest',
]

# Scaled together with the other groups, but no composite factor is built from them.
investment_columns = [
    'fundamentals_EPSGrowth-1yr',
    'fundamentals_EPS_growth_3yr',
    'fundamentals_EPS_growth_5yr',
    'fundamentals_EPSGrowth_slope',
    'fundamentals_EPSGrowth_second_deriv',
    'fundamentals_ReturnonAssets_slope',
    'fundamentals_ReturnonCapital_slope',
    'fundamentals_ReturnonEquity_slope',
    'fundamentals_ReturnonInvestment_slope',
]


momentum_columns = [
    'momentum_3mo',
    'momentum_6mo',
    'momentum_1y',
    'fundamentals_RelativeStrength'
]

columns_to_scale = value_columns_inverted + leverage_columns_inverted + profitability_columns + investment_columns + momentum_columns

# Earlier cells drop single-valued columns, so some of the names above may be
# gone. The old guard used `any(...)` and then indexed with the full list, which
# raised KeyError as soon as one listed column was missing.
available_columns = set(filtered_df.columns)
scalable_columns = [col for col in columns_to_scale if col in available_columns]

if scalable_columns:
    # Min-max scale every surviving input column to [0, 1].
    scaler = MinMaxScaler()
    filtered_df[scalable_columns] = scaler.fit_transform(filtered_df[scalable_columns])

    # factor name -> (input columns, whether the scaled inputs are inverted)
    factor_groups = {
        'factor_value': (value_columns_inverted, True),
        'factor_leverage': (leverage_columns_inverted, True),
        'factor_profitability': (profitability_columns, False),
        # 'factor_investment': (investment_columns, False),
        'factor_momentum': (momentum_columns, False),
    }
    for factor_name, (group_columns, inverted) in factor_groups.items():
        group_present = [col for col in group_columns if col in available_columns]
        if not group_present:
            # No inputs left for this factor: skip it instead of creating an
            # all-zero column.
            continue
        scaled_inputs = filtered_df[group_present]
        filtered_df[factor_name] = ((1 - scaled_inputs) if inverted else scaled_inputs).sum(axis=1)

    # filtered_df = filtered_df.drop(columns=columns_to_scale, errors='ignore')

In [None]:
# Reorganize columns
# Column-name prefixes in display priority order (earlier prefix = further left).
categories = ['factor', 'holding_types', 'stats', 'momentum', 'profile', 'top10', 'population', 'msci', 'gdp', 'continent', 'countries', 'fundamentals', 'industries', 'currencies', 'debtors', 'maturity', 'debt_type', 'lipper', 'dividends', 'marketcap', 'style', 'domicile', 'asset']

numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]
numerical_set = set(numerical_cols)
non_numerical = [col for col in filtered_df.columns if col not in numerical_set]

# One stable sort on a per-category "does not match" tuple. The first category
# is the most significant key, and False (match) sorts before True, so matching
# columns move to the front while ties keep their existing relative order.
numerical_cols = sorted(
    numerical_cols,
    key=lambda col: tuple(not col.startswith(category) for category in categories),
)

# Non-numeric columns first, then the numeric ones grouped by category.
new_column_order = non_numerical + numerical_cols
filtered_df = filtered_df[new_column_order]

# Regression analysis

In [None]:
def construct_long_short_factor_returns(full_meta_df, returns_df, long_symbols, short_symbols, factor_column=None):
    """Return the long-minus-short return series of two weighted baskets.

    Each basket is weighted by ``profile_cap_usd``. When ``factor_column`` is
    given, the weights are additionally multiplied by a min-max position of the
    asset within ``full_meta_df[factor_column]``: ``(max - value) / (max - min)``
    for the long basket and ``(value - min) / (max - min)`` for the short
    basket. The tilt is skipped when it sums to zero. Weights are normalised to
    sum to one before being applied to ``returns_df``.

    Args:
        full_meta_df: Metadata with ``conId``, ``profile_cap_usd`` and, if used,
            ``factor_column``.
        returns_df: Return table whose columns are conIds.
        long_symbols: conIds forming the long basket.
        short_symbols: conIds forming the short basket.
        factor_column: Optional metadata column used to tilt the weights.

    Returns:
        Series of long-basket returns minus short-basket returns.
    """
    def basket_returns(symbols, empty_message, tilt):
        # Metadata of the basket members, keyed by conId.
        members = full_meta_df[full_meta_df['conId'].isin(symbols)].set_index('conId')
        weights = members['profile_cap_usd'].reindex(returns_df.columns).fillna(0)
        if weights.mean() == 0:
            # Diagnostic output for a basket that carries no weight at all.
            print(empty_message)
            print(members.index)
            print()
        if factor_column:
            column_max = full_meta_df[factor_column].max()
            column_min = full_meta_df[factor_column].min()
            tilt_weights = tilt(members[factor_column], column_max, column_min)
            tilt_weights = tilt_weights.reindex(returns_df.columns).fillna(0)
            if tilt_weights.sum() != 0:
                weights = weights * tilt_weights
        weights = weights / weights.sum()
        return returns_df.dot(weights)

    long_returns = basket_returns(
        long_symbols,
        f'Long {factor_column}',
        lambda values, hi, lo: (hi - values) / (hi - lo),
    )
    short_returns = basket_returns(
        short_symbols,
        f'Short {factor_column}',
        lambda values, hi, lo: (values - lo) / (hi - lo),
    )
    return long_returns - short_returns

In [None]:
def construct_factors(filtered_df, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=0.5, diffs=None):
    """Build the factor-return series used as regressors.

    Factors produced:
      * ``factor_market_premium``: ``portfolio_dfs['equity']['pct_change']``
        minus ``risk_free_df['daily_nominal_rate']``.
      * ``factor_smb``: long the rows with ``marketcap_small == 1``, short the
        rows with ``marketcap_large == 1``; rows flagged as both are removed
        from both legs.
      * one factor per remaining numeric metadata column (columns starting with
        ``style_``, ``marketcap_``, ``countries_``, ``momentum_`` or
        ``fundamentals_`` are skipped): long the rows at or above
        ``mean + scaling_factor * std``, short the rows at or below
        ``mean - scaling_factor * std``, with each boundary clipped to the
        column's max/min.

    Args:
        filtered_df: Asset metadata containing ``conId`` and the factor columns.
        pct_changes: Return table whose columns are conIds.
        portfolio_dfs: Mapping with an ``'equity'`` frame holding ``pct_change``.
        risk_free_df: Frame with a ``daily_nominal_rate`` column.
        scaling_factor: Number of standard deviations around the mean that
            separates the high and low groups.
        diffs: When truthy, also return the leg-size diagnostics.

    Returns:
        DataFrame of factor returns; when ``diffs`` is truthy, a tuple of that
        frame and a DataFrame with columns ``long``, ``short`` and ``diffs``
        (sizes of the two legs and their absolute difference, one row per
        long/short factor).

    Raises:
        Exception: any error raised while building a metadata factor is
            re-raised after the column name and the error have been printed.
    """
    long_counts = []
    short_counts = []
    count_gaps = []
    factors = {}

    def record_leg_sizes(long_symbols, short_symbols):
        # Keep the diagnostics tied to the legs that were actually traded.
        long_counts.append(len(long_symbols))
        short_counts.append(len(short_symbols))
        count_gaps.append(np.abs(len(long_symbols) - len(short_symbols)))

    # Market risk premium
    factors['factor_market_premium'] = (portfolio_dfs['equity']['pct_change'] - risk_free_df['daily_nominal_rate'])

    # SMB_ETF
    small_symbols = filtered_df[filtered_df['marketcap_small'] == 1]['conId'].tolist()
    large_symbols = filtered_df[filtered_df['marketcap_large'] == 1]['conId'].tolist()

    # An asset flagged both small and large belongs to neither leg.
    intersection = set(small_symbols) & set(large_symbols)
    small_symbols = [s for s in small_symbols if s not in intersection]
    large_symbols = [s for s in large_symbols if s not in intersection]
    factors['factor_smb'] = construct_long_short_factor_returns(filtered_df, pct_changes, small_symbols, large_symbols)
    record_leg_sizes(small_symbols, large_symbols)

    # HML_ETF
    # value_cols = [col for col in filtered_df.columns if col.startswith('style_') and col.endswith('value')]
    # growth_cols = [col for col in filtered_df.columns if col.startswith('style_') and col.endswith('growth')]
    # value_symbols = filtered_df[filtered_df[value_cols].ne(0).any(axis=1)]['conId'].tolist()
    # growth_symbols = filtered_df[filtered_df[growth_cols].ne(0).any(axis=1)]['conId'].tolist()

    # intersection = set(value_symbols) & set(growth_symbols)
    # value_symbols = [s for s in value_symbols if s not in intersection]
    # growth_symbols = [s for s in growth_symbols if s not in intersection]
    # hml_etf = construct_long_short_factor_returns(filtered_df, pct_changes, value_symbols, growth_symbols)
    # factors['factor_hml'] = hml_etf

    # long.append(len(value_symbols))
    # short.append(len(growth_symbols))
    # differences.append(np.abs(len(value_symbols) - len(growth_symbols)))

    # Metadata
    excluded = ['style_', 'marketcap_', 'countries_','momentum_', 'fundamentals_', ]
    numerical_cols = [col for col in filtered_df.columns if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']]
    for col in numerical_cols:
        if any(col.startswith(prefix) for prefix in excluded):
            continue
        try:
            std = filtered_df[col].std()
            mean = filtered_df[col].mean()

            # Clip the cut-offs to the observed range so neither group can be
            # empty just because the boundary falls outside the data.
            upper_boundary = min(filtered_df[col].max(), mean + (scaling_factor * std))
            lower_boundary = max(filtered_df[col].min(), mean - (scaling_factor * std))

            low_factor_symbols = filtered_df[filtered_df[col] <= lower_boundary]['conId'].tolist()
            high_factor_symbols = filtered_df[filtered_df[col] >= upper_boundary]['conId'].tolist()

            # Every factor is built as high-minus-low. The previous code first
            # evaluated an if/else (low-minus-high for '*variety' columns) and
            # then unconditionally overwrote the result with the high-minus-low
            # call, so each factor was computed twice and the branch never took
            # effect. The effective behaviour is kept; only the redundant
            # computation is removed.
            # NOTE(review): if '*variety' columns are meant to be low-minus-high,
            # swap the two legs here for those columns.
            long_symbols, short_symbols = high_factor_symbols, low_factor_symbols
            factors[col] = construct_long_short_factor_returns(filtered_df, pct_changes, long_symbols, short_symbols, factor_column=col)

            # Previously the low group was counted as 'long' although the high
            # group is the long leg.
            record_leg_sizes(long_symbols, short_symbols)
        except Exception as e:
            print(col)
            print(e)
            raise

    factors_df = pd.DataFrame(factors)
    if diffs:
        leg_sizes = pd.DataFrame({'long': long_counts,
                                  'short': short_counts,
                                  'diffs': count_gaps})
        return factors_df, leg_sizes
    return factors_df

In [None]:
def prescreen_factors(factors_df, correlation_threshold=0.99, drop_map=None):
    """Remove one factor from every pair whose absolute correlation is too high.

    Pairs are visited from the most to the least correlated. For each pair at
    or above ``correlation_threshold`` in which neither factor has already been
    dropped, the factor appearing earlier in the column order is kept and the
    other one is dropped.

    Args:
        factors_df: Non-empty DataFrame of factor series (one column per factor).
        correlation_threshold: Minimum absolute correlation that triggers a drop.
        drop_map: Optional existing ``{keeper: [dropped, ...]}`` mapping. A
            non-empty mapping is extended in place and its entries are treated
            as already dropped; ``None`` or an empty mapping starts a new dict.

    Returns:
        ``(screened_df, drop_map)``: a copy of ``factors_df`` without the
        dropped columns, and the keeper -> dropped-columns mapping.

    Raises:
        ValueError: if ``factors_df`` is ``None``, empty, or has no columns.
    """
    if factors_df is None or factors_df.empty or factors_df.shape[1] == 0:
        raise ValueError("factors_df must be a non-empty DataFrame with at least one column.")
    working = factors_df.copy()

    # Strict upper triangle of |corr| so each unordered pair appears once.
    abs_corr = working.corr().abs()
    upper_mask = np.triu(np.ones_like(abs_corr, dtype=bool), k=1)
    ranked_pairs = abs_corr.where(upper_mask).stack().sort_values(ascending=False)

    drop_map = drop_map if drop_map else {}

    # First position of every column name, used to decide which factor is kept.
    position = {}
    for idx, name in enumerate(working.columns):
        position.setdefault(name, idx)

    # Everything already marked as dropped, kept up to date while iterating.
    dropped = {col for drops in drop_map.values() for col in drops}

    for (left, right), corr_val in ranked_pairs.items():
        if corr_val < correlation_threshold:
            break  # pairs are sorted, so nothing further can qualify
        if left in dropped or right in dropped:
            continue

        if position[left] < position[right]:
            keeper, loser = left, right
        else:
            keeper, loser = right, left

        drop_map.setdefault(keeper, []).append(loser)
        dropped.add(loser)

    return working.drop(columns=set(dropped)), drop_map

def merge_drop_map(drop_map):
    """Collapse chained drops so every surviving keeper lists all its drops.

    A keeper that was itself dropped by another keeper does not appear in the
    result; its drops are attributed, transitively, to the keeper(s) above it.

    Args:
        drop_map: ``{keeper: [dropped, ...]}`` mapping.

    Returns:
        dict ``{keeper: sorted list of directly and transitively dropped
        columns}`` containing only keepers that are not dropped anywhere, in
        the iteration order of ``drop_map``.
    """
    dropped_anywhere = {col for drops in drop_map.values() for col in drops}

    merged = {}
    for keeper, direct in drop_map.items():
        if keeper in dropped_anywhere:
            continue

        # Walk the chain: anything dropped by a dropped column is also ours.
        reachable = set(direct)
        pending = list(direct)
        while pending:
            current = pending.pop()
            for nxt in drop_map.get(current, ()):
                if nxt not in reachable:
                    reachable.add(nxt)
                    pending.append(nxt)

        merged[keeper] = sorted(reachable)

    return merged


In [None]:
# OLS Regression function
import statsmodels.api as sm

def run_regressions(distilled_factors):
    """Fit one OLS regression of excess returns on the factors per asset.

    Reads the notebook-level ``pct_changes`` (one return column per conId) and
    ``risk_free_df['daily_nominal_rate']``. For every column of
    ``pct_changes`` the excess return is regressed on ``distilled_factors``
    plus a constant, using only rows where all series are present.

    Args:
        distilled_factors: DataFrame of factor series used as regressors.

    Returns:
        DataFrame with one row per asset: fit statistics, the intercept
        (``alpha*`` columns) and, for each factor, ``beta_``, ``pval_beta_``,
        ``tval_beta_`` and ``bse_beta_`` columns.
    """
    rows = []
    for symbol in pct_changes.columns:
        excess = (pct_changes[symbol] - risk_free_df['daily_nominal_rate']).rename('etf_excess')
        sample = pd.concat([excess, distilled_factors], axis=1).dropna()

        # First column is the dependent variable, the rest are the regressors.
        regressors = sm.add_constant(sample.iloc[:, 1:])
        model = sm.OLS(sample['etf_excess'], regressors).fit()

        row = {
            'conId': symbol,
            'nobs': model.nobs,
            'r_squared': model.rsquared,
            'r_squared_adj': model.rsquared_adj,
            'f_statistic': model.fvalue,
            'f_pvalue': model.f_pvalue,
            'aic': model.aic,
            'bic': model.bic,
            'condition_number': model.condition_number,
            'alpha': model.params['const'],
            'alpha_pval': model.pvalues['const'],
            'alpha_tval': model.tvalues['const'],
            'alpha_bse': model.bse['const'],
        }
        for factor in distilled_factors.columns:
            row.update({
                f'beta_{factor}': model.params[factor],
                f'pval_beta_{factor}': model.pvalues[factor],
                f'tval_beta_{factor}': model.tvalues[factor],
                f'bse_beta_{factor}': model.bse[factor],
            })
        rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# # Testing correlation thresholds and scaling factors
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# def calculate_vif(df):
#     vif_data = pd.DataFrame()
#     vif_data["feature"] = df.columns
#     vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
#     return vif_data.sort_values(by='VIF', ascending=False)

# # Old greedy iterative VIF pruning
# # max_vif_threshold = 99999
# # while True:
# #     vif_df = calculate_vif(distilled_factors.fillna(0))
# #     highest_vif = vif_df['VIF'].iloc[0]
# #     if highest_vif > max_vif_threshold and distilled_factors.shape[1] > 2:
# #         feature_to_drop = vif_df['feature'].iloc[0]
# #         distilled_factors.drop(columns=[feature_to_drop], inplace=True)
# #         cols_dropped.add(feature_to_drop)
# #         print(f'{feature_to_drop} - {highest_vif}')
# #     else:
# #         break

# if input("test corr and scale? (y/n)").lower() == 'y':
#     c_range = np.arange(0.6,1,0.02)
#     z_range = np.arange(0,2,0.2)
#     results = []
#     for c in tqdm(c_range, total=len(c_range)):
#         for z in tqdm(z_range, total=len(z_range)):
#             factors_df = construct_factors(filtered_df, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=z)
#             distilled_factors, _ = prescreen_factors(factors_df, correlation_threshold=c)

#             max_vif_threshold = 10
#             while True:
#                 vif_df = calculate_vif(distilled_factors.fillna(0))
#                 highest_vif = vif_df['VIF'].iloc[0]
#                 if highest_vif > max_vif_threshold and distilled_factors.shape[1] > 2:
#                     feature_to_drop = vif_df['feature'].iloc[0]
#                     distilled_factors.drop(columns=[feature_to_drop], inplace=True)
#                 else:
#                     break

#             results_df = run_regressions(distilled_factors)
#             results.append((z, c, results_df.condition_number.mean()))

#     df_plot = pd.DataFrame(results, columns=['z', 'c', 'vif'])
#     df_plot['vif'] = df_plot['vif'].replace([np.inf, -np.inf], np.nan)
#     # df_plot['vif'] = df_plot['vif'].fillna(max_finite_vif * 1.5)

#     df_pivot = df_plot.pivot(index='c', columns='z', values='vif')

#     plt.figure(figsize=(10, 6))
#     sns.heatmap(df_pivot, cmap='viridis', annot=False, cbar_kws={'label': 'Max VIF'})
#     plt.xlabel('z-score')
#     plt.ylabel('correlation threshold')
#     plt.title('VIF Heatmap (Original Grid)')
#     plt.show()


In [None]:
# Construct factors
from statsmodels.stats.outliers_influence import variance_inflation_factor
from IPython.display import Markdown

def calculate_vif(df):
    """Return the variance inflation factor of every column, largest first.

    Args:
        df: DataFrame of regressors; its ``.values`` are passed to
            ``variance_inflation_factor`` as-is.

    Returns:
        DataFrame with columns ``feature`` and ``VIF``, sorted by ``VIF`` in
        descending order.
    """
    design = df.values
    vifs = [variance_inflation_factor(design, position) for position in range(df.shape[1])]
    table = pd.DataFrame({"feature": df.columns, "VIF": vifs})
    return table.sort_values(by='VIF', ascending=False)

# Build all factor return series; `diffs` holds the leg-size diagnostics.
factors_df, diffs = construct_factors(filtered_df, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=0.6, diffs=True)

# Custom drop
# Hand-picked factors to remove, grouped by the reason given in each list name.
# Entries may carry the '_beta' suffix used in the regression result columns;
# it is removed below before dropping.
low_absolute_beta = ['profile_cap_usd', 'holding_types_equity', 'industries_BasicMaterials', 'continent_Oceania_beta', 'holding_types_bond_beta']#, 'factor_smb_beta']
frequently_lassoed = ['gdp_pcap_growth', 'gdp_pcap_acceleration', 'continent_Africa', 'population_value', 'industries_Financials_beta', 'stats_sharpe']
walk_forward = ['industries_Healthcare_beta', 'continent_America_beta']#, 'stats_sharpe', 'factor_momentum_beta', 'factor_profitability_beta']
custom_drop = low_absolute_beta + frequently_lassoed + walk_forward
# Keep the text before the first '_beta' occurrence, i.e. the factor column name.
custom_drop = [c.split('_beta')[0] for c in custom_drop]
# errors='ignore': names that are not present in factors_df are skipped silently.
factors_df = factors_df.drop(columns=custom_drop, errors='ignore')

# Screen factors
# Drop one factor of every pair with absolute correlation >= 0.95.
distilled_factors, drop_map = prescreen_factors(factors_df, correlation_threshold=0.95)

# corr_matrix = distilled_factors.corr()
# vif_df = calculate_vif(distilled_factors.dropna(axis=0))
# highest_vif = vif_df['VIF'].iloc[0]
# if distilled_factors.shape[1] > 2:
#     to_drop = vif_df['feature'].iloc[0]
#     distilled_factors.drop(columns=[to_drop], inplace=True)

# np.fill_diagonal(corr_matrix.values, 0)
# keeper = corr_matrix[to_drop].sort_values(ascending=False).index[0]
# drop_map.setdefault(keeper, []).append(to_drop)

# Fold chained drops so each surviving keeper lists everything removed on its behalf.
drop_map = merge_drop_map(drop_map)

In [None]:
# Print Final factors
# Show which factors were dropped on behalf of which keeper (if any).
if drop_map:
    display(pd.Series(drop_map))
print(distilled_factors.shape)

display(Markdown('## Factors included:'))
for cat in categories:
    # Strip the category prefix by length. `col.split(cat)[-1]` returned only
    # the text after the *last* occurrence of the category name, which truncates
    # a label whenever the name appears again further along the column name.
    cat_list = [col[len(cat):].strip('_').capitalize() for col in distilled_factors.columns if col.startswith(cat)]
    if cat_list:
        # Plain join instead of an f-string that reused its own quote character
        # inside the braces, which is a SyntaxError before Python 3.12.
        print(',  '.join(cat_list))

In [None]:
# VIF table for the screened factors, computed on rows without missing values
# and with 'industries_Technology' left out.
# NOTE(review): drop() raises KeyError if 'industries_Technology' is not among
# the screened factors.
calculate_vif(distilled_factors.dropna(axis=0).drop(columns='industries_Technology'))
# calculate_vif(distilled_factors.dropna(axis=0))

In [None]:
# Absolute pairwise correlations of the screened factors, keeping only the
# strict upper triangle so each pair is listed once, shown from highest to lowest.
corr_matrix = distilled_factors.corr().abs()
corr_pairs = corr_matrix.where(np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)).stack()
corr_pairs.sort_values(ascending=False)


In [None]:
# ElasticNet regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

def run_elastic_net(factors_df,
                    pct_changes,
                    risk_free_df,
                    training_cutoff,
                    alphas=np.logspace(-4, 1, 50),
                    l1_ratio=[.1, .5, .9],
                    cv=5,
                    tol=5e-4,
                    random_state=42):
    """Fit a cross-validated ElasticNet of excess returns on factors per asset.

    Factors, asset returns and the risk-free rate are inner-joined on their
    index and remaining missing values are filled with 0. Rows with an index
    below ``training_cutoff`` are used for fitting, the rest for out-of-sample
    evaluation. Factors are standardised inside the pipeline; the reported
    betas and intercept are converted back to the original factor units.

    Args:
        factors_df: Factor series, one column per factor.
        pct_changes: Asset returns, one column per asset id.
        risk_free_df: Frame with a ``daily_nominal_rate`` column.
        training_cutoff: Index value separating train (< cutoff) from test.
        alphas: Regularisation strengths passed to ``ElasticNetCV``.
        l1_ratio: L1/L2 mixing value(s) passed to ``ElasticNetCV``.
        cv: Cross-validation setting passed to ``ElasticNetCV``.
        tol: Optimisation tolerance.
        random_state: Seed passed to ``ElasticNetCV``.

    Returns:
        DataFrame indexed by ``conId`` with model diagnostics, train/test MSE
        and R2, and one ``<factor>_beta`` column per factor. Assets whose fit
        raises ``ValueError`` are reported and skipped; if every asset is
        skipped an empty frame with a ``conId`` index is returned.
    """
    data = (
        factors_df.copy()
        .join(pct_changes, how='inner')
        .join(risk_free_df[['daily_nominal_rate']], how='inner')
        .fillna(0)
    )

    train = data[data.index < training_cutoff]
    test = data[data.index >= training_cutoff]

    X_train = train[factors_df.columns].values
    X_test = test[factors_df.columns].values

    metrics = []
    for etf in tqdm(pct_changes.columns, total=len(pct_changes.columns), desc="Elastic Net Regression"):
        # Dependent variable: asset return in excess of the risk-free rate.
        Y_train = train[etf].values - train['daily_nominal_rate'].values
        Y_test = test[etf].values - test['daily_nominal_rate'].values

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('enet', ElasticNetCV(alphas=alphas,
                                l1_ratio=l1_ratio,
                                cv=cv,
                                random_state=random_state,
                                max_iter=499999,
                                tol=tol,
                                fit_intercept=True,
                                n_jobs=-1)),
        ])

        try:
            pipeline.fit(X_train, Y_train)
        except ValueError as e:
            print(f"Skipping {etf} due to error: {e}")
            continue

        # Unscale coefficients and intercept
        enet = pipeline.named_steps['enet']
        scaler = pipeline.named_steps['scaler']
        betas_train = enet.coef_ / scaler.scale_
        intercept = enet.intercept_ - np.dot(betas_train, scaler.mean_)

        # out-of-sample stats
        er_test = pipeline.predict(X_test)

        # in-sample stats
        er_train = pipeline.predict(X_train)

        # Average the CV error over the folds once. The folds are the last axis
        # of mse_path_ whether l1_ratio is a list (3-D array) or a scalar (2-D
        # array); the previous axis=2 only worked for the list case.
        fold_mean_mse = enet.mse_path_.mean(axis=-1)

        row = {
            'conId': etf,
            'jensens_alpha': intercept,
            'enet_alpha': enet.alpha_,
            'l1_ratio': enet.l1_ratio_,
            'n_iter': enet.n_iter_,
            'dual_gap': enet.dual_gap_,
            'n_nonzero': np.sum(np.abs(betas_train) > 1e-6),
            # 'mse_path_grid': enet.mse_path_,
            'cv_mse_best': np.min(fold_mean_mse),
            'cv_mse_average': np.mean(fold_mean_mse),
            'cv_mse_worst': np.max(fold_mean_mse),
            'mse_test' : mean_squared_error(Y_test, er_test),
            'mse_train' : mean_squared_error(Y_train, er_train),
            'r2_test' : r2_score(Y_test, er_test),
            'r2_train' : r2_score(Y_train, er_train),
        }

        # Map back coefficients to factor names
        for coef, fname in zip(betas_train, factors_df.columns):
            row[f'{fname}_beta'] = coef

        metrics.append(row)

    if not metrics:
        # Every asset was skipped: set_index('conId') would raise KeyError on
        # an empty frame, so return an empty result with the expected index.
        return pd.DataFrame(index=pd.Index([], name='conId'))

    results_df = pd.DataFrame(metrics).set_index('conId')
    return results_df

# Fit the per-asset ElasticNet models on the screened factors.
# The alpha grid (1e-11 .. 1e-4) and the l1_ratio list override the function defaults.
# `training_cutoff` comes from an earlier cell.
results_df = run_elastic_net(
    factors_df=distilled_factors,
    pct_changes=pct_changes,
    risk_free_df=risk_free_df,
    training_cutoff=training_cutoff,
    alphas=np.logspace(-11, -4, 30),
    l1_ratio=[0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 1],
    cv=5,
    tol=5e-4,
)

In [None]:
# from scipy.stats import skew
# from scipy.optimize import minimize_scalar

# def optimize_scalar(series):
#     def obj(s):
#         if s <= 0:
#             return np.inf
#         return skew(np.log1p(s * series))**2

#     result = minimize_scalar(obj, bounds=(1e-5, 1e20), method='bounded')
#     print(result.x)
#     return result.x

# testing = load('data/walk_forward_results.csv')
# testing
# # walk_forward = ['factor_momentum_beta', 'industries_Healthcare_beta', 'continent_America_beta', 'factor_profitability_beta']
# # testing = testing.drop(columns= walk_forward, errors='ignore')
# beta_cols = [col for col in testing if col.endswith('beta')]

# testing['r2_test_adj'] = testing['r2_test'].max() - testing['r2_test']
# testing['r2_test_adj'] = np.log1p(testing['r2_test_adj'] * optimize_scalar(testing['r2_test_adj']))
# testing['r2_test_adj'] = testing['r2_test_adj'].max() - testing['r2_test_adj']

# testing['cv_mse_std'] = testing[['cv_mse_best','cv_mse_average','cv_mse_worst']].std(axis=1)
# testing['cv_mse_std'] = np.log1p(testing['cv_mse_std'] * optimize_scalar(testing['cv_mse_std']))

# testing['screening_score'] = testing['r2_test_adj'] / (1 + testing['cv_mse_std'])

# temp = pd.DataFrame()
# temp['r2_adj'] = testing[beta_cols].abs().multiply(testing['screening_score'], axis=0).mean()
# temp['count'] = (testing[beta_cols].abs() > 1e-6).sum()
# temp['count'] = (temp['count'] - temp['count'].min()) / (temp['count'].max() - temp['count'].min())
# temp['mean'] = temp.mean(axis=1)
# temp

In [None]:
# Factor beta post-screening score 
from scipy.stats import skew
from scipy.optimize import minimize_scalar

def optimize_scalar(series):
    """Find the scale ``s`` for which ``log1p(s * series)`` is least skewed.

    Minimises the squared sample skewness of ``np.log1p(s * series)`` with
    SciPy's bounded scalar minimiser over ``s`` in (1e-5, 1e20), prints the
    optimum and returns it.
    """
    def squared_skew(scale):
        # Non-positive scales are rejected outright.
        return np.inf if scale <= 0 else skew(np.log1p(scale * series)) ** 2

    best_scale = minimize_scalar(squared_skew, bounds=(1e-5, 1e20), method='bounded').x
    print(best_scale)
    return best_scale

# Score each factor by its average |beta| across assets, weighting every asset
# by a per-asset screening score built from r2_test and the CV-MSE spread.
beta_cols = [name for name in results_df if name.endswith('beta')]

# Gap to the best r2_test, compressed with the skew-minimising log1p scale,
# then flipped back so that larger means better.
r2_gap = results_df['r2_test'].max() - results_df['r2_test']
r2_gap = np.log1p(r2_gap * optimize_scalar(r2_gap))
r2_quality = r2_gap.max() - r2_gap

# Spread of the best/average/worst CV MSE columns, same log1p compression.
cv_spread = results_df[['cv_mse_best', 'cv_mse_average', 'cv_mse_worst']].std(axis=1)
cv_spread = np.log1p(cv_spread * optimize_scalar(cv_spread))

asset_score = r2_quality / (1 + cv_spread)

# screening_df ends up as a Series with one value per beta column.
screening_df = results_df[beta_cols].abs().mul(asset_score, axis=0).mean()
# screening_df[screening_df < screening_df.mean() - screening_df.std()]#.sort_values()
screening_df.sort_values()

# 0.01 - 0.05
# .18 - .2

In [None]:
# Lasso count post-screening: per-factor non-zero frequency and mean |beta|,
# computed over the rows of results_df with r2_test > 0.05.
beta_cols = [name for name in results_df if name.endswith('beta')]
good_factor_importance3 = results_df.loc[results_df['r2_test'] > 0.05]

if not good_factor_importance3.empty:
    abs_betas = good_factor_importance3[beta_cols].abs()
    non_zero = abs_betas.gt(1e-6).sum()
    good_factor_importance3 = (
        pd.DataFrame({
            # NOTE(review): the counts come from the filtered rows but the
            # denominator is every row of results_df — confirm this is intended.
            'non_zero_percentage': non_zero / len(results_df),
            'mean_abs_beta': abs_betas.mean(),
        })
        .sort_values('mean_abs_beta', ascending=True)
        .assign(mean=lambda frame: frame.mean(axis=1))
    )
    display(good_factor_importance3.sort_values(by='non_zero_percentage'))

    # 0.6 - 0.64
    # .86

In [None]:
# Lasso count post-screening
# NOTE(review): the computation below is commented out, so this cell only
# re-displays a `good_factor_importance` that must already exist in the kernel.
# It is not assigned in any cell visible here — confirm it is defined earlier in
# the notebook, otherwise this cell raises NameError on Restart & Run All.
beta_cols = [col for col in results_df if col.endswith('beta')]
# good_factor_importance = results_df[results_df['r2_test'] > 0.05]

# if not good_factor_importance.empty:
#     # good_factor_importance = good_factor_importance[beta_cols]
#     non_zero = (good_factor_importance.abs() > 1e-6).sum()
#     good_factor_importance = pd.DataFrame({
#         'non_zero_percentage': non_zero / len(results_df),
#         'mean_abs_beta': good_factor_importance.abs().mean()
#     }).sort_values('mean_abs_beta', ascending=True)

    # good_factor_importance['mean'] = good_factor_importance.mean(axis=1)
# Shows the existing frame ordered by its 'non_zero_percentage' column.
display(good_factor_importance.sort_values(by='non_zero_percentage'))

    # 0.6 - 0.64
    # .86 

# Modern portfolio theory

In [None]:
# Factor-based ER: Jensen's alpha plus the beta-weighted factor premia.
beta_cols = [name for name in results_df.columns if name.endswith('beta')]

# Drop the '_beta' suffix so the columns match the factor names, then order
# them exactly like distilled_factors.
asset_betas = (
    results_df[beta_cols]
    .rename(columns=lambda name: name.replace('_beta', ''))
    .loc[:, distilled_factors.columns]
)

# NOTE(review): positive mean factor returns are zeroed and negative ones are
# sign-flipped, so only factors with a negative historical mean contribute —
# confirm this is the intended premium definition.
factor_premia = -distilled_factors.mean().clip(upper=0)

systematic_returns = asset_betas @ factor_premia
factor_based_er = results_df['jensens_alpha'] + systematic_returns

In [None]:
# Factor-based + model-adjusted ER - mu_utility
def optimize_scalar(series):
    """Return the scale ``s`` minimising the squared skewness of ``log1p(s * series)``.

    The search uses SciPy's bounded scalar minimiser on (1e-5, 1e20); the
    optimum is printed before being returned.

    NOTE(review): this repeats the definition from the "Factor beta
    post-screening score" cell and shadows it once this cell runs; consider
    keeping a single definition.
    """
    def obj(scale):
        if scale > 0:
            return skew(np.log1p(scale * series)) ** 2
        return np.inf

    solution = minimize_scalar(obj, method='bounded', bounds=(1e-5, 1e20))
    print(solution.x)
    return solution.x

# Start from the factor-based ER, shifted so that its minimum is zero.
screening_df = pd.DataFrame({'expected_return': factor_based_er}, index=results_df.index)
screening_df['expected_return'] = (
    screening_df['expected_return'] - screening_df['expected_return'].min()
)

# r2_test weight: gap to the best value, log1p-compressed with the
# skew-minimising scale, then flipped so that larger means better.
r2_gap = results_df['r2_test'].max() - results_df['r2_test']
r2_gap = np.log1p(r2_gap * optimize_scalar(r2_gap))
screening_df['r2_test'] = r2_gap.max() - r2_gap

# CV stability weight: spread of best/average/worst CV MSE, compressed the same
# way and flipped so that a smaller spread scores higher.
cv_spread = results_df[['cv_mse_best', 'cv_mse_average', 'cv_mse_worst']].std(axis=1)
cv_spread = np.log1p(cv_spread * optimize_scalar(cv_spread))
screening_df['cv_mse_std'] = cv_spread.max() - cv_spread

# Rescale both weights with MinMaxScaler, then use them as multipliers on the ER.
scaler = MinMaxScaler()
weight_cols = ['r2_test', 'cv_mse_std']
screening_df[weight_cols] = scaler.fit_transform(screening_df[weight_cols])
screening_df['r2_adjusted_er'] = (
    screening_df['expected_return'] * screening_df['r2_test'] * screening_df['cv_mse_std']
)

mu_historical = pct_changes.mean()
screening_df['historical_er'] = mu_historical
mu_utility = screening_df['r2_adjusted_er']

screening_df.corr().sort_values(by='historical_er')

In [None]:
# COV - S: factor-model covariance, S = B @ Sigma_f @ B.T + D
factor_cov_matrix = distilled_factors.cov()

# The training MSE of each asset's model is used as its idiosyncratic variance.
idiosyncratic_variances = results_df['mse_train']
D = np.diag(idiosyncratic_variances)

betas = asset_betas.to_numpy()
systematic_cov = betas @ factor_cov_matrix.to_numpy() @ betas.T
S = pd.DataFrame(systematic_cov + D, index=results_df.index, columns=results_df.index)

# Risk-free rate: mean of the last 10 rows of 'daily_nominal_rate'.
rf_rate = risk_free_df['daily_nominal_rate'].tail(10).mean()

In [None]:
# Portfolio Mean-Variance + Factor Exposure Balance Optimization
import cvxpy as cp
from pypfopt import base_optimizer

def portfolio_factor_dispersion(w, asset_betas):
    """Return the L2 norm of the portfolio's factor betas after removing their mean.

    ``w @ asset_betas`` gives one portfolio beta per column of ``asset_betas``;
    the average of those betas is subtracted before taking the 2-norm, so the
    value is zero when all portfolio betas are equal.
    """
    exposures = w @ asset_betas
    mean_exposure = cp.sum(exposures) / asset_betas.shape[1]
    return cp.norm(exposures - mean_exposure, 2)

def optimize_with_factor_balance(mu, S, asset_betas, cuttoff_threshold, upper_bounds, solver='CLARABEL'):
    """Solve for long-only weights trading off return, risk and factor balance.

    Maximises ``mu @ w - quad_form(w, S) - portfolio_factor_dispersion(w, betas)``
    subject to ``sum(w) == 1`` and ``0 <= w <= upper_bounds``.

    Returns:
        A Series indexed like ``mu`` in which weights with absolute value below
        ``cuttoff_threshold`` are set to 0 and the rest renormalised to sum to 1.
        If every weight is cut, the renormalisation divides 0 by 0 and the
        Series is all NaN. ``None`` (after printing a warning) when the solver
        status is neither "optimal" nor "optimal_inaccurate".
    """
    w = cp.Variable(len(mu))

    utility = (
        mu.values @ w
        - cp.quad_form(w, S)
        - portfolio_factor_dispersion(w, asset_betas.values)
    )
    problem = cp.Problem(
        cp.Maximize(utility),
        [cp.sum(w) == 1, w >= 0, w <= upper_bounds],
    )
    problem.solve(solver=solver)

    if problem.status not in ("optimal", "optimal_inaccurate"):
        print(f"Warning: Optimal solution not found. Status: {problem.status}")
        return None

    raw_weights = pd.Series(w.value, index=mu.index)
    # Zero out dust positions, then rescale what is left to be fully invested.
    kept = raw_weights.mask(raw_weights.abs() < cuttoff_threshold, 0)
    return kept / kept.sum()


# Settings: weights below the cut-off are zeroed after the solve; the upper
# bound caps each weight inside the optimisation.
cuttoff_threshold = .05
upper_bounds = 1

weights = optimize_with_factor_balance(
    mu=mu_utility, S=S, asset_betas=asset_betas,
    cuttoff_threshold=cuttoff_threshold, upper_bounds=upper_bounds,
)

# Print the held positions, largest first, labelled by symbol.
df = weights[weights > 0].sort_values(ascending=False)
for con_id, weight in df.items():
    symbol = filtered_df.loc[filtered_df['conId'] == con_id, 'symbol'].iloc[0]
    print(f'{symbol}: {round(weight*100, 2)}%')


rf_rate = risk_free_df['daily_nominal_rate'].tail(10).mean()
expected_return, volatility, sharpe_ratio = base_optimizer.portfolio_performance(
    weights, mu_utility, S, risk_free_rate=rf_rate
)
final_portfolio_betas = weights @ asset_betas

print(f'num_etfs: {len(df)}')
print(f'beta_std: {final_portfolio_betas.std()}')
print(f'volatility: {volatility}')
print(f'sharpe: {sharpe_ratio}')

### Bayesian optimization of hyperparams

In [None]:
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args


def progress_callback(res):
    """Advance the global ``pbar`` by one step; ``res`` is not used."""
    pbar.update()

# Search space: the post-solve weight cut-off and the per-asset weight cap.
space = [
    Real(low=0, high=.05, prior='uniform', name='cuttoff_threshold'),
    Real(low=.2, high=1, prior='uniform', name='upper_bounds'),
]
@use_named_args(space)
def objective(**params):
    """Objective for gp_minimize: negative model Sharpe ratio per sqrt(#assets).

    Solves the portfolio for the given ``cuttoff_threshold`` / ``upper_bounds``
    and appends a record of the evaluation to the global ``evaluated_results``.

    Returns:
        ``-score`` (the optimiser minimises) for a usable portfolio, or a
        positive penalty ``100 / upper_bounds - cuttoff_threshold`` when no
        usable portfolio was produced; penalised runs are not recorded.
    """
    cutoff = params['cuttoff_threshold']
    cap = params['upper_bounds']

    weights = optimize_with_factor_balance(
        mu=mu_utility,
        S=S,
        asset_betas=asset_betas,
        cuttoff_threshold=cutoff,
        upper_bounds=cap,
    )

    # optimize_with_factor_balance returns None when the solver fails and a NaN
    # Series when every weight fell below the cut-off (0/0 renormalisation).
    # Both cases must be penalised; calling .isna() on None would raise.
    if weights is None or weights.isna().any():
        score = 100 / cap - cutoff
        # Double-quoted f-string: reusing the outer quote inside the braces is
        # a SyntaxError before Python 3.12.
        print(f"Empty: {round(cutoff, 3)} - {round(cap, 3)}  Score: {score}")
        return score

    num_assets = len(weights[weights > 0])
    final_portfolio_betas = weights @ asset_betas

    er_model, std_model, sharpe_model = base_optimizer.portfolio_performance(weights, mu_utility, S, risk_free_rate=rf_rate)
    er_hist, _, sharpe_hist = base_optimizer.portfolio_performance(weights, mu_historical, S, risk_free_rate=rf_rate)
    # Dividing by sqrt(num_assets) lowers the score as more assets are held.
    score = (sharpe_model) / np.sqrt(num_assets)

    evaluated_results.append({
        'cuttoff_threshold': cutoff,
        'upper_bounds': cap,
        'score': score,
        'sharpe_model': sharpe_model,
        'sharpe_hist': sharpe_hist,
        'er_model': er_model,
        'er_hist': er_hist,
        'volatility': std_model,
        'factor_beta_std': final_portfolio_betas.std(),
        'num_assets': num_assets,
        'weights': weights[weights > 0],
    })

    return -score


# Fresh log for this search; `objective` appends one record per usable evaluation.
evaluated_results = []

n_initial_points = 2**4
# Temporary: only the initial points are evaluated (the full budget was
# n_calls = 32 with n_initial_points = n_calls // 2).
n_calls = n_initial_points # Delete later

# Close a progress bar left over from a previous run of this cell.
if 'pbar' in globals():
    pbar.close()
pbar = tqdm(total=n_calls)

result = gp_minimize(
    objective,
    space,
    n_calls=n_calls,
    n_initial_points=n_initial_points,
    # initial_point_generator='lhs',
    initial_point_generator='sobol',
    random_state=42,
    callback=[progress_callback],
)

In [None]:
# Plot parameter exploration 2d
import matplotlib.pyplot as plt

# One row per recorded evaluation of the hyper-parameter search.
eval_df = pd.DataFrame(evaluated_results)
# eval_df['score'] = (eval_df['sharpe_model'] - eval_df['factor_beta_std']) / np.sqrt(eval_df['num_assets'])

x = 'cuttoff_threshold'
y = 'score'

fig, ax = plt.subplots()
points = ax.scatter(eval_df[x], eval_df[y], c=eval_df['sharpe_model'], cmap='viridis')
# ax.scatter(optimal_dict[x], optimal_dict[y], color='red', marker='D')
ax.set(xlabel=x, ylabel=y)
# The colour scale is the only legend this plot needs. The previous bare
# plt.legend() had no labelled artists, so it only emitted a warning.
fig.colorbar(points, ax=ax, label='sharpe_model')
plt.show()

In [None]:
# Plot 3D
import plotly.express as px

def to_text(df):
    """Format ``(conId, weight)`` pairs as '<symbol>: <pct>%' joined by '<br>'.

    ``df`` is iterated with ``.items()``; each key is looked up in the global
    ``filtered_df`` by 'conId' and the first matching 'symbol' is used.
    """
    def symbol_of(con_id):
        return filtered_df.loc[filtered_df['conId'] == con_id, 'symbol'].iloc[0]

    return '<br>'.join(
        f'{symbol_of(k)}: {round(v*100, 2)}%' for k, v in df.items()
    )

eval_df = pd.DataFrame(evaluated_results)
# Replace each weights Series with hover text, largest position first.
eval_df['weights'] = [
    to_text(held.sort_values(ascending=False)) for held in eval_df['weights']
]

hover_columns = [
    'score',
    'num_assets',
    'sharpe_model',
    'factor_beta_std',
    'cuttoff_threshold',
    'weights',
]

fig = px.scatter_3d(
    eval_df,
    x='upper_bounds',
    y='cuttoff_threshold',
    z='score',
    color='sharpe_model',
    size='num_assets',
    color_continuous_scale=px.colors.sequential.Viridis,
    hover_data=dict.fromkeys(hover_columns, True),
)
fig.update_layout(height=800)
fig.show()

In [None]:
# Correlation heatmap of the recorded search metrics (the 'weights' column is
# dropped before computing correlations).
eval_df = pd.DataFrame(evaluated_results)
eval_corr = eval_df.drop(columns='weights').corr()

plt.figure(figsize=(8, 6))
sns.heatmap(data=eval_corr, annot=True, cmap='coolwarm', center=0)
plt.show()

In [None]:
# Convert continuous weights into whole-share quantities for a 25,000 portfolio.
# TODO: this cell is still a template and is not runnable as written.
from pypfopt import DiscreteAllocation
# TODO: `...` is the Ellipsis placeholder; replace it with real prices.
latest_prices = ... # Get the most recent prices for your ETFs
# NOTE(review): `cleaned_weights` is not assigned in any cell visible here —
# presumably the optimiser's `weights` is meant; confirm before running.
da = DiscreteAllocation(cleaned_weights, latest_prices, total_portfolio_value=25000)
allocation, leftover = da.lp_portfolio()
print("Discrete allocation:", allocation)
print(f"Funds remaining: ${leftover:.2f}")