In [None]:
import os
import pandas as pd
import numpy as np
import re
import ast
from tqdm.auto import tqdm
from datetime import datetime, timedelta

from ib_async import *
import pandas_datareader.data as web
import wbgapi as wb
import country_converter as coco

from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

import gc
import argparse

from scipy.interpolate import interp1d

from scipy.stats import linregress
from scipy.stats import skew
from scipy.optimize import minimize_scalar

from pathlib import Path
# One-time working-directory switch: MAX_STALE_DAYS is assigned in the constants
# cell below, so it is missing from globals() only until that cell has run in
# this kernel. Re-running this cell afterwards therefore does not climb another
# directory level.
if 'MAX_STALE_DAYS' not in globals():
    os.chdir(Path.cwd().parents[0]) # Make the parent of the current directory the working directory

In [None]:
# CONSTANTS
# Default `correlation_threshold` of prescreen_factors: factor pairs whose absolute
# correlation reaches this value get one member dropped.
CORRELATION_THRESHOLD = .85

# Data Cleaning
# Default `max_stale_days` of handle_stale_periods: runs of unchanged price longer
# than this lose their interior rows.
MAX_STALE_DAYS = 5
# Default params for detect_and_nullify_global_outliers
Z_THRESHOLD_GLOBAL_DEFAULT = 100.0 # earlier value tried: 120
OUTLIER_WINDOW_DEFAULT = 5
# Threshold actually passed to detect_and_nullify_global_outliers by the price-loading cell
Z_THRESHOLD_GLOBAL_LOOP = 50

# Walk-Forward Analysis
# range(3, 5) yields 3 and 4; the consumer is outside the visible part of the notebook.
WALK_FORWARD_WINDOW_YEARS = range(3, 5)
TRAINING_PERIOD_DAYS = 365
# TRAINING_PERIOD_DAYS = 200
# Look-back lengths in days, derived from TRAINING_PERIOD_DAYS by integer division.
# The keys match the ones get_return_stats reads from `momentum_cutoffs`
# (presumably these are turned into cutoff dates elsewhere - TODO confirm).
MOMENTUM_PERIODS_DAYS = {
    '1y':  TRAINING_PERIOD_DAYS,
    '6mo': TRAINING_PERIOD_DAYS // 2,
    '3mo': TRAINING_PERIOD_DAYS // 4,
}

# Asset Filtering
# Not referenced in the visible part of the notebook; names suggest a maximum
# log-scale gap and a maximum missing-data fraction - TODO confirm at point of use.
MAX_GAP_LOG = 3.05
MAX_PCT_MISSING = 0.3

# Factor Construction
# Default `scaling_factor` of construct_factors: number of standard deviations
# around the mean that delimit the high/low buckets of a metadata column.
FACTOR_SCALING_FACTOR = 0.6

# Elastic Net Hyperparameters
# Defaults of run_elastic_net, forwarded to ElasticNetCV as alphas / l1_ratio / cv / tol.
ENET_ALPHAS = np.logspace(-11, -4, 30)
ENET_L1_RATIOS = [0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 1]
ENET_CV = 5
ENET_TOL = 5e-4

# Optimization almost 0
# Not referenced in the visible part of the notebook.
ZERO = 1e-12

In [None]:
def fetch_world_bank_data(all_country_codes, start_date, end_date, indicators):
    """Download World Bank indicator data for the recognised economies.

    Args:
        all_country_codes: iterable of economy codes; ``None`` entries are ignored.
        start_date, end_date: objects exposing ``.year``. The requested time
            range runs from ``start_date.year - 5`` through ``end_date.year``.
        indicators: iterable of indicator codes, passed to ``wb.data.DataFrame``
            as a list.

    Returns:
        The concatenation (``pd.concat``) of the frames returned for each batch
        of at most 40 economies.

    Raises:
        Exception: if the economy list cannot be fetched, if none of the codes
            is known to the API, or if every batch failed.

    A batch that fails is reported with ``print`` and skipped, so the result
    may silently lack some economies.
    """
    requested = {code for code in all_country_codes if code is not None}
    try:
        known_economies = {economy['id'] for economy in wb.economy.list()}
    except Exception as e:
        raise Exception(f"FATAL: Failed to fetch economy list from World Bank API: {e}")

    final_economies = sorted(requested & known_economies)
    unrecognized = requested - set(final_economies)
    if unrecognized:
        print(f"Info: The following economies were not recognized by the World Bank API and will be skipped: {unrecognized}")
    if not final_economies:
        raise Exception("Error: No valid economies found to query the World Bank API.")

    chunk_size = 40
    frames = []
    offsets = range(0, len(final_economies), chunk_size)
    for chunk_no, offset in enumerate(offsets, start=1):
        batch = final_economies[offset:offset + chunk_size]
        try:
            # The year range is built inside the try on purpose: any failure
            # here is reported per batch, exactly like an API failure.
            frames.append(
                wb.data.DataFrame(
                    list(indicators),
                    batch,
                    time=range(start_date.year - 5, end_date.year + 1),
                    labels=False,
                )
            )
        except wb.APIError as e:
            print(f"API Error fetching data for chunk {chunk_no}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred fetching data for chunk {chunk_no}: {e}")

    if not frames:
        raise Exception("Error: Failed to retrieve any data from the World Bank.")

    return pd.concat(frames)


def evaluate_literal(val):
    """Parse ``val`` as a Python literal, falling back to ``val`` itself.

    Args:
        val: a cell value, typically a string read from a CSV.

    Returns:
        The evaluated literal (list, dict, number, ...) when ``val`` is a string
        holding one; otherwise ``val`` unchanged.

    Non-string values are returned immediately: ``ast.literal_eval`` would only
    reject them with ``ValueError``, and raising/catching that for every numeric
    cell is wasted work.
    """
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input,
        # e.g. TypeError for "{[1]: 2}" (unhashable dict key). Such text is
        # ordinary data, not a literal, so keep it as-is instead of aborting.
        return val
    
def load(path):
    """Read the CSV at ``path`` and pass every cell through ``evaluate_literal``.

    Returns the resulting DataFrame; cells holding Python-literal text come
    back as the corresponding Python objects.
    """
    frame = pd.read_csv(path)
    for column_name in frame.columns:
        parsed_column = frame[column_name].apply(evaluate_literal)
        frame[column_name] = parsed_column
    return frame

def ensure_series_types(df, price_col):
    """Return ``df`` sorted by date with numeric volume and price columns.

    The ``date`` column of the frame passed in is converted to datetimes in
    place; the returned frame is a new, date-sorted copy with a fresh 0..n-1
    index. Values in ``volume`` and ``price_col`` that cannot be parsed as
    numbers become NaN.
    """
    df['date'] = pd.to_datetime(df['date'])
    ordered = df.sort_values('date').reset_index(drop=True)
    numeric_columns = ('volume', price_col)
    for name in numeric_columns:
        ordered[name] = pd.to_numeric(ordered[name], errors='coerce')
    return ordered

def validate_raw_prices(df, price_col):
    """Return a copy of ``df`` without rows that carry impossible prices.

    A row is removed when its ``price_col`` value is zero or negative, or -
    only if both ``low`` and ``high`` columns exist - when ``low`` exceeds
    ``high``. Rows with a NaN price are kept, because the comparison is False
    for NaN.
    """
    non_positive = df[price_col] <= 0
    if 'low' in df.columns and 'high' in df.columns:
        crossed_range = df['low'] > df['high']
    else:
        crossed_range = pd.Series(False, index=df.index)
    keep_mask = ~(non_positive | crossed_range)
    return df[keep_mask].copy()

def handle_stale_periods(df, price_col, max_stale_days=MAX_STALE_DAYS):
    """Thin out long runs of an unchanged price.

    Consecutive rows whose ``price_col`` does not change form a run. For runs
    longer than ``max_stale_days`` rows, every row except the first and the
    last of the run is removed. Returns a filtered copy, or ``df`` itself when
    it has no rows.
    """
    # A new run starts wherever the price differs from the previous row
    # (the first row and NaN differences also count as "different").
    price_moved = df[price_col].diff() != 0
    run_id = price_moved.cumsum()
    if run_id.empty:
        return df

    run_length = df.groupby(run_id)[price_col].transform('size')
    not_first_in_run = run_id.duplicated(keep='first')
    not_last_in_run = run_id.duplicated(keep='last')
    drop_mask = (run_length > max_stale_days) & (not_first_in_run & not_last_in_run)
    return df[~drop_mask].copy()

def detect_and_nullify_global_outliers(meta_df, price_col, z_threshold=Z_THRESHOLD_GLOBAL_DEFAULT, window=OUTLIER_WINDOW_DEFAULT):
    """Blank out prices that sit behind globally extreme returns.

    Each row of ``meta_df`` holds a price frame in its ``'df'`` cell with at
    least ``price_col`` and ``'pct_change'`` columns. Returns from all frames
    are pooled to obtain a global median and median absolute deviation (MAD).
    A return is a candidate when ``|return - median| / MAD`` exceeds
    ``z_threshold``. For every candidate, the price just before it and the
    price at it are each compared with nearby prices, and set to NaN (together
    with volume/high/low/pct_change where present) when their local z-score
    exceeds ``z_threshold / 10``.

    Args:
        meta_df: DataFrame with a ``'df'`` column of per-asset frames.
        price_col: name of the price column inside each per-asset frame.
        z_threshold: cut-off for the global modified z-score; a tenth of it is
            used for the local price check.
        window: number of neighbouring rows considered in the local checks.

    Returns:
        None. ``meta_df`` is modified in place: frames in which at least one
        candidate was found are replaced by an index-reset copy with
        ``pct_change`` recomputed.
    """
    # Pool every asset's returns; NaN, infinite and exactly-zero returns are
    # left out of the global statistics.
    all_pct_changes = pd.concat(
        [row['df']['pct_change'] for _, row in meta_df.iterrows()],
        ignore_index=True
    ).dropna()
    all_pct_changes = all_pct_changes[~np.isinf(all_pct_changes) & (all_pct_changes != 0)]

    global_median_return = all_pct_changes.median()
    # NOTE(review): if this MAD is 0 the division below yields inf/NaN scores - confirm that cannot happen.
    global_mad = (all_pct_changes - global_median_return).abs().median()

    for idx, row in meta_df.iterrows():
        df = row['df']
        # Positional labels 0..n-1 so that "df_idx - 1" below means "previous row".
        df = df.reset_index(drop=True)
        if df['pct_change'].isnull().all():
            continue
        cols_to_null = [price_col, 'volume', 'high', 'low', 'pct_change']
        cols_to_null = [c for c in cols_to_null if c in df.columns]

        absolute_modified_z = (df['pct_change'] - global_median_return).abs() / global_mad
        outlier_mask = absolute_modified_z > z_threshold

        if outlier_mask.any():

            candidate_indices = df.index[outlier_mask]
            for df_idx in candidate_indices:
                # Backward check: is the price just before the jump out of line
                # with the (up to `window`) prices that precede it?
                price_to_check_idx = df_idx - 1
                price_to_check = df.loc[price_to_check_idx, price_col]
                local_window_start = max(0, price_to_check_idx - window)
                local_window = df.loc[local_window_start : price_to_check_idx - 1, price_col].dropna()
                local_mean = local_window.mean()
                local_std = local_window.std()
                # A NaN std (fewer than two points) passes this test, but the
                # resulting NaN z-score never exceeds the threshold.
                if local_std != 0: 
                    price_z_score = abs(price_to_check - local_mean) / local_std
                    if price_z_score > z_threshold / 10:
                        df.loc[price_to_check_idx, cols_to_null] = np.nan

                # Forward check: is the price at the jump out of line with the
                # prices that follow it?
                price_to_check = df.loc[df_idx, price_col]
                # NOTE(review): the window end is capped at the last *candidate*
                # index, not the last row of df, so the final candidate always
                # gets an empty forward window. Confirm whether df.index.max()
                # was intended.
                local_window_end = min(df_idx + window, df.index[outlier_mask].max())
                local_window = df.loc[df_idx + 1: local_window_end, price_col].dropna()
                local_mean = local_window.mean()
                local_std = local_window.std()
                if local_std != 0:
                    price_z_score = abs(price_to_check - local_mean) / local_std
                    if price_z_score > z_threshold / 10:
                        df.loc[df_idx, cols_to_null] = np.nan

            # Returns are recomputed without forward-filling, so nulled prices
            # leave NaN returns around them.
            df['pct_change'] = df[price_col].pct_change(fill_method=None)
            meta_df.at[idx, 'df'] = df

def calculate_slope(value1, value2, time1, time2):
    return (value1 - value2) / (time1 - time2)

def get_return_stats(df, training_cutoff, momentum_cutoffs, risk_free_df):
    """Momentum and cumulative-return statistics over three look-back windows.

    Args:
        df: frame with a ``'pct_change'`` column; its index is compared with
            the cutoffs.
        training_cutoff: rows with index at or beyond this value are ignored.
        momentum_cutoffs: mapping with keys ``'3mo'``, ``'6mo'`` and ``'1y'``
            giving the first index value included in each window.
        risk_free_df: accepted for interface compatibility; not used (the
            Sharpe-ratio statistic that needed it is disabled).

    Returns:
        Series indexed ``momentum_3mo, momentum_6mo, momentum_1y, rs_3mo,
        rs_6mo, rs_1y``: ``momentum_*`` is the mean return in the window and
        ``rs_*`` the compounded return ``prod(1 + r) - 1``.
    """
    history = df[df.index < training_cutoff]
    horizons = ('3mo', '6mo', '1y')

    window_returns = {
        horizon: history[history.index >= momentum_cutoffs[horizon]]['pct_change']
        for horizon in horizons
    }

    stats = {}
    for horizon in horizons:
        stats[f'momentum_{horizon}'] = window_returns[horizon].mean()
    for horizon in horizons:
        stats[f'rs_{horizon}'] = (1 + window_returns[horizon]).prod() - 1

    return pd.Series(list(stats.values()), index=list(stats.keys()))

def create_continent_map(standard_names):
    """Map each name in ``standard_names`` to its continent, or ``'Other'``.

    Uses the module-level converter ``cc`` (defined outside this block) with
    ``not_found=None``; names for which it returns ``None`` are mapped to
    ``'Other'``.
    """
    continents = cc.convert(names=standard_names, to='continent', not_found=None)
    continent_by_name = {}
    for name, continent in zip(standard_names, continents):
        continent_by_name[name] = 'Other' if continent is None else continent
    return continent_by_name

def calculate_country_stats(world_bank_data_full, standard_names, end_year, window_size=3):
    """Latest level and smoothed relative change of each indicator per economy.

    Args:
        world_bank_data_full: frame whose index has the levels ``'economy'``
            and ``'series'`` and whose column labels convert to ``int`` years.
        standard_names: mapping whose *values* are economy codes to look up.
        end_year: object exposing ``.year``; later year columns are ignored.
        window_size: number of year-over-year differences averaged.

    Returns:
        A frame with one row per economy. For every series it holds
        ``raw_value`` (value in the latest retained year) and ``1st_div``
        (rolling mean of the yearly differences in that year, divided by
        ``raw_value``). Columns are a (series, statistic) MultiIndex when the
        reshaped frame has one. An empty frame is returned when none of the
        economies is present.

    Only year columns that are complete (no NaN) for all selected rows are used.
    """
    countries_in_window = [
        name
        for name in standard_names.values()
        if name in world_bank_data_full.index.get_level_values('economy')
    ]
    if not countries_in_window:
        return pd.DataFrame()

    complete = world_bank_data_full.loc[countries_in_window].dropna(axis=1)
    year_of = [int(col) for col in complete.columns]
    retained_cols = [col for col, year in zip(complete.columns, year_of) if year <= end_year.year]
    levels = complete[retained_cols].copy()
    levels = levels.dropna(axis=1)

    # Rolling works along rows, hence the transpose round-trip.
    yearly_delta = levels.diff(axis=1)
    smoothed_delta = yearly_delta.T.rolling(window=window_size).mean().T
    # (A second-order variant, '2nd_div', used to be derived here and is disabled.)

    newest_level_col = levels.columns[-1]
    newest_delta_col = smoothed_delta.columns[-1]

    derivatives = pd.DataFrame(levels[newest_level_col]).rename(columns={newest_level_col: 'raw_value'})
    derivatives['1st_div'] = smoothed_delta[newest_delta_col] / derivatives['raw_value']

    wide = derivatives.unstack(level='series')
    if not isinstance(wide.columns, pd.MultiIndex):
        return wide

    by_series = wide.swaplevel(0, 1, axis=1)
    return by_series.sort_index(axis=1, level=0)

def construct_factor_series(meta_df, returns_df, long_symbols, short_symbols=None, factor_column=None):
    """Return series of a weighted long (optionally long-minus-short) portfolio.

    Args:
        meta_df: frame with ``'conId'`` and ``'profile_cap_usd'`` columns, plus
            ``factor_column`` when one is given.
        returns_df: returns with one column per conId.
        long_symbols: conIds forming the long leg.
        short_symbols: conIds forming the short leg; when empty or ``None`` only
            the long leg is returned.
        factor_column: optional column used to tilt the cap weights.

    Returns:
        ``returns_df.dot(long_weights)``, minus ``returns_df.dot(short_weights)``
        when a short leg is given.

    Weights start from ``profile_cap_usd`` (0 for columns outside the leg) and
    are normalised to sum to 1 unless they sum to 0. With ``factor_column`` the
    long leg is tilted by ``(max - value) / (max - min)`` and the short leg by
    ``(value - min) / (max - min)``, with min/max taken over all of ``meta_df``;
    the tilt is skipped when it sums to 0.

    NOTE(review): the long tilt gives the largest weight to the *lowest* factor
    values and the short tilt to the *highest* - confirm this direction is
    intended for callers that pass high-factor symbols as the long leg.
    """
    def _leg_weights(symbols, favour_low_values):
        leg = meta_df[meta_df['conId'].isin(symbols)].set_index('conId')
        weights = leg['profile_cap_usd'].reindex(returns_df.columns).fillna(0)
        if factor_column:
            column_max = meta_df[factor_column].max()
            column_min = meta_df[factor_column].min()
            if favour_low_values:
                tilt = (column_max - leg[factor_column]) / (column_max - column_min)
            else:
                tilt = (leg[factor_column] - column_min) / (column_max - column_min)
            tilt = tilt.reindex(returns_df.columns).fillna(0)
            if tilt.sum() != 0:
                weights *= tilt

        if weights.sum() != 0:
            weights /= weights.sum()
        return weights

    long_returns = returns_df.dot(_leg_weights(long_symbols, favour_low_values=True))
    if not short_symbols:
        return long_returns

    short_returns = returns_df.dot(_leg_weights(short_symbols, favour_low_values=False))
    return long_returns - short_returns

def construct_factors(filtered_df, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=FACTOR_SCALING_FACTOR):
    """Build the factor return series used as regressors.

    Args:
        filtered_df: asset metadata with ``conId``, ``marketcap_small``,
            ``marketcap_large``, ``style_*`` and further numeric columns.
        pct_changes: returns with one column per conId.
        portfolio_dfs: mapping whose ``'equity'`` entry has a ``'pct_change'``
            column.
        risk_free_df: frame with a ``'daily_nominal_rate'`` column.
        scaling_factor: number of standard deviations around a column's mean
            that delimit its high and low buckets.

    Returns:
        DataFrame with the columns ``factor_market_premium``, ``factor_smb``,
        ``factor_hml`` and one column per eligible numeric metadata column.

    Raises:
        Any exception from building a metadata factor is re-raised after the
        offending column name and the error have been printed.
    """
    def _symbols_where(mask):
        return filtered_df[mask]['conId'].tolist()

    def _drop_shared(first, second):
        # Symbols flagged on both sides belong to neither leg.
        shared = set(first) & set(second)
        return (
            [s for s in first if s not in shared],
            [s for s in second if s not in shared],
        )

    factors = {}
    # Market risk premium
    factors['factor_market_premium'] = (
        portfolio_dfs['equity']['pct_change'] - risk_free_df['daily_nominal_rate']
    )

    # SMB: small caps long, large caps short
    small_symbols, large_symbols = _drop_shared(
        _symbols_where(filtered_df['marketcap_small'] == 1),
        _symbols_where(filtered_df['marketcap_large'] == 1),
    )
    factors['factor_smb'] = construct_factor_series(
        filtered_df, pct_changes, small_symbols, short_symbols=large_symbols
    )

    # HML: any non-zero value style long, any non-zero growth style short
    value_cols = [col for col in filtered_df.columns if col.startswith('style_') and col.endswith('value')]
    growth_cols = [col for col in filtered_df.columns if col.startswith('style_') and col.endswith('growth')]
    value_symbols, growth_symbols = _drop_shared(
        _symbols_where(filtered_df[value_cols].ne(0).any(axis=1)),
        _symbols_where(filtered_df[growth_cols].ne(0).any(axis=1)),
    )
    factors['factor_hml'] = construct_factor_series(
        filtered_df, pct_changes, value_symbols, short_symbols=growth_symbols
    )

    # Metadata-driven factors
    excluded = ('style_', 'marketcap_', 'countries_', 'fundamentals_', 'momentum_', 'industries_')
    only_long = ('industries_', 'holding_types_')
    numerical_cols = [
        col for col in filtered_df.columns
        if filtered_df[col].dtype in [np.int64, np.float64] and col not in ['conId']
    ]
    for col in numerical_cols:
        if col.startswith(excluded):
            continue
        try:
            values = filtered_df[col]
            std = values.std()
            mean = values.mean()

            # Bucket boundaries are clipped to the observed range.
            upper_boundary = min(values.max(), mean + (scaling_factor * std))
            lower_boundary = max(values.min(), mean - (scaling_factor * std))

            high_factor_symbols = _symbols_where(filtered_df[col] >= upper_boundary)
            low_factor_symbols = _symbols_where(filtered_df[col] <= lower_boundary)

            if col.endswith('variety'):
                # Legs are swapped for these columns: low values long, high values short.
                long_leg, short_leg = low_factor_symbols, high_factor_symbols
            elif col.startswith(only_long):
                long_leg, short_leg = high_factor_symbols, None
            else:
                long_leg, short_leg = high_factor_symbols, low_factor_symbols

            if short_leg is None:
                factors[col] = construct_factor_series(filtered_df, pct_changes, long_leg, factor_column=col)
            else:
                factors[col] = construct_factor_series(
                    filtered_df, pct_changes, long_leg, short_symbols=short_leg, factor_column=col
                )
        except Exception as e:
            print(col)
            print(e)
            raise

    return pd.DataFrame(factors)

def prescreen_factors(factors_df, correlation_threshold=CORRELATION_THRESHOLD, drop_map=None):
    """Drop one factor out of every highly correlated pair.

    Args:
        factors_df: non-empty frame with one column per factor.
        correlation_threshold: pairs whose absolute correlation is at least
            this value are treated as redundant.
        drop_map: optional ``{keeper: [dropped, ...]}`` mapping from an earlier
            call. A non-empty mapping is extended in place; an empty one or
            ``None`` is replaced by a fresh dict.

    Returns:
        ``(reduced_df, drop_map)``: a copy of ``factors_df`` without every
        column listed in ``drop_map``, and the mapping itself.

    Raises:
        ValueError: if ``factors_df`` is ``None``, empty or has no columns.

    Pairs are visited from the strongest correlation downwards. In each pair
    the column that appears first in ``factors_df`` is kept; pairs involving an
    already dropped column are ignored.
    """
    if factors_df is None or factors_df.empty or factors_df.shape[1] == 0:
        raise ValueError("factors_df must be a non-empty DataFrame with at least one column.")
    working = factors_df.copy()

    abs_corr = working.corr().abs()
    strictly_upper = np.triu(np.ones_like(abs_corr, dtype=bool), k=1)
    ranked_pairs = abs_corr.where(strictly_upper).stack().sort_values(ascending=False)

    if not drop_map:
        drop_map = {}

    # Position of the first occurrence of every column name.
    position = {}
    for pos, name in enumerate(working.columns):
        position.setdefault(name, pos)

    dropped = {col for drops in drop_map.values() for col in drops}
    for (left, right), strength in ranked_pairs.items():
        if strength < correlation_threshold:
            break
        if left in dropped or right in dropped:
            continue

        if position[left] < position[right]:
            keeper, loser = left, right
        else:
            keeper, loser = right, left

        drop_map.setdefault(keeper, []).append(loser)
        dropped.add(loser)

    return working.drop(columns=dropped), drop_map

def merge_drop_map(drop_map):
    """Collapse chained drops onto the keepers that survive.

    ``drop_map`` maps a kept column to the columns dropped in its favour. A
    keeper that was itself dropped by another keeper is removed from the
    result, and everything it had dropped is attributed - transitively - to
    the surviving keeper.

    Returns:
        ``{surviving_keeper: sorted list of all directly or indirectly dropped
        columns}``.
    """
    dropped_anywhere = {col for drops in drop_map.values() for col in drops}

    merged = {}
    for keeper, direct_drops in drop_map.items():
        if keeper in dropped_anywhere:
            continue

        reachable = set(direct_drops)
        frontier = list(direct_drops)
        while frontier:
            current = frontier.pop()
            for successor in drop_map.get(current, ()):
                if successor not in reachable:
                    reachable.add(successor)
                    frontier.append(successor)

        merged[keeper] = sorted(reachable)

    return merged

def run_regressions(distilled_factors, pct_changes=None, risk_free_df=None):
    """Regress every asset's excess return on the factors with OLS.

    Args:
        distilled_factors: frame with one column per factor.
        pct_changes: asset returns, one column per conId. When omitted, the
            notebook-level variable of the same name is used.
        risk_free_df: frame with a ``'daily_nominal_rate'`` column. When
            omitted, the notebook-level variable of the same name is used.

    Returns:
        DataFrame with one row per asset: fit diagnostics (``nobs``,
        ``r_squared``, ``aic``, ...), the intercept statistics (``alpha*``) and,
        for every factor, ``beta_``, ``pval_beta_``, ``tval_beta_`` and
        ``bse_beta_`` columns.

    Raises:
        NameError: if an input is omitted and no notebook-level variable of
            that name exists.

    The explicit parameters remove the dependency on hidden notebook state;
    the fallback keeps existing ``run_regressions(factors)`` calls working.
    """
    def _notebook_variable(name):
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    if pct_changes is None:
        pct_changes = _notebook_variable('pct_changes')
    if risk_free_df is None:
        risk_free_df = _notebook_variable('risk_free_df')

    results = []
    for symbol in pct_changes.columns:
        etf_excess = pct_changes[symbol] - risk_free_df['daily_nominal_rate']
        # Only dates on which the asset and all factors are available are used.
        data = pd.concat([etf_excess.rename('etf_excess'), distilled_factors], axis=1).dropna()

        Y = data['etf_excess']
        X = sm.add_constant(data.iloc[:, 1:])
        model = sm.OLS(Y, X).fit()
        result = {
            'conId': symbol,
            'nobs': model.nobs,
            'r_squared': model.rsquared,
            'r_squared_adj': model.rsquared_adj,
            'f_statistic': model.fvalue,
            'f_pvalue': model.f_pvalue,
            'aic': model.aic,
            'bic': model.bic,
            'condition_number': model.condition_number,
            'alpha': model.params['const'],
            'alpha_pval': model.pvalues['const'],
            'alpha_tval': model.tvalues['const'],
            'alpha_bse': model.bse['const'],
        }
        for factor in distilled_factors.columns:
            result[f'beta_{factor}'] = model.params[factor]
            result[f'pval_beta_{factor}'] = model.pvalues[factor]
            result[f'tval_beta_{factor}'] = model.tvalues[factor]
            result[f'bse_beta_{factor}'] = model.bse[factor]
        results.append(result)

    results_df = pd.DataFrame(results)
    return results_df

def calculate_vif(df):
    """Variance inflation factor of every column of ``df``.

    Returns a frame with the columns ``feature`` and ``VIF``, sorted from the
    highest VIF to the lowest.
    """
    design = df.values
    vifs = [variance_inflation_factor(design, position) for position in range(df.shape[1])]
    table = pd.DataFrame({"feature": df.columns, "VIF": vifs})
    return table.sort_values(by='VIF', ascending=False)

def run_elastic_net(
                    factors_df,
                    pct_changes,
                    risk_free_df,
                    training_cutoff,
                    alphas=ENET_ALPHAS,
                    l1_ratio=ENET_L1_RATIOS,
                    cv=ENET_CV,
                    tol=ENET_TOL,
                    random_state=42):
    """Fit a cross-validated elastic net of each asset's excess return on the factors.

    Args:
        factors_df: factor returns, one column per factor.
        pct_changes: asset returns, one column per conId.
        risk_free_df: frame with a ``'daily_nominal_rate'`` column.
        training_cutoff: index value splitting the data; rows before it are
            used for fitting, the rest for out-of-sample metrics.
        alphas, l1_ratio, cv, tol, random_state: forwarded to ``ElasticNetCV``.

    Returns:
        DataFrame indexed by ``conId`` with the intercept on the original
        factor scale (``jensens_alpha``), the selected hyper-parameters,
        convergence info, CV / train / test error metrics and one
        ``<factor>_beta`` column per factor (coefficients on the original
        factor scale). Assets whose fit raised ``ValueError`` are reported with
        ``print`` and left out; if none could be fitted an empty frame with a
        ``conId`` index is returned.

    Missing values in the joined data are replaced by 0 before fitting.
    """
    data = (
        factors_df.copy()
        .join(pct_changes, how='inner')
        .join(risk_free_df[['daily_nominal_rate']], how='inner')
        .fillna(0)
    )

    train = data[data.index < training_cutoff]
    test = data[data.index >= training_cutoff]

    X_train = train[factors_df.columns].values
    X_test = test[factors_df.columns].values

    metrics = []
    for etf in tqdm(pct_changes.columns, total=len(pct_changes.columns), desc="Elastic Net Regression"):
        Y_train = train[etf].values - train['daily_nominal_rate'].values
        Y_test = test[etf].values - test['daily_nominal_rate'].values

        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('enet', ElasticNetCV(alphas=alphas,
                                l1_ratio=l1_ratio,
                                cv=cv,
                                random_state=random_state,
                                max_iter=499999,
                                tol=tol,
                                fit_intercept=True,
                                n_jobs=-1)),
        ])

        try:
            pipeline.fit(X_train, Y_train)
        except ValueError as e:
            print(f"Skipping {etf} due to error: {e}")
            continue

        # Unscale coefficients and intercept so they apply to the raw factors
        enet = pipeline.named_steps['enet']
        scaler = pipeline.named_steps['scaler']
        betas_train = enet.coef_ / scaler.scale_
        intercept = enet.intercept_ - np.dot(betas_train, scaler.mean_)

        # out-of-sample predictions
        er_test = pipeline.predict(X_test)

        # in-sample predictions
        er_train = pipeline.predict(X_train)

        # Average over the folds, which are always the last axis: mse_path_ is
        # 3-D for several l1_ratio values but 2-D when only one is supplied.
        cv_mse = enet.mse_path_.mean(axis=-1)

        row = {
            'conId': etf,
            'jensens_alpha': intercept,
            'enet_alpha': enet.alpha_,
            'l1_ratio': enet.l1_ratio_,
            'n_iter': enet.n_iter_,
            'dual_gap': enet.dual_gap_,
            'n_nonzero': np.sum(np.abs(betas_train) > 1e-6),
            'cv_mse_best': np.min(cv_mse),
            'cv_mse_average': np.mean(cv_mse),
            'cv_mse_worst': np.max(cv_mse),
            'mse_test' : mean_squared_error(Y_test, er_test),
            'mse_train' : mean_squared_error(Y_train, er_train),
            'r2_test' : r2_score(Y_test, er_test),
            'r2_train' : r2_score(Y_train, er_train),
        }

        # Map back coefficients to factor names
        for coef, fname in zip(betas_train, factors_df.columns):
            row[f'{fname}_beta'] = coef

        metrics.append(row)

    if not metrics:
        # No asset could be fitted: an empty frame has no 'conId' column to
        # promote, so build the empty index directly instead of failing.
        return pd.DataFrame(index=pd.Index([], name='conId'))

    results_df = pd.DataFrame(metrics).set_index('conId')
    return results_df

def optimize_scalar(series):
    """Find the scale ``s`` that makes ``log1p(s * series)`` least skewed.

    Minimises the squared skewness of ``log1p(s * series)`` over ``s`` in
    ``(1e-5, 1e20)`` with scipy's bounded scalar minimiser and returns the
    minimiser's ``x``.
    """
    def squared_skew(scale):
        # Non-positive scales are ruled out by an infinite penalty.
        if scale <= 0:
            return np.inf
        transformed = np.log1p(scale * series)
        return skew(transformed) ** 2

    search = minimize_scalar(squared_skew, bounds=(1e-5, 1e20), method='bounded')
    return search.x

In [None]:
# LOAD FILES
kind = 'trades'
price_col = 'average'
root = 'data/'
data_path = root + 'daily-trades/series/'
verified_path = root + 'daily-trades/verified_files.csv'
verified_df = load(verified_path)

# The preprocessed snapshots carry the current year-month in their file name.
# The stamp is computed once so both files always refer to the same month, and
# it is kept out of the f-strings: reusing the enclosing quote type inside an
# f-string replacement field is a SyntaxError before Python 3.12.
snapshot_month = datetime.now().strftime('%Y-%m')

scraped_path = root + f'preprocessed/justetf_{snapshot_month}.csv'
scraped_df = load(scraped_path)

# Scraped just etf data
# Only funds explicitly marked 'Accumulating' / 'Physical' are selected; the
# commented variants would additionally accept funds with a missing value.
# accumulating_isins = scraped_df[(scraped_df['distribution_policy'] == 'Accumulating') | (scraped_df['distribution_policy'].isna())]['isin']
accumulating_isins = scraped_df[(scraped_df['distribution_policy'] == 'Accumulating')]['isin']
# physical_isins = scraped_df[(scraped_df['replication'] == 'Physical') | (scraped_df['replication'].isna())]['isin']
physical_isins = scraped_df[(scraped_df['replication'] == 'Physical')]['isin']

# Scraped IBKR data
fund_path = root + f'preprocessed/fundamentals_{snapshot_month}.csv'
fund_df = load(fund_path)
fund_df['funds_date'] = pd.to_datetime(fund_df['funds_date'])
# From here on fund_df only contains physically replicated funds.
fund_df = fund_df[fund_df['isin'].isin(physical_isins)]
accumulating_ids = fund_df[fund_df['isin'].isin(accumulating_isins)]['conId']

tradable = fund_df[fund_df['tradable'] == 1]['conId'].to_list()
# Funds whose real-estate share is below the mean share across fund_df.
low_real_estate = fund_df[fund_df['industries_RealEstate'] < fund_df['industries_RealEstate'].mean()]['conId'].to_list()
# Funds whose long name contains 'gold' or 'auag' (case-insensitive).
gold_conids = fund_df[(fund_df['longName'].str.contains('gold', case=False, na=False)) | (fund_df['longName'].str.contains('auag', case=False, na=False))]['conId'].to_list()

fund_df = fund_df.drop(['tradable'], axis=1)

In [None]:
# # Check industry correlation
# import matplotlib.pyplot as plt 
# import seaborn as sns

# from scipy.cluster.hierarchy import linkage, dendrogram
# from scipy.spatial.distance import squareform

# industry_cols = [col for col in fund_df.columns if col.startswith('indust')]
# numerical_cols = [col for col in fund_df.columns if fund_df[col].dtype in [np.int64, np.float64] and col not in ['conId'] and not col.startswith('count') and not col.startswith('style')]

# ind_corr = fund_df[industry_cols].corr()
# dist_matrix = 1 - ind_corr

# condensed_dist = squareform(dist_matrix)
# linkage_matrix = linkage(condensed_dist, method='ward')

# # plt.figure(figsize=(13, 10)) 
# # sns.heatmap(ind_corr, annot=True, cmap='coolwarm', linewidths=0.5) 
# # plt.title('Correlation Heatmap')
# # plt.show()

# plt.figure(figsize=(18, 10))
# plt.title('Hierarchical Clustering Dendrogram of IBKR Industries', fontsize=20)
# plt.xlabel('Industry', fontsize=16)
# plt.ylabel('Distance (1 - Correlation)', fontsize=16)


# dn = dendrogram(
#     linkage_matrix,
#     labels=[col.replace('industries_', '') for col in dist_matrix.columns],
#     leaf_rotation=90,
# )

# plt.show()

In [None]:
# Load full historical price series
# The load runs on the first execution in a kernel (no `meta` yet); on later
# executions the user is asked whether to reload.
# NOTE(review): the input() prompt blocks unattended "Run All" executions once
# `meta` exists.
if 'meta' not in globals() or input('Reload CSVs? (y/n)').lower().strip() == 'y':
    # Seeds for the running max/min of the observed dates: last_date starts far
    # in the past and first_date at "now", so any loaded date replaces them.
    last_date = (datetime.now() - timedelta(days=365 * 99))
    first_date = (datetime.now())
    meta = []
    file_list = os.listdir(data_path)
    for file in tqdm(file_list, total=len(file_list), desc="Loading files"):
        if not file.endswith('.csv'):
            continue
        # File names are split on '-' into symbol, exchange and currency; a name
        # with fewer than three parts raises IndexError here (outside the try).
        parts = os.path.splitext(file)[0].split('-')
        symbol, exchange, currency = parts[0], parts[1], parts[2]
        # Only (symbol, currency) pairs listed in verified_df are loaded.
        if not ((verified_df['symbol'] == symbol) & (verified_df['currency'] == currency)).any():
            continue
        try:
            df = load(data_path + file)
            df = ensure_series_types(df, price_col)
            df = validate_raw_prices(df, price_col)
            df = handle_stale_periods(df, price_col)
            # NOTE(review): this relies on pct_change's default fill behaviour,
            # whereas detect_and_nullify_global_outliers recomputes returns with
            # fill_method=None - confirm the difference is intended.
            df['pct_change'] = df[price_col].pct_change()

            if df['date'].max() > last_date:
                last_date = df['date'].max()
            if df['date'].min() < first_date:
                first_date = df['date'].min()
            
            meta.append({
                'symbol': symbol,
                'currency': currency,
                'exchange_api': exchange,
                'df': df[['date', price_col, 'volume', 'pct_change']],
            })
        except Exception as e:
            # Any failure aborts the whole load and names the offending file.
            raise Exception(f"ERROR loading {file}: {e}")

    # One row per loaded file; the per-file frames live in the 'df' column and
    # are cleaned in place by the outlier pass below.
    meta = pd.DataFrame(meta)
    detect_and_nullify_global_outliers(meta, price_col=price_col, z_threshold=Z_THRESHOLD_GLOBAL_LOOP, window=OUTLIER_WINDOW_DEFAULT)

In [None]:
# Maps World Bank indicator codes (keys) to the short labels this notebook uses
# for them (values). preprocess_world_bank_data receives this mapping as its
# `wb_indicator_map` argument.
wb_indicator_map = {
    'NE.IMP.GNFS.ZS': 'imports-goods+serv',
    'NE.EXP.GNFS.ZS': 'exports-goods+serv',
    'TM.VAL.MRCH.XD.WD': 'import-goods',
    'TX.VAL.MRCH.XD.WD': 'export-goods',

    'BX.KLT.DINV.WD.GD.ZS': 'foreign-direct-investment',

    'NY.GDP.MKTP.CD': 'economic-output-gdp',
    'SP.POP.TOTL': 'population',
    'NY.GDP.PCAP.CD': 'gdp-pcap',
    # Disabled indicator:
    # 'NY.GDP.DEFL.KD.ZG': 'production_price_inflation',
}

# NOTE: the block below is a bare string literal, not a comment and not part of
# the mapping above. It is not assigned to anything; it appears to be a scratch
# list of candidate/derived indicator labels (the trailing #1 / #2 markers are
# not explained in the visible part of the notebook). As the last expression of
# its cell it is presumably echoed as the cell output - consider turning it
# into a markdown cell.
'''
'SH.TRD.VOL': 'share_trade_volume', #1
'NY.GDP.DEFL.ZS': 'production_price_inflation',

'SERVICES.GOODS.RATIO': 'services_goods_ratio' #1 #2
'BX.KLT.DINV.WD.GD.ZS': 'foreign_direct_investment',

'SP.POP.TOTL': 'population', #1 #2
'NY.GDP.PCAP.CD': 'gdp-pcap',

'NY.GDP.MKTP.CD': 'economic_output_gdp', #1 #2
'TRADE.SURPLUS': 'trade_surplus',
'''

In [None]:
def preprocess_world_bank_data(world_bank_data_full, wb_indicator_map):  
    """Reshape raw World Bank data, derive share features and fill missing years.

    Steps, in order:
      1. Copy the input, convert column labels to integer years and unstack the
         ``series`` index level so each indicator becomes a top-level column
         group with one sub-column per year.
      2. Add ``SH.TRD.VOL``: each economy's (exports + imports) divided by the
         per-year sum over all economies in the frame.
      3. Fill gaps in GDP per capita with GDP / population (population 0 is
         treated as missing).
      4. Replace GDP with its share of the per-year sum over all economies.
      5. Drop the four import/export indicators from both the data and a copy
         of the indicator map, and register ``SH.TRD.VOL`` in that copy.
      6. For every remaining indicator: Akima-interpolate along the year axis,
         linearly extrapolate leading/trailing gaps, then fill whatever is
         still missing with the per-year mean across economies.

    Args:
        world_bank_data_full: DataFrame whose index has a level named
            ``series`` and whose columns are year labels. Assumes each label
            carries a two-character prefix before the year (e.g. ``'YR2005'``)
            - TODO confirm against ``fetch_world_bank_data``.
        wb_indicator_map: dict of indicator code -> readable name. It is not
            mutated; a modified copy is returned.

    Returns:
        Tuple ``(stacked, wb_indicator_map_post)`` where ``stacked`` is indexed
        by ``['economy', 'series']`` with one column per year, and
        ``wb_indicator_map_post`` is the adjusted indicator map.

    Raises:
        KeyError: if ``wb_indicator_map`` lacks any of the four import/export
            codes that are deleted from its copy, or if the data lacks a column
            that is read directly.
    """
    def regress_extrapolate(row, n=3):
        """Fill NaNs outside the observed range of ``row`` by linear regression.

        Trailing NaNs are predicted from a line fitted to the last ``n`` valid
        points, leading NaNs from a line fitted to the first ``n`` valid
        points. NaNs between valid observations are left untouched. ``row`` is
        modified in place and returned; it is returned unchanged when it has
        fewer than ``n`` valid points or no NaNs at all.
        """
        y_valid = row.dropna()
        x_valid = y_valid.index.astype(int)
        if len(y_valid) < n:
            return row
        
        nan_indices = row[row.isna()].index.astype(int)
        if len(nan_indices) == 0:
            return row
        
        # Only positions strictly outside the observed year range are extrapolated.
        future_nans = nan_indices[nan_indices > x_valid.max()]
        past_nans = nan_indices[nan_indices < x_valid.min()]
        if len(future_nans) > 0:
            y_recent = y_valid.tail(n)
            x_recent = y_recent.index.astype(int)
            slope, intercept, _, _, _ = linregress(x_recent, y_recent)
            row.loc[future_nans] = slope * future_nans + intercept
        if len(past_nans) > 0:
            y_early = y_valid.head(n)
            x_early = y_early.index.astype(int)
            slope, intercept, _, _, _ = linregress(x_early, y_early)
            row.loc[past_nans] = slope * past_nans + intercept

        return row

    processed_data = world_bank_data_full.copy()
    # Strip the two-character prefix so the year labels become integers.
    processed_data.columns = [int(col[2:]) for col in processed_data.columns]
    # Move the indicator code into the columns, then make it the outer column level.
    data_unstacked = processed_data.unstack(level='series')
    data_unstacked = data_unstacked.swaplevel(0, 1, axis=1)
    data_unstacked.sort_index(axis=1, level=0, inplace=True)

    base_import_col = 'NE.IMP.GNFS.ZS'
    supp_import_col = 'TM.VAL.MRCH.XD.WD'
    base_export_col = 'NE.EXP.GNFS.ZS'
    supp_export_col = 'TX.VAL.MRCH.XD.WD'

    gdp_col = 'NY.GDP.MKTP.CD'
    pop_col = 'SP.POP.TOTL'
    gdp_pcap_col = 'NY.GDP.PCAP.CD'

    # new_surplus_col = 'TRADE.SURPLUS'
    new_share_trade_col = 'SH.TRD.VOL'
    # services2goods_col = 'SERVICES.GOODS.RATIO'

    # # Fill NaN values in import
    # services2goods_multiple = (data_unstacked[base_import_col] / data_unstacked[supp_import_col])
    # services2goods_multiple = services2goods_multiple.interpolate(method='akima', axis=1)
    # services2goods_multiple = services2goods_multiple.apply(lambda row: regress_extrapolate(row, n=5), axis=1)
    # services2goods_multiple = services2goods_multiple.fillna(services2goods_multiple.median())
    # import_fill = data_unstacked[supp_import_col].multiply(services2goods_multiple, axis=0)
    # data_unstacked[base_import_col] = data_unstacked[base_import_col].fillna(import_fill)
    
    # # Fill NaN values in export
    # services2goods_multiple = (data_unstacked[base_export_col] / data_unstacked[supp_export_col])
    # services2goods_multiple = services2goods_multiple.interpolate(method='akima', axis=1)
    # services2goods_multiple = services2goods_multiple.apply(lambda row: regress_extrapolate(row, n=5), axis=1)
    # services2goods_multiple = services2goods_multiple.fillna(services2goods_multiple.median())
    # export_fill = data_unstacked[supp_export_col].multiply(services2goods_multiple, axis=0)
    # data_unstacked[base_export_col] = data_unstacked[base_export_col].fillna(export_fill)

    # # Create a goods to serv ratio col
    # services2goods_multiple.columns = pd.MultiIndex.from_product([[services2goods_col], services2goods_multiple.columns])
    # data_unstacked = pd.concat([data_unstacked, services2goods_multiple], axis=1)

    # # Recalculate surplus with the now-amended import/export data.
    # trade_surplus = data_unstacked[base_export_col] - data_unstacked[base_import_col]
    # trade_surplus.columns = pd.MultiIndex.from_product([[new_surplus_col], trade_surplus.columns])
    # data_unstacked = pd.concat([data_unstacked, trade_surplus], axis=1)

    # Determine each country's contribution to total global trade.
    # "Global" here means the sum over the economies present in this frame.
    total_trade = data_unstacked[base_export_col] + data_unstacked[base_import_col]
    global_trade_by_year = total_trade.sum(axis=0)
    share_of_global_trade = total_trade.div(global_trade_by_year, axis=1)
    share_of_global_trade.columns = pd.MultiIndex.from_product([[new_share_trade_col], share_of_global_trade.columns])
    data_unstacked = pd.concat([data_unstacked, share_of_global_trade], axis=1)

    # Verify GDP/capita
    # Reported values win; GDP / population only fills the gaps. A population
    # of 0 is masked to NaN so the division cannot produce inf.
    population = data_unstacked[pop_col]
    calculated_gdp_pcap = data_unstacked[gdp_col].div(population.where(population != 0))
    data_unstacked[gdp_pcap_col] = data_unstacked[gdp_pcap_col].fillna(calculated_gdp_pcap)

    # Adjust as a share of global gdp
    gdp = data_unstacked[gdp_col]
    global_gdp_by_year = gdp.sum(axis=0)
    share_of_global_gdp = gdp.div(global_gdp_by_year, axis=1)
    data_unstacked[gdp_col] = share_of_global_gdp

    # Work on a copy so the caller's indicator map is left untouched.
    wb_indicator_map_post = wb_indicator_map.copy()

    # The raw trade indicators are no longer needed once the share feature exists.
    to_drop = [supp_import_col, supp_export_col, base_import_col, base_export_col]
    for col in to_drop:
        del wb_indicator_map_post[col]
        data_unstacked.drop(columns=[col], level=0, inplace=True, errors='ignore')

    wb_indicator_map_post = {**wb_indicator_map_post,
                        # **{new_surplus_col: 'trade_surplus',
                            **{new_share_trade_col: 'share_trade_volume',
                            # services2goods_col: 'services_goods_ratio',
                        }}

    # Interpolate and Extrapolate
    # Interior gaps: Akima along the year axis. Edge gaps: linear extrapolation.
    # Anything still missing: per-year mean across economies.
    for col in wb_indicator_map_post:
        data_unstacked[col] = data_unstacked[col].interpolate(method='akima', axis=1)
        data_unstacked[col] = data_unstacked[col].apply(lambda row: regress_extrapolate(row, n=3), axis=1)
        data_unstacked[col] = data_unstacked[col].fillna(data_unstacked[col].mean())

    # Move the indicator level back into the index: (economy, series) x year.
    stacked = data_unstacked.stack(level=0, future_stack=True)
    stacked.index = stacked.index.set_names(['economy', 'series'])
    return stacked, wb_indicator_map_post


In [None]:
# Download supplementary data
# Runs on a fresh kernel (no `risk_free_df_full` yet) or when the user explicitly
# asks for a re-download; otherwise the previously downloaded objects are reused.
if 'risk_free_df_full' not in globals() or input('Redownload supplementary data? (y/n)').lower().strip() == 'y':
    # Risk-free series calculation
    tickers = {
        'US': 'DTB3',
        'Canada': 'IR3TIB01CAM156N',
        'Germany': 'IR3TIB01DEM156N',
        'UK': 'IR3TIB01GBM156N',
        'France': 'IR3TIB01FRA156N',
    }
    bonds = {}
    failed = []
    for country, ticker in tickers.items():
        try:
            series = web.DataReader(ticker, 'fred', first_date, last_date)
            bonds[country] = series / 100.0
        except Exception as fred_err:
            # Fall back to the OECD reader before giving up on this country.
            try:
                series = web.DataReader(ticker, 'oecd', first_date, last_date)
                bonds[country] = series / 100.0
            except Exception as oecd_err:
                failed.append(country)
                # Report the failure instead of dropping the country silently.
                print(f'WARNING: no rate series for {country} ({ticker}); fred: {fred_err}; oecd: {oecd_err}')

    if not bonds:
        # pd.concat would fail with an opaque "No objects to concatenate" message.
        raise ValueError(f'Could not download any risk-free rate series; failed: {failed}')

    # Combine into a single DataFrame
    df_bonds = pd.concat(bonds, axis=1)
    # One column per successfully downloaded country, in download order.
    df_bonds.columns = list(bonds)
    df_bonds = df_bonds.interpolate(method='akima').bfill().ffill()

    # Equal-weighted average of the available country rates.
    risk_free_df_full = df_bonds.mean(axis=1).rename('nominal_rate')
    business_days = pd.date_range(start=first_date, end=last_date, freq='B')
    risk_free_df_full = risk_free_df_full.reindex(business_days, copy=False)

    risk_free_df_full = pd.DataFrame(risk_free_df_full)
    # Annual rate spread evenly over 252 periods per year.
    risk_free_df_full['daily_nominal_rate'] = risk_free_df_full['nominal_rate'] / 252

    # Get country stats
    cc = coco.CountryConverter()

    all_country_cols = [col for col in fund_df.columns if col.startswith('countries') and not col.endswith('variety')]
    standard_names = {}
    for col in all_country_cols:
        raw_name = col.replace('countries_', '').replace(' ', '')
        standard_name = cc.convert(names=raw_name, to='ISO3', not_found=None)
        if standard_name:
            standard_names[raw_name] = standard_name

    # NOTE(review): `start` is computed but the fetch below still receives
    # `first_date` - confirm which start date is intended.
    start = max(first_date, datetime(2000, 1, 1))
    raw_wb_data = fetch_world_bank_data(standard_names.values(), first_date, last_date, wb_indicator_map.keys())
    world_bank_data_full, wb_indicator_map_post = preprocess_world_bank_data(raw_wb_data, wb_indicator_map)

In [None]:
# Restrict every asset to the trailing YEAR_RANGE years and measure its data gaps.
YEAR_RANGE = 3
training_oldest = last_date - timedelta(days=365 * YEAR_RANGE)
business_days = pd.date_range(start=training_oldest, end=last_date, freq='B')

meta_window = meta.copy()
meta_window['df'] = meta['df'].apply(
    lambda df: df.loc[df['date'].between(training_oldest, last_date)].copy()
)

for idx, row in meta_window.iterrows():
    df = row['df']
    # Left-join onto the business-day calendar so absent sessions become NaN rows.
    merged = pd.DataFrame({'date': business_days}).merge(df, on='date', how='left')
    length = len(merged)
    present = merged[price_col].notna()
    present_idx = np.flatnonzero(present)

    # Sentinels one step outside each end turn the leading gap, every internal
    # gap and the trailing gap into a single diff. With no observations at all
    # this yields one gap spanning the whole window.
    boundaries = np.concatenate(([-1], present_idx, [length]))
    gaps = (np.diff(boundaries) - 1).astype(int)
    gaps = gaps[gaps > 0]

    if gaps.size > 0:
        max_gap = float(gaps.max())
        std_gap = float(gaps.std())
    else:
        max_gap = 0.0
        std_gap = 0.0

    missing = length - present.sum()
    pct_missing = missing / length

    meta_window.at[idx, 'df'] = merged
    meta_window.at[idx, 'max_gap'] = max_gap
    meta_window.at[idx, 'missing'] = missing
    meta_window.at[idx, 'pct_missing'] = pct_missing

# Log scale keeps a few very long gaps from dominating the threshold comparison.
meta_window['max_gap_log'] = np.log1p(meta_window['max_gap'])

In [None]:
# Keep only assets whose longest gap and missing share are both under the limits
condition = meta_window['max_gap_log'].lt(MAX_GAP_LOG) & meta_window['pct_missing'].lt(MAX_PCT_MISSING)
filtered = meta_window[condition].copy()
print(f'{len(filtered)} ETFs included')
print(f'{len(meta_window) - len(filtered)} dropped')
del meta_window

for idx, row in filtered.iterrows():
    df = row['df']
    # Akima fills the gaps it can; forward/backward fill covers whatever is
    # left (a no-op when nothing is missing).
    interpolated = df[price_col].interpolate(method='akima', limit_direction='both')
    df[price_col] = interpolated.ffill().bfill()
    # Returns are recomputed from the gap-free price series.
    df['pct_change'] = df[price_col].pct_change()
    filtered.at[idx, 'df'] = df.set_index('date')

In [None]:
# Keep one fundamentals row per conId, then merge it onto the price metadata
training_cutoff = last_date - pd.Timedelta(days=TRAINING_PERIOD_DAYS)

# Preferred snapshot: the latest one dated on or before the training cutoff.
before_training_end = fund_df[fund_df['funds_date'] <= training_cutoff]
before_training_end = (
    pd.DataFrame(columns=fund_df.columns)
    if before_training_end.empty
    else before_training_end.loc[before_training_end.groupby('conId')['funds_date'].idxmax()]
)

# Fallback snapshot: the earliest one dated after the cutoff.
after_training_end = fund_df[fund_df['funds_date'] > training_cutoff]
after_training_end = (
    pd.DataFrame(columns=fund_df.columns)
    if after_training_end.empty
    else after_training_end.loc[after_training_end.groupby('conId')['funds_date'].idxmin()]
)

# The fallback is only used for conIds that have no pre-cutoff snapshot.
if not (before_training_end.empty or after_training_end.empty):
    already_covered = after_training_end['conId'].isin(before_training_end['conId'])
    after_training_end = after_training_end[~already_covered]
spliced_fund_df = pd.concat([before_training_end, after_training_end])

gap_stat_cols = ['max_gap', 'missing', 'pct_missing', 'max_gap_log']
filtered = filtered.merge(spliced_fund_df, on=['symbol', 'currency'], how='inner').drop(columns=gap_stat_cols)
numerical_cols = [
    col for col in filtered.columns
    if filtered[col].dtype in (np.int64, np.float64) and col != 'conId'
]
# One return column per asset, labelled by conId.
pct_changes = pd.concat(
    [row['df']['pct_change'].rename(row['conId']) for _, row in filtered.iterrows()],
    axis=1,
)

# Remove columns that carry no information: at most one distinct value, or all NaN
uninformative_cols = [col for col in numerical_cols if filtered[col].nunique(dropna=True) <= 1]
filtered = filtered.drop(columns=uninformative_cols).dropna(axis=1, how='all')

In [None]:
# Derive slope (and, where three horizons exist, second-derivative) features
rate_fundamentals = [
    ('EPSGrowth-1yr', 'EPSGrowth3yr', 'EPSGrowth5yr'),
    ('ReturnonAssets1Yr', 'ReturnonAssets3Yr'),
    ('ReturnonCapital', 'ReturnonCapital3Yr'),
    ('ReturnonEquity1Yr', 'ReturnonEquity3Yr'),
    ('ReturnonInvestment1Yr', 'ReturnonInvestment3Yr'),
]

for cols in rate_fundamentals:
    # Feature stem is the shortest-horizon name without its horizon marker.
    base_name = cols[0].replace('-1yr', '').replace('1Yr', '')
    slope_col = f'fundamentals_{base_name}_slope'
    if len(cols) == 3:
        col_1yr, col_3yr, col_5yr = cols
        # Overall slope spans the shortest and the longest horizon.
        filtered[slope_col] = calculate_slope(
            filtered[f'fundamentals_{col_1yr}'], filtered[f'fundamentals_{col_5yr}'], 1, 5
        )
        slope_1yr_3yr = calculate_slope(
            filtered[f'fundamentals_{col_1yr}'], filtered[f'fundamentals_{col_3yr}'], 1, 3
        )
        slope_3yr_5yr = calculate_slope(
            filtered[f'fundamentals_{col_3yr}'], filtered[f'fundamentals_{col_5yr}'], 3, 5
        )
        # Slope between the two segment slopes.
        filtered[f'fundamentals_{base_name}_second_deriv'] = calculate_slope(slope_1yr_3yr, slope_3yr_5yr, 1, 3)
    elif len(cols) == 2:
        col_1yr, col_3yr = cols
        filtered[slope_col] = calculate_slope(
            filtered[f'fundamentals_{col_1yr}'], filtered[f'fundamentals_{col_3yr}'], 1, 3
        )

numerical_cols = [
    col for col in filtered.columns
    if filtered[col].dtype in (np.int64, np.float64) and col != 'conId'
]

In [None]:
# Momentum / relative-strength statistics measured back from the training cutoff
momentum_cutoffs = {
    label: training_cutoff - pd.Timedelta(days=MOMENTUM_PERIODS_DAYS[label])
    for label in ('1y', '6mo', '3mo')
}
# Risk-free rates restricted to the business days of the analysis window.
risk_free_df = risk_free_df_full.loc[business_days]
return_stat_cols = ['momentum_3mo', 'momentum_6mo', 'momentum_1y', 'rs_3mo', 'rs_6mo', 'rs_1y']


def _return_stats_for(price_frame):
    # Bind the shared arguments so the per-asset call only needs the price frame.
    return get_return_stats(price_frame, training_cutoff, momentum_cutoffs, risk_free_df)


filtered[return_stat_cols] = filtered['df'].apply(_return_stats_for)

In [None]:
# Build one cap-weighted portfolio per holding type, plus a 'total' market portfolio
holding_cols = [
    col for col in filtered.columns
    if col.startswith('holding_types') and col != 'holding_types_variety'
]
holding_cols.append('total')
portfolio_dfs = {}
initial_price = 1

for holding_col in holding_cols:
    # Portfolio key is the text after the last underscore ('total' stays 'total').
    name = holding_col.rsplit('_', 1)[-1]

    # 'total' weights by market cap alone; every other portfolio scales the cap
    # by the asset's value in that holding-type column.
    weight = filtered['profile_cap_usd']
    if holding_col != 'total':
        weight = weight * filtered[holding_col]

    total_market_cap = weight.sum()
    filtered['weight'] = weight / total_market_cap

    weights = filtered.set_index('conId')['weight']
    portfolio_return = pct_changes.dot(weights)
    # Missing returns count as 0% when compounding the synthetic price.
    portfolio_price = initial_price * (1 + portfolio_return.fillna(0)).cumprod()

    portfolio_df = pd.DataFrame({
        'date': portfolio_price.index,
        price_col: portfolio_price.values,
        'pct_change': portfolio_return.values,
    }).set_index('date')
    portfolio_dfs[name] = portfolio_df

# 'weight' was only a scratch column for the loop above.
filtered.drop(columns='weight', inplace=True)

In [None]:
# Avoid the dummy-variable trap: drop the catch-all bucket of each category group
empty_subcategories = {
    'holding_types': ['other'],
    'countries': ['Unidentified'],
    'currencies': ['<NoCurrency>'],
    'industries': ['NonClassifiedEquity', 'NotClassified-NonEquity'],
    'top10': [
        'OtherAssets', 'AccountsPayable', 'AccountsReceivable', 'AccountsReceivable&Pay',
        'AdministrationFees', 'CustodyFees', 'ManagementFees', 'OtherAssetsandLiabilities',
        'OtherAssetslessLiabilities', 'OtherFees', 'OtherLiabilities', 'Tax', 'Tax--ManagementFees',
    ],
    'debtors': ['OTHER'],
    'maturity': ['%MaturityOther'],
    'debt_type': ['%QualityNotAvailable', '%QualityNotRated'],
    'manual': ['asset_other'],
}

# Entries under 'manual' are already full column names; every other entry is
# prefixed with its group name.
dummy_trap_cols = [
    member if group == 'manual' else f'{group}_{member}'
    for group, members in empty_subcategories.items()
    for member in members
]

filtered = filtered.drop(columns=dummy_trap_cols, errors='ignore')
numerical_cols = [
    col for col in filtered.columns
    if filtered[col].dtype in (np.int64, np.float64) and col != 'conId'
]

In [None]:
# Select the asset types to model
asset_types = [col for col in filtered.columns if col.startswith('asset_')]
asset_conditions = {asset.replace('asset_', ''): filtered[asset].eq(1) for asset in asset_types}
# Rows flagged by none of the asset_* columns fall into 'other'.
asset_conditions['other'] = ~pd.concat(asset_conditions.values(), axis=1).any(axis=1)

exclude_assets = ['bond', 'cash']
include_assets = [name for name in asset_conditions if name not in exclude_assets]
combined_condition = pd.Series(False, index=filtered.index)
for asset in include_assets:
    combined_condition = combined_condition | asset_conditions[asset]

filtered_df = filtered[combined_condition]
# Drop every column whose name ends with an excluded asset type.
excluded_suffixes = tuple(exclude_assets)
cols_to_drop_ending_with_exclude = [col for col in filtered_df.columns if col.endswith(excluded_suffixes)]
filtered_df = filtered_df.drop(columns=cols_to_drop_ending_with_exclude)

numerical_cols = [
    col for col in filtered_df.columns
    if filtered_df[col].dtype in (np.int64, np.float64) and col != 'conId'
]

# Columns that became constant after the row filter, plus the asset flags themselves.
single_value_columns = [col for col in numerical_cols if filtered_df[col].nunique() == 1]
asset_cols = [col for col in filtered_df.columns if col.startswith('asset')]
filtered_df = filtered_df.drop(columns=single_value_columns + asset_cols)
numerical_cols = [
    col for col in filtered_df.columns
    if filtered_df[col].dtype in (np.int64, np.float64) and col != 'conId'
]

# Keep the return matrix aligned with the surviving assets.
pct_changes = pct_changes[filtered_df['conId']]
# del filtered

In [None]:
# Collapse country weights into exposure-weighted World Bank statistics
metric_df = calculate_country_stats(world_bank_data_full, standard_names, last_date, window_size=3)
metric_suffixes = {
    'raw_value': '_stat',
    '1st_div': '_growth_rate',
    # '2nd_div': '_acceleration'
}

# Every (indicator, metric) pair available in metric_df, with its output column name.
available_indicators = metric_df.columns.get_level_values(0)
indicator_targets = [
    (ind_code, metric_col, f'{ind_name}{suffix}')
    for ind_code, ind_name in wb_indicator_map_post.items()
    if ind_code in available_indicators
    for metric_col, suffix in metric_suffixes.items()
]

# Start every output column at zero so country contributions can be accumulated.
for _, _, target_col in indicator_targets:
    filtered_df[target_col] = 0.0

for std_name, iso_code in standard_names.items():
    country_weight_col = f'countries_{std_name}'
    # Skip countries without a weight column or without World Bank metrics.
    if country_weight_col not in filtered_df.columns or iso_code not in metric_df.index:
        continue

    for ind_code, metric_col, target_col in indicator_targets:
        value = metric_df.loc[iso_code, (ind_code, metric_col)]
        if pd.isna(value):
            # A missing metric contributes nothing.
            value = 0.0
        filtered_df[target_col] += filtered_df[country_weight_col] * value

In [None]:
# Drop numeric columns that hold exactly one distinct value
numerical_cols = [
    col for col in filtered_df.columns
    if filtered_df[col].dtype in (np.int64, np.float64) and col != 'conId'
]
single_value_columns = [col for col in numerical_cols if filtered_df[col].nunique() == 1]
filtered_df = filtered_df.drop(columns=single_value_columns, errors='ignore')
# Refresh the numeric column list now that the constant columns are gone.
numerical_cols = [
    col for col in filtered_df.columns
    if filtered_df[col].dtype in (np.int64, np.float64) and col != 'conId'
]
# Show which columns were removed.
single_value_columns

In [None]:
# Collapse fundamental columns into composite factor scores
fundamental_columns = [col for col in filtered_df.columns if col.startswith('fundamentals')]

value_columns_inverted = [
    'fundamentals_Price/Book',
    'fundamentals_Price/Cash',
    'fundamentals_Price/Earnings',
    'fundamentals_Price/Sales',
]
leverage_columns_inverted = [
    'fundamentals_LTDebt/Shareholders',
    'fundamentals_TotalDebt/TotalCapital',
    'fundamentals_TotalDebt/TotalEquity',
    'fundamentals_TotalAssets/TotalEquity',
]
profitability_columns = [
    'fundamentals_ReturnonAssets1Yr',
    'fundamentals_ReturnonAssets3Yr',
    'fundamentals_ReturnonCapital',
    'fundamentals_ReturnonCapital3Yr',
    'fundamentals_ReturnonEquity1Yr',
    'fundamentals_ReturnonEquity3Yr',
    'fundamentals_ReturnonInvestment1Yr',
    'fundamentals_ReturnonInvestment3Yr',
]

columns_to_scale = value_columns_inverted + leverage_columns_inverted + profitability_columns + return_stat_cols

# (factor name, input columns, inverted?) - inverted groups score 1 - scaled value.
factor_groups = [
    ('factor_value', value_columns_inverted, True),
    ('factor_leverage', leverage_columns_inverted, True),
    ('factor_profitability', profitability_columns, False),
    ('factor_momentum_relative_strength', return_stat_cols, False),
]

# Earlier cells drop constant or excluded columns, so some inputs may be gone.
# Work only with the inputs that still exist instead of raising KeyError when
# the set is partially missing. If none exist (e.g. the cell is re-run after the
# inputs were collapsed), nothing is done.
present_to_scale = [col for col in columns_to_scale if col in filtered_df.columns]
missing_to_scale = [col for col in columns_to_scale if col not in filtered_df.columns]
if present_to_scale:
    if missing_to_scale:
        print(f'Factor inputs not found and skipped: {missing_to_scale}')

    scaler = MinMaxScaler()
    filtered_df[present_to_scale] = scaler.fit_transform(filtered_df[present_to_scale])

    for factor_name, group_cols, inverted in factor_groups:
        available = [col for col in group_cols if col in present_to_scale]
        if not available:
            # No inputs left for this factor; do not create a constant column.
            continue
        scaled = filtered_df[available]
        filtered_df[factor_name] = ((1 - scaled) if inverted else scaled).sum(axis=1)

    # The raw inputs are replaced by their composite factors.
    filtered_df = filtered_df.drop(columns=present_to_scale)

In [None]:
# Collapse industry weights into three supersector exposures
defensive = [
    'industries_ConsumerNon-Cyclicals',
    'industries_Utilities',
    'industries_Healthcare',
    'industries_TelecommunicationServices',
    'industries_Academic&EducationalServices',
]
cyclical = [
    'industries_Technology',
    'industries_ConsumerCyclicals',
    'industries_Industrials',
    'industries_Financials',
    'industries_RealEstate',
]
commodities = ['industries_BasicMaterials', 'industries_Energy']

# defensive = ['industries_ConsumerNon-Cyclicals', 'industries_Financials', 'industries_TelecommunicationServices', 'industries_Healthcare']
# cyclical = ['industries_Technology', 'industries_Academic&EducationalServices', 'industries_ConsumerCyclicals']
# commodities = ['industries_Energy', 'industries_Utilities', 'industries_Industrials', 'industries_BasicMaterials', 'industries_RealEstate']

# Only industries that survived the earlier column drops can contribute.
defensive_cols, cyclical_cols, commodities_cols = (
    [col for col in group if col in filtered_df.columns]
    for group in (defensive, cyclical, commodities)
)

for label, member_cols in (
    ('defensive', defensive_cols),
    ('cyclical', cyclical_cols),
    ('commodities', commodities_cols),
):
    filtered_df[f'supersector_{label}'] = filtered_df[member_cols].sum(axis=1)

In [None]:
# Reorder columns: non-numeric first, then numeric columns grouped by category prefix
categories = [
    'share', 'factor', 'holding_types', 'stats', 'momentum', 'profile', 'top10', 'population',
    'msci', 'continent', 'countries', 'fundamentals', 'industries', 'supersector', 'currencies',
    'debtors', 'maturity', 'debt_type', 'lipper', 'dividends', 'marketcap', 'style', 'domicile', 'asset',
]

numerical_cols = [
    col for col in filtered_df.columns
    if filtered_df[col].dtype in (np.int64, np.float64) and col != 'conId'
]
non_numerical = [col for col in filtered_df.columns if col not in numerical_cols]

sorted_numerical_cols = []
remaining_numerical = list(numerical_cols)

for category in categories:
    cat_cols = [col for col in remaining_numerical if col.startswith(category)]
    if not cat_cols:
        continue
    # Within a category, columns with more distinct values come first.
    sorted_numerical_cols += sorted(cat_cols, key=lambda col: filtered_df[col].nunique(), reverse=True)
    remaining_numerical = [col for col in remaining_numerical if col not in cat_cols]

# Numeric columns matching no category keep their relative order at the end.
sorted_numerical_cols += remaining_numerical

new_column_order = non_numerical + sorted_numerical_cols
filtered_df = filtered_df[new_column_order]

In [None]:
# Construct factors
# Builds the factor return table from the asset features, the asset returns, the
# holding-type portfolios and the risk-free series, using the configured scaling.
factors_df = construct_factors(filtered_df, pct_changes, portfolio_dfs, risk_free_df, scaling_factor=FACTOR_SCALING_FACTOR)

# --- Disabled: automatic pre-screening of correlated factors (kept for reference) ---
# custom_drop = [
#     'gdp-pcap_stat_beta', 
#     'gdp-pcap_growth_rate_beta',
#     'profile_cap_usd',
#     # 'foreign-direct-investment_growth_rate_beta',
#     # 'foreign-direct-investment_stat_beta',
#     'supersector_cyclical_beta', 
#     'supersector_defensive',
#     'supersector_commodities_beta',
#     'population_stat',
#     'population_growth_rate_beta',
#     'economic-output-gdp_growth_rate_beta',
#     'share_trade_volume_growth_rate_beta',
#     'factor_momentum_relative_strength_beta',
#     'factor_profitability_beta',
#     'factor_hml',
#     'factor_value'
#     ]

# custom_drop = [c.split('_beta')[0] for c in custom_drop]
# factors_df = factors_df.drop(columns=custom_drop, errors='ignore')
# distilled_factors, drop_map = prescreen_factors(factors_df, correlation_threshold=CORRELATION_THRESHOLD)
# drop_map = merge_drop_map(drop_map)
# if drop_map:
#     display(pd.Series(drop_map))


# MANUAL FACTORS
# Earlier hand-picked factor sets, kept for reference; only the uncommented
# selection below is active.
# distilled_factors = factors_df[['factor_market_premium', 'factor_smb', 'share_trade_volume_stat','population_stat', 'supersector_defensive']].copy()
# distilled_factors = factors_df[['factor_market_premium', 'factor_smb', 'share_trade_volume_stat', 'factor_leverage', 'population_stat', 'population_growth_rate']].copy()
# distilled_factors = factors_df[['factor_market_premium', 'factor_smb', 'share_trade_volume_stat', 'factor_leverage', 'population_growth_rate', 'foreign-direct-investment_stat']].copy()
# distilled_factors = factors_df[['factor_market_premium', 'factor_smb', 'share_trade_volume_stat', 'factor_leverage', 'foreign-direct-investment_stat', 'foreign-direct-investment_growth_rate']].copy()

# Active factor set. Raises KeyError if any of these columns is absent from factors_df.
distilled_factors = factors_df[['factor_market_premium', 'factor_smb', 'share_trade_volume_stat', 'factor_leverage', 'foreign-direct-investment_stat']].copy()

# Multicollinearity check on the chosen factors; rows containing NaN are excluded first.
display(calculate_vif(distilled_factors.dropna(axis=0)))


In [None]:
# ElasticNet regression
# Runs the regression helper on the selected factors, using the hyperparameter
# grid from the constants cell (ENET_ALPHAS, ENET_L1_RATIOS, ENET_CV, ENET_TOL).
# Later cells read per-asset betas, 'r2_test', the 'cv_mse_*' columns,
# 'mse_train' and 'jensens_alpha' from results_df.
results_df = run_elastic_net(
    factors_df=distilled_factors,
    pct_changes=pct_changes,
    risk_free_df=risk_free_df,
    training_cutoff=training_cutoff,
    alphas=ENET_ALPHAS,
    l1_ratio=ENET_L1_RATIOS,
    cv=ENET_CV,
    tol=ENET_TOL,
)

In [None]:
# Post-regression factor screening
from scipy.stats import skew
from scipy.optimize import minimize_scalar

beta_cols = [col for col in results_df if col.endswith('beta')]
screening_df = pd.DataFrame(index=results_df.index)

# Out-of-sample fit: take the shortfall from the best R2, compress it with a
# log transform (scale chosen by optimize_scalar), then flip it back so that a
# higher value means a better fit.
r2_shortfall = results_df['r2_test'].max() - results_df['r2_test']
r2_compressed = np.log1p(r2_shortfall * optimize_scalar(r2_shortfall))
screening_df['r2_adj_test'] = r2_compressed.max() - r2_compressed

# Spread of the best / average / worst CV errors, log-compressed the same way.
cv_mse_spread = results_df[['cv_mse_best', 'cv_mse_average', 'cv_mse_worst']].std(axis=1)
screening_df['cv_mse_std'] = np.log1p(cv_mse_spread * optimize_scalar(cv_mse_spread))

# Reward fit quality, penalise unstable cross-validation error.
screening_df['screening_score'] = screening_df['r2_adj_test'] / (1 + screening_df['cv_mse_std'])


abs_betas = results_df[beta_cols].abs()
display_df = pd.DataFrame({
    'mean_beta': abs_betas.mean(),
    'mean_adj_beta': abs_betas.multiply(screening_df['screening_score'], axis=0).mean(),
    # Fraction of assets whose beta is meaningfully different from zero.
    'non_zero_percentage': (abs_betas > 1e-6).sum() / len(results_df),
})
display_df.sort_values(by='non_zero_percentage')

In [None]:
# Factor-based expected returns
# Keep only assets that pass all three eligibility lists.
eligible = (
    results_df.index.isin(tradable)
    & results_df.index.isin(low_real_estate)
    & results_df.index.isin(accumulating_ids)
)
results_df = results_df[eligible]
# results_df = results_df[~results_df.index.isin(gold_conids)]

# Betas with the '_beta' marker removed, ordered like the factor table.
beta_cols = [col for col in results_df.columns if col.endswith('beta')]
asset_betas = results_df[beta_cols].rename(columns=lambda col: col.replace('_beta', ''))
asset_betas = asset_betas[distilled_factors.columns]

factor_premia = distilled_factors.mean()
# factor_premia[factor_premia > 0] = 0
# factor_premia *= -1

systematic_returns = asset_betas.dot(factor_premia)
factor_based_er = results_df['jensens_alpha'] + systematic_returns

screening_df = pd.DataFrame(index=results_df.index)
# Shift so that the lowest expected return becomes zero.
screening_df['expected_return'] = factor_based_er - factor_based_er.min()

# Fit quality: log-compressed shortfall from the best R2, flipped so higher is better.
r2_shortfall = results_df['r2_test'].max() - results_df['r2_test']
r2_compressed = np.log1p(r2_shortfall * optimize_scalar(r2_shortfall))
screening_df['r2_test'] = r2_compressed.max() - r2_compressed

# CV stability: log-compressed spread of the CV errors, flipped so higher is better.
cv_mse_spread = results_df[['cv_mse_best', 'cv_mse_average', 'cv_mse_worst']].std(axis=1)
cv_compressed = np.log1p(cv_mse_spread * optimize_scalar(cv_mse_spread))
screening_df['cv_mse_std'] = cv_compressed.max() - cv_compressed

# Bring both quality scores onto [0, 1] before using them as multipliers.
quality_cols = ['r2_test', 'cv_mse_std']
scaler = MinMaxScaler()
screening_df[quality_cols] = scaler.fit_transform(screening_df[quality_cols])
screening_df['r2_adjusted_er'] = (
    screening_df['expected_return'] * screening_df['r2_test'] * screening_df['cv_mse_std']
)

mu_historical = pct_changes.mean()
screening_df['historical_er'] = mu_historical

mu_utility = screening_df['r2_adjusted_er']

In [None]:
# Factor-based covariance: systematic part from the betas, plus a diagonal
# of per-asset training MSE.
factor_cov_matrix = distilled_factors.cov()
idiosyncratic_variances = results_df['mse_train']
D = np.diag(idiosyncratic_variances)

beta_matrix = asset_betas.values
systematic_cov = beta_matrix @ factor_cov_matrix.values @ beta_matrix.T
S = pd.DataFrame(systematic_cov + D, index=results_df.index, columns=results_df.index)

# Static risk-free rate: mean of the last 10 daily observations
rf_rate = risk_free_df['daily_nominal_rate'].tail(10).mean()

In [None]:
# Optimization functions
import cvxpy as cp
from pypfopt import base_optimizer

def portfolio_factor_dispersion(portfolio_betas, n_factors):
    """Return the L2 norm of the portfolio betas after subtracting their mean.

    Args:
        portfolio_betas: cvxpy expression holding the portfolio's factor betas.
        n_factors: number of factors, used as the divisor for the mean.

    Returns:
        A cvxpy expression: ``norm(portfolio_betas - sum(portfolio_betas) / n_factors, 2)``.
    """
    average_beta = cp.sum(portfolio_betas) / n_factors
    return cp.norm(portfolio_betas - average_beta, 2)

def print_portfolio_stats(weights, mu, S, asset_betas, printing=True):
    """Summarise a weight vector and return a symbol -> conId mapping.

    Reads the notebook-level ``filtered_df`` (symbol lookup), ``risk_free_df``
    (risk-free rate) and ``ZERO`` (weight threshold).

    Args:
        weights: Series of portfolio weights indexed by conId.
        mu: expected returns passed to ``portfolio_performance``.
        S: covariance matrix passed to ``portfolio_performance``.
        asset_betas: per-asset factor betas; ``weights @ asset_betas`` gives
            the portfolio betas.
        printing: when True, print each holding and the summary statistics.

    Returns:
        dict mapping each held asset's symbol to its conId (not its weight),
        built in descending weight order.
    """
    # Holdings above the numerical-zero threshold, largest first.
    holdings = weights[weights > ZERO].sort_values(ascending=False)

    readable_weights = {}
    for con_id, weight in holdings.items():
        symbol = filtered_df.loc[filtered_df['conId'] == con_id, 'symbol'].iloc[0]
        if printing:
            print(f'{symbol}: {round(weight*100, 2)}%')
        readable_weights[symbol] = con_id

    # Risk-free rate: mean of the last 10 daily observations.
    rf_rate = risk_free_df['daily_nominal_rate'].iloc[-10:].mean()
    er, volatility, sharpe_ratio = base_optimizer.portfolio_performance(weights, mu, S, risk_free_rate=rf_rate)
    final_portfolio_betas = weights @ asset_betas

    if printing:
        print(f'beta_std: {final_portfolio_betas.std()}')
        print(f'er: {er}')
        print(f'volatility: {volatility}')
        print(f'sharpe: {sharpe_ratio}\n')

    return readable_weights

def reduce_inputs(conids, mu_utility, mu_hist, S, asset_betas):
    """Restrict all optimiser inputs to the given conIds, in the given order.

    Args:
        conids: labels of the assets to keep.
        mu_utility: Series of utility-adjusted expected returns.
        mu_hist: Series of historical mean returns.
        S: square covariance DataFrame labelled by conId on both axes.
        asset_betas: DataFrame of factor betas indexed by conId.

    Returns:
        Tuple ``(mu_utility, mu_hist, S, asset_betas)``, each subset to ``conids``.
    """
    return (
        mu_utility[conids],
        mu_hist[conids],
        # Select the columns first, then the matching rows.
        S.loc[:, conids].loc[conids],
        asset_betas.loc[conids],
    )


def optimize_convex(mu, S, asset_betas, lambda_dispersion, lambda_risk, upper_bounds, factor_constraints=None, solver='CLARABEL'):
    """Solve the long-only, fully invested allocation problem.

    Maximises ``mu @ w - lambda_dispersion * dispersion(w) - lambda_risk * w' S w``
    subject to ``sum(w) == 1`` and ``0 <= w <= upper_bounds``, plus optional
    bounds on individual portfolio factor betas.

    Args:
        mu: Series of expected returns; its index labels the returned weights.
        S: covariance matrix used in the quadratic risk term.
        asset_betas: DataFrame of factor betas (assets x factors).
        lambda_dispersion: penalty weight on the factor-beta dispersion.
        lambda_risk: penalty weight on portfolio variance.
        upper_bounds: upper limit applied to the weights.
        factor_constraints: optional dict ``{factor: {'min': x, 'max': y}}``.
            Factors not found in ``asset_betas.columns`` are skipped with a
            printed warning.
        solver: cvxpy solver name.

    Returns:
        Series of weights indexed like ``mu``, or None (after printing a
        warning) when the status is not optimal / optimal_inaccurate.
    """
    n_assets, n_factors = asset_betas.shape
    w = cp.Variable(n_assets)
    portfolio_betas = w @ asset_betas.values

    expected_return = mu.values @ w
    portfolio_risk = cp.quad_form(w, S)
    factor_dispersion = portfolio_factor_dispersion(portfolio_betas, n_factors)
    objective = cp.Maximize(
        expected_return
        - factor_dispersion * lambda_dispersion
        - portfolio_risk * lambda_risk
    )

    # Fully invested, long-only, capped position sizes.
    constraints = [cp.sum(w) == 1, w >= 0, w <= upper_bounds]
    for factor, limits in (factor_constraints or {}).items():
        try:
            factor_idx = asset_betas.columns.get_loc(factor)
            if 'min' in limits:
                constraints.append(portfolio_betas[factor_idx] >= limits['min'])
            if 'max' in limits:
                constraints.append(portfolio_betas[factor_idx] <= limits['max'])
        except KeyError:
            print(f"Warning: Factor '{factor}' not found in asset_betas.columns. Skipping constraint.")

    problem = cp.Problem(objective, constraints)
    problem.solve(solver=solver)

    if problem.status in ("optimal", "optimal_inaccurate"):
        return pd.Series(w.value, index=mu.index)

    print(f"Warning: Optimal solution not found. Status: {problem.status}")
    return None


def optimize_non_convex(mu, S, asset_betas, lambda_dispersion, lambda_risk, upper_bounds, max_assets, factor_constraints=None, solver='SCIP'):
    """Solve the allocation problem with a cap on the number of held assets.

    Uses the same objective as the convex variant
    (``mu @ w - dispersion(w) * lambda_dispersion - (w' S w) * lambda_risk``)
    and adds one boolean selector per asset: ``w <= upper_bounds * z`` forces
    a weight to zero unless its selector is on, and ``sum(z) <= max_assets``
    limits how many selectors may be on.

    Args:
        mu: per-asset expected returns; `.values` feeds the objective and
            `.index` labels the returned weights.
        S: matrix handed to `cp.quad_form` as the risk term.
            NOTE(review): assumed to be aligned with `mu` / `asset_betas` rows — confirm.
        asset_betas: DataFrame with one row per asset and one column per factor.
        lambda_dispersion: multiplier on the `portfolio_factor_dispersion` penalty.
        lambda_risk: multiplier on the quadratic risk penalty.
        upper_bounds: upper limit for the weight of a selected asset.
        max_assets: maximum number of assets allowed a non-zero weight.
        factor_constraints: optional mapping ``factor -> {'min': ..., 'max': ...}``
            (either key may be absent) bounding the portfolio beta to that
            factor. Factors missing from `asset_betas.columns` are skipped
            with a printed warning.
        solver: solver name forwarded to `problem.solve`.

    Returns:
        pd.Series built from `w.value`, indexed by `mu.index`. Unlike
        `optimize_convex`, a non-optimal status only prints a warning and the
        Series is still returned.
        NOTE(review): cvxpy presumably leaves `w.value` as None on failure,
        which would make this an all-NaN Series — confirm.
    """
    n_assets, n_factors = asset_betas.shape
    w = cp.Variable(n_assets)
    selected = cp.Variable(n_assets, boolean=True)

    # Building blocks of the objective, all expressed in terms of w.
    exposures = w @ asset_betas.values
    reward = mu.values @ w
    variance = cp.quad_form(w, S)
    dispersion = portfolio_factor_dispersion(exposures, n_factors)

    dispersion_penalty = dispersion * lambda_dispersion
    risk_penalty = variance * lambda_risk
    objective = cp.Maximize(reward - dispersion_penalty - risk_penalty)

    constraints = [
        cp.sum(w) == 1,
        w >= 0,
        cp.sum(selected) <= max_assets,
        w <= upper_bounds * selected,
    ]
    for factor, limits in (factor_constraints or {}).items():
        try:
            column = asset_betas.columns.get_loc(factor)
        except KeyError:
            print(f"Warning: Factor '{factor}' not found in asset_betas.columns. Skipping constraint.")
            continue
        if 'min' in limits:
            constraints.append(exposures[column] >= limits['min'])
        if 'max' in limits:
            constraints.append(exposures[column] <= limits['max'])

    problem = cp.Problem(objective, constraints)
    problem.solve(solver=solver)#, verbose=True)

    solved = problem.status in ("optimal", "optimal_inaccurate")
    if not solved:
        print(f"Warning: Optimal solution not found. Status: {problem.status}")

    return pd.Series(w.value, index=mu.index)


In [None]:
# # Filtering and discrete optimizations
# lambda_risk = 1
# lambda_dispersion = 1
# upper_bounds = 1
# my_constraints = {'factor_market_premium': {'min': asset_betas['factor_market_premium'].mean()}}

# weights = optimize_convex(
#     mu=mu_utility,
#     S=S,
#     asset_betas=asset_betas,
#     lambda_risk=lambda_risk, 
#     lambda_dispersion=lambda_dispersion,
#     upper_bounds=upper_bounds,
#     factor_constraints=my_constraints
# )

# # Multiple optimizations
# first_weights = weights.sort_values(ascending=False).head(200)
# mu_post, mu_hist_post, S_post, betas_post = reduce_inputs(first_weights.index.tolist(), mu_utility, mu_historical, S, asset_betas)

# # evaluated_results = []
# test_range = range(1, 6)
# for max_assets in tqdm(test_range, total = len(test_range)):
#     post_weights = optimize_non_convex(
#         mu=mu_post,
#         S=S_post,
#         asset_betas=betas_post,
#         lambda_dispersion=lambda_dispersion,
#         lambda_risk=lambda_risk,
#         upper_bounds=upper_bounds,
#         max_assets = max_assets,
#         factor_constraints=my_constraints
#     )

#     non_zero_weights = post_weights[post_weights > ZERO]
#     num_assets = len(non_zero_weights)
#     if num_assets != max_assets:
#         break

#     er_model, std_model, sharpe_model = base_optimizer.portfolio_performance(post_weights, mu_post, S_post, risk_free_rate=rf_rate)
#     er_hist, _, sharpe_hist = base_optimizer.portfolio_performance(post_weights, mu_hist_post, S_post, risk_free_rate=rf_rate)
#     readable_weights = print_portfolio_stats(post_weights, mu_post, S_post, betas_post, printing=False)
#     final_portfolio_betas = post_weights @ betas_post

#     results_row = {
#         'sharpe_model': sharpe_model,
#         'sharpe_hist': sharpe_hist,
#         'er_model': er_model,
#         'er_hist': er_hist,
#         'volatility': std_model,
#         'factor_beta_std': final_portfolio_betas.std(),
#         'num_assets': num_assets,
#         'readable': readable_weights
#     }

#     results_row.update(non_zero_weights.to_dict())
#     evaluated_results.append(results_row)

# pd.DataFrame(evaluated_results)

In [None]:
# evaluated_ids = fund_df[(fund_df['conId'].isin([col for col in pd.DataFrame(evaluated_results).columns if isinstance(col, int)])) & (fund_df['currency'] == 'EUR')].sort_values(by=['profile_cap_usd'], ascending=[False])['conId'].tolist()

In [None]:
# Hand-picked conids
# Candidate funds grouped by theme, one dict per group.
# - Values are conIds: the cells below match them against fund_df['conId'] and
#   against mu_utility.index.
# - Keys look like ticker symbols; only `.values()` is consumed below, so they
#   act as labels here — TODO confirm they match fund_df['symbol'].
# - Comment an entry in or out to change the candidate set. The groups are
#   combined into `custom_ids` in the "OPTIMIZATION ONCE" cell, which also
#   decides (by commenting) which groups take part.
china_ids = {
    # '36BZ': 288308331,
    # 'HSTE': 460091028,
    # 'XCS6': 92975937,
    # 'L4K3': 358794154,
    'DBX9': 45160542,
    'UIC2': 478600590,
    # 'M9SV': 321415138,
    # 'UIC1': 478600587,
    # 'CASH1': 151625241,
}
us_ids = {
    # 'IWDA': 100292038,
    # 'XDWD': 163606916,
    # 'SPPW': 355720826,
    # 'RBOT': 254447324,
    # 'SMH': 458525706,
    'LGUS': 349753789,
    # 'CHIP': 354802180,
    'LGGL': 349753783,
    # 'HSUD': 430252007,
    # 'ROBO': 171977881,
    # 'WCLD': 386371817,
    # 'SKYE': 348172496,
    # 'UNCA': 276188942,
}
eur_ids = {
    'MEUD': 125902391,
    'CSSX5E': 75961307,
    # 'XESC': 59141442,
    # 'MSE': 29612106,
    # 'EXV1': 89005163,
    # 'C6E': 314449634,
    # 'EXIE': 617184924,
    # 'STW': 173940943,
    # 'STK': 173940919,
    # 'PRAE': 401624582,
    # 'STEC': 555440394,

}
gold_ids = {
    'IS0E': 239379050,
    'GDX1': 277212416,
    'GDXJ': 277212426,
}
oil_ids = {
    'IS0D': 239379033,
    'EXH1': 89005199,
    'V0IH': 624398537,
}
emerging_ids = {
    # 'HMEM': 279158242,
    'SPYM': 89384980,
    'HMAF': 135536495,
    'DEM': 181044389, # conflicts with DBX9
    'EMQQ': 336523697, # conflicts with UIC2
    # 'IFFI': 438873311,
}
# NOTE(review): japan_ids and korea_ids are not referenced by the live
# "OPTIMIZATION ONCE" cell (not even as a commented-out term) — confirm whether
# they are still needed.
japan_ids = {
    'IJPA': 80268543,
    'LCUJ': 311572503,
    'DXJF': 211920310,
}
korea_ids = {
    'IKRA': 37036642,
    'HKOR': 349836421,
    'CSKR': 79020239,
    'DBX8': 46041699,
}
# optimal_ids = {}
# for d in pd.DataFrame(evaluated_results).readable:
#     optimal_ids.update(d)

# fund_df[fund_df['conId'].isin(china_ids.values())].sort_values(by=['profile_cap_usd'], ascending=[False])[['conId', 'symbol', 'longName', 'profile_cap_usd', 'isin', 'search_exchange']]


In [None]:
# # OPTIMIZATION loop hand-picked ids
# custom_ids = (
#     list(china_ids.values())
#     + list(us_ids.values())
#     + list(eur_ids.values())
#     # + list(gold_ids.values())
#     # + list(oil_ids.values())
#     + list(emerging_ids.values())
#     # + list(japan_ids.values())
#     # + list(korea_ids.values())
#     # + list(optimal_ids.values())
#     # + evaluated_ids
#     )

# custom_ids = list(set([id for id in custom_ids if id in mu_utility.index]))
# mu_post, mu_hist_post, S_post, betas_post = reduce_inputs(custom_ids, mu_utility, mu_historical, S, asset_betas)

# lambda_risk = 1
# lambda_dispersion = 0
# upper_bounds = 1
# factor_constraints = {'factor_market_premium': {'min': asset_betas['factor_market_premium'].mean()}}

# evaluated_results = []
# test_range = range(1, 6)
# for max_assets in tqdm(test_range, total = len(test_range)):
#     post_weights = optimize_non_convex(
#         mu=mu_post,
#         S=S_post,
#         asset_betas=betas_post,
#         lambda_dispersion=lambda_dispersion,
#         lambda_risk=lambda_risk,
#         upper_bounds=upper_bounds,
#         max_assets = max_assets,
#         factor_constraints=factor_constraints
#     )

#     non_zero_weights = post_weights[post_weights > ZERO]
#     num_assets = len(non_zero_weights)
#     if num_assets != max_assets:
#         break

#     er_model, std_model, sharpe_model = base_optimizer.portfolio_performance(post_weights, mu_post, S_post, risk_free_rate=rf_rate)
#     er_hist, _, sharpe_hist = base_optimizer.portfolio_performance(post_weights, mu_hist_post, S_post, risk_free_rate=rf_rate)
#     readable_weights = print_portfolio_stats(post_weights, mu_post, S_post, betas_post, printing=False)
#     final_portfolio_betas = post_weights @ betas_post

#     # Create a dictionary with the scalar performance metrics
#     results_row = {
#         'sharpe_model': sharpe_model,
#         'sharpe_hist': sharpe_hist,
#         'er_model': er_model,
#         'er_hist': er_hist,
#         'volatility': std_model,
#         'factor_beta_std': final_portfolio_betas.std(),
#         'num_assets': num_assets,
#         'readable': readable_weights
#     }

#     results_row.update(non_zero_weights.to_dict())
#     evaluated_results.append(results_row)

# # pd.DataFrame(evaluated_results)

In [None]:
# # OPTIMIZATION loop through auto-selected ids
# lambda_risk = 1
# lambda_dispersion = 1
# upper_bounds = 1
# my_constraints = {
#     'factor_market_premium': {'min': asset_betas['factor_market_premium'].mean()}
# }

# weights = optimize_convex(
#     mu=mu_utility,
#     S=S,
#     asset_betas=asset_betas,
#     lambda_risk=lambda_risk, 
#     lambda_dispersion=lambda_dispersion,
#     upper_bounds=upper_bounds,
#     factor_constraints=my_constraints
# )

# # Multiple optimizations
# first_weights = weights.sort_values(ascending=False).head(500)
# mu_post, mu_hist_post, S_post, betas_post = reduce_inputs(first_weights.index.tolist(), mu_utility, mu_historical, S, asset_betas)

# evaluated_results = []
# test_range = range(2, 6)
# for max_assets in tqdm(test_range, total = len(test_range)):
#     post_weights = optimize_non_convex(
#         mu=mu_post,
#         S=S_post,
#         asset_betas=betas_post,
#         lambda_dispersion=lambda_dispersion,
#         lambda_risk=lambda_risk,
#         upper_bounds=upper_bounds,
#         max_assets = max_assets,
#         factor_constraints=my_constraints
#     )
#     readable_weights = print_portfolio_stats(post_weights, mu_post, S_post, betas_post)

#     num_assets = len(post_weights[post_weights > ZERO])
#     assert num_assets <= max_assets

#     er_model, std_model, sharpe_model = base_optimizer.portfolio_performance(post_weights, mu_post, S_post, risk_free_rate=rf_rate)
#     er_hist, _, sharpe_hist = base_optimizer.portfolio_performance(post_weights, mu_hist_post, S_post, risk_free_rate=rf_rate)
#     final_portfolio_betas = post_weights @ betas_post

#     evaluated_results.append({
#         'sharpe_model': sharpe_model,
#         'sharpe_hist': sharpe_hist,
#         'er_model': er_model,
#         'er_hist': er_hist,
#         'volatility': std_model,
#         'factor_beta_std': final_portfolio_betas.std(),
#         'num_assets': num_assets,
#         'weights': post_weights[post_weights > ZERO],
#         'readable': readable_weights
#     })

# pd.DataFrame(evaluated_results)

In [None]:
# OPTIMIZATION ONCE hand-picked ids
# Builds the candidate list from the hand-picked groups, restricts the model
# inputs to it and solves for a portfolio of at most `max_assets` holdings.
custom_ids = (
    list(china_ids.values())
    # + list(us_ids.values())
    + list(eur_ids.values())
    # + list(gold_ids.values())
    # + list(oil_ids.values())
    + list(emerging_ids.values())
    # + list(optimal_ids.values())
    )


# custom_ids = [125902391, 45160542, 478600590, 75961307]

# Keep only ids present in mu_utility; set() removes ids listed in more than one group.
custom_ids = list(set([conid for conid in custom_ids if conid in mu_utility.index]))
custom_mu, custom_mu_hist, custom_S, custom_asset_betas = reduce_inputs(custom_ids, mu_utility, mu_historical, S, asset_betas)

lambda_risk = 1
lambda_dispersion = 1
upper_bounds = 1
max_assets = 2
# Defined here so the cell runs on a fresh kernel: in this part of the notebook
# the constraint dict was only assigned inside commented-out cells (under the
# names `my_constraints` and `factor_constraints`), so the calls below relied
# on leftover kernel state. The save cell further down records this same dict.
# The floor is the mean market-premium beta over the full `asset_betas`
# universe, as in those commented-out cells.
factor_constraints = {'factor_market_premium': {'min': asset_betas['factor_market_premium'].mean()}}

# Relaxed solve without the holdings cap. Kept under its own name for
# reference; previously it was assigned to `custom_weights` and immediately
# overwritten by the capped solve below.
convex_weights = optimize_convex(
    mu=custom_mu,
    S=custom_S,
    asset_betas=custom_asset_betas,
    lambda_risk=lambda_risk, 
    lambda_dispersion=lambda_dispersion,
    upper_bounds=upper_bounds,
    factor_constraints=factor_constraints
)
# Capped solve: `custom_weights` is what the following cells report and save.
custom_weights = optimize_non_convex(
    mu=custom_mu,
    S=custom_S,
    asset_betas=custom_asset_betas,
    lambda_dispersion=lambda_dispersion,
    lambda_risk=lambda_risk,
    upper_bounds=upper_bounds,
    max_assets=max_assets,
    factor_constraints=factor_constraints
)
# Same weights evaluated twice: first against custom_mu, then against custom_mu_hist.
_ = print_portfolio_stats(custom_weights, custom_mu, custom_S, custom_asset_betas)
_ = print_portfolio_stats(custom_weights, custom_mu_hist, custom_S, custom_asset_betas)
# pd.DataFrame(evaluated_results)

In [None]:
# Save rebalancing weights
# Records the current portfolio as one row and merges it into
# data/final_weights.csv, keeping a single row per save_date.
final_conids = custom_weights[custom_weights > ZERO]  # weights above the ZERO cutoff, indexed like custom_weights
funds_dates = fund_df[fund_df['conId'].isin(final_conids.index)].set_index('conId')['funds_date']
final_df = pd.DataFrame([{
    'save_date': datetime.now().strftime('%Y-%m-%d'),
    'weights': final_conids.to_dict(),
    'training_oldest': training_oldest.strftime('%Y-%m-%d'),
    'last_date': last_date.strftime('%Y-%m-%d'),
    'mean_funds_date': funds_dates.mean().strftime('%Y-%m-%d'),
    'fund_date': funds_dates.dt.strftime('%Y-%m-%d').to_dict(),
    'factors': display_df.index.to_list(),
    'lambda_risk': lambda_risk,
    'lambda_dispersion': lambda_dispersion,
    'factor_constraints': factor_constraints,
}])
try:
    temp_df = pd.read_csv('data/final_weights.csv')
    # keep='last': the new row is last in the concat, so re-running on the same
    # day replaces that day's entry. The default keep='first' would keep the
    # stale row from the file and silently discard the freshly computed weights.
    final_df = (
        pd.concat([temp_df, final_df])
        .drop_duplicates(subset='save_date', keep='last')
        .dropna(axis=1, how='all')
    )
except FileNotFoundError:
    # No history file yet: the new row becomes the whole file.
    pass
print('saving...')
final_df.to_csv('data/final_weights.csv', index=False)

final_df

In [None]:
# fund_df[(fund_df['longName'].str.contains('spdr europe heal', case=False, na=False) 
#          & fund_df['conId'].isin(tradable) 
#          & fund_df['currency'].isin(['EUR']))].sort_values(by=['profile_cap_usd'], ascending=[False])

# fund_df rows for the ids in custom_ids, sorted by profile_cap_usd descending.
fund_df.loc[fund_df['conId'].isin(custom_ids)].sort_values('profile_cap_usd', ascending=False)

In [None]:
from matplotlib import pyplot as plt

# Quick visual check of one fund's `average` series.
symbol = 'SPYM'

# Filter once; the same selection feeds the plot and the table shown at the end.
symbol_rows = filtered_df[filtered_df['symbol'] == symbol]
# The 'df' entry of the last matching row holds the data to plot.
symbol_history = symbol_rows['df'].iloc[-1]

x = symbol_history.index
y = symbol_history.average

fig, ax = plt.subplots()
ax.plot(x, y)
ax.set(title=f'{symbol}: average', ylabel='average')
plt.show()
symbol_rows