In [None]:
import numpy as np
import pandas as pd
import re
import ast
import pycountry
import requests
from tqdm import tqdm
import os

In [None]:
# Prep functions
def evaluate_literal(val):
    """Parse ``val`` as a Python literal, falling back to ``val`` itself.

    Anything ``ast.literal_eval`` rejects with ``ValueError`` or
    ``SyntaxError`` (plain text, non-string inputs, NaN, ...) is returned
    unchanged.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val
    else:
        return parsed
    
def load(path):
    """Read the CSV at ``path`` and literal-evaluate every cell.

    Each column is passed element-wise through ``evaluate_literal`` so that
    cells holding stringified Python literals (lists, tuples, numbers, ...)
    come back as real objects; everything else is left as read.
    """
    frame = pd.read_csv(path)
    for column_name in frame.columns:
        parsed_column = frame[column_name].apply(evaluate_literal)
        frame[column_name] = parsed_column
    return frame

def save(df, path='data/contract_elaborated.csv'):
    """Validate, clean and persist ``df``, merging with what is already on disk.

    Args:
        df: Frame of scraped contract rows.
        path: CSV to merge with and overwrite. Defaults to the file the rest
            of the notebook reads.

    Steps:
        1. Keep only rows accepted by ``is_row_valid`` and clean them.
        2. If ``path`` exists, append its (cleaned) rows and drop exact repeats
           on (symbol, exact_search, search_exchange, search_symbol); rows
           from ``df`` win because they come first.
        3. For symbols that still appear more than once, drop the rows whose
           ``exact_search`` is False.
        4. Write the result to ``path`` without the index.
    """
    valid_rows = df.apply(is_row_valid, axis=1)
    # Copy the slice so clean_df never writes into a view of the caller's frame.
    final_df = clean_df(df[valid_rows].copy())
    try:
        previous_df = clean_df(load(path))
        # ignore_index: both frames usually carry 0..n labels; keeping them
        # would make any later label-based operation hit rows from both.
        final_df = pd.concat([final_df, previous_df], ignore_index=True).drop_duplicates(
            subset=['symbol', 'exact_search', 'search_exchange', 'search_symbol']
        )
    except FileNotFoundError:
        pass

    # Filter out the duplicates where 'exact_search' is False.
    # A positional boolean mask is used instead of DataFrame.drop(index_labels):
    # dropping by label removes every row sharing that label, which silently
    # deleted unrelated rows when the index was not unique.
    duplicated_symbol = final_df.duplicated(subset='symbol', keep=False)
    inexact_search = final_df['exact_search'] == False
    final_df = final_df[~(duplicated_symbol & inexact_search)]

    final_df.to_csv(path, index=False)

def is_numerical(val):
    """Return True when ``val``, stringified and stripped of '%', parses as a float."""
    try:
        float(str(val).replace('%', ''))
    except Exception:
        return False
    return True

def is_valid_tuple(tuple, column):
    """Decide whether one ``(label, value)`` entry of a list cell is acceptable.

    Args:
        tuple: The ``(label, value)`` pair to check. (The name shadows the
            builtin; it is kept because it is part of the signature.)
        column: Name of the column the pair came from; selects the
            column-specific rules below.

    Returns:
        True if the pair is acceptable, False otherwise. Rules, in order:
          * anything that does not unpack into exactly two items is rejected;
          * the label must be a string;
          * a ``None`` value is accepted (lenient filter);
          * any value accepted by ``is_numerical`` is accepted;
          * 'profile': every remaining value is accepted;
          * 'fundamentals': upper-case strings are accepted;
          * 'dividends': ``'Unknown'`` or a string with a leading number;
          * 'style': booleans are accepted.
    """
    def extract_float(text):
        # First run of digits/dots/commas after any non-digit prefix.
        match = re.match(r'[^0-9]*([0-9.,]+)', text)
        if not match:
            return None
        try:
            return float(match.group(1).replace(',', ''))
        except ValueError:
            # e.g. '1.2.3' or a bare ',' matched by the pattern.
            return None

    try:
        label, value = tuple
    except (TypeError, ValueError):
        # Not a 2-item sequence: treat as invalid instead of crashing the scan.
        return False

    if not isinstance(label, str):
        return False
    if value is None:
        return True  # Return False here for a more rigid filter.
    if is_numerical(value):
        return True

    if column == 'profile':
        return True
    if column == 'fundamentals':
        # Non-string values (e.g. bools) have no .isupper(); reject them.
        return isinstance(value, str) and value.isupper()
    if column == 'dividends':
        if not isinstance(value, str):
            return False
        return value == 'Unknown' or extract_float(value) is not None
    if column == 'style':
        return isinstance(value, bool)
    return False

def is_row_valid(row):
    """Return True when every list-valued cell of ``row`` holds only valid entries.

    Each entry of each list cell is checked with ``is_valid_tuple`` using the
    cell's column name. The first rejected entry is printed and the row is
    reported invalid. Non-list cells are ignored.
    """
    # A per-column length check for 'fundamentals' (accepted sizes 4, 5, 21,
    # 22, plus 23 for minor scraping bugs) used to live here and is disabled.
    for col in row.index:
        cell = row[col]
        if not isinstance(cell, list):
            continue
        for entry in cell:
            if is_valid_tuple(entry, col):
                continue
            print(entry)
            return False
    return True

def has_bad_multiplier(long_name):
    """Return True if ``long_name`` contains a multiplier token above 1X.

    '-' and '+' are removed first, so '-2X' / '+3X' count as '2X' / '3X'.
    A token only counts when it is exactly digits followed by an upper-case
    'X'. Non-string names (NaN, or numbers produced by literal evaluation of
    the CSV) cannot carry a multiplier and yield False instead of raising.
    """
    if not isinstance(long_name, str):
        return False
    cleaned = long_name.replace('-', '').replace('+', '')
    return any(
        re.fullmatch(r'\d+X', word) is not None and int(word[:-1]) > 1
        for word in cleaned.split()
    )

def get_remaining():
    """Return the contracts from 'data/contract_details.csv' still to be processed.

    When 'data/contract_elaborated.csv' exists, symbols of its valid rows are
    excluded if the row has ``exchange_bug`` True, ``exact_search`` True, or a
    non-null ``profile``. When the file is missing, every contract remains.
    Contracts whose ``longName`` is flagged by ``has_bad_multiplier`` are
    dropped in both cases, and only the identifying columns are returned.
    """
    output_columns = ['symbol', 'exchange', 'primaryExchange', 'validExchanges', 'currency', 'conId', 'longName', 'stockType', 'isin']
    contract_details = load('data/contract_details.csv')
    try:
        elaborated = load('data/contract_elaborated.csv')
        elaborated = elaborated[elaborated.apply(is_row_valid, axis=1)]

        has_exchange_bug = elaborated['exchange_bug'] == True
        found_exactly = elaborated['exact_search'] == True
        has_profile = elaborated['profile'].notna()
        # Alternative (stricter re-scrape): leave `has_profile` out of the mask.
        already_handled = has_exchange_bug | found_exactly | has_profile

        symbols_to_exclude = elaborated[already_handled]['symbol']
        remaining = contract_details[~contract_details['symbol'].isin(symbols_to_exclude)]

        # To debug invalid rows instead, use:
        #   remaining = elaborated[~elaborated.apply(is_row_valid, axis=1)]
    except FileNotFoundError:
        remaining = contract_details.copy()

    flagged = remaining['longName'].apply(has_bad_multiplier)
    remaining = remaining[~flagged]
    return remaining[output_columns]

In [None]:
# Cleaning functions
def clean_labels(label, col):
    """Normalise a single label according to the column it belongs to.

    Args:
        label: The label to clean; non-strings are returned untouched.
        col: Source column name.

    Returns:
        The cleaned label:
          * 'industries': the '-Discontinuedeff09/19/2020' suffix is removed.
            Only the suffix is stripped, so hyphenated names such as
            'NotClassified-NonEquity' survive intact (splitting on the first
            '-' used to truncate them).
          * 'holding_types': a leading '■' or '1' is removed.
          * 'debtors': full-width '（' becomes '('.
          * 'fundamentals': 'LTDebt/ShareholdersEquity' is renamed to
            'LTDebt/Shareholders'.
          * any other column: unchanged.
    """
    if not isinstance(label, str):
        return label

    if col == 'industries':
        discontinued_suffix = '-Discontinuedeff09/19/2020'
        if label.endswith(discontinued_suffix):
            return label[:-len(discontinued_suffix)]
    elif col == 'holding_types':
        if label.startswith(('■', '1')):
            return label[1:]
    elif col == 'debtors':
        return label.replace('（', '(')
    elif col == 'fundamentals':
        if label == 'LTDebt/ShareholdersEquity':
            return 'LTDebt/Shareholders'
    return label
    
def correct_digit(value_str):
    """Keep only digits, '.' and '-' from ``value_str`` and convert to float.

    Returns the input unchanged when the conversion fails for any reason
    (nothing numeric left, malformed number, non-string input, ...).
    """
    try:
        numeric_chars = re.findall(r'[\d.-]', value_str)
        return float(''.join(numeric_chars).strip())
    except Exception:
        return value_str

def clean_values(value_str, col):
    """Convert a scraped value to a number where possible.

    Args:
        value_str: Raw value from a ``(label, value)`` pair.
        col: Source column name; 'profile' values are never converted.

    Returns:
        * the input unchanged for 'profile' or for non-string input;
        * for strings ending in '%': the parsed number divided by 100, or the
          original string when no number can be parsed (dividing the unparsed
          string by 100 used to raise ``TypeError``);
        * otherwise whatever ``correct_digit`` returns (a float, or the
          original string when it is not numeric).
    """
    if col == 'profile' or not isinstance(value_str, str):
        return value_str

    if value_str.endswith('%'):
        parsed = correct_digit(value_str.replace('%', ''))
        # correct_digit hands back its input on failure, so check the type.
        if isinstance(parsed, float):
            return parsed / 100
        return value_str

    # correct_digit never raises, so no extra guard is needed here.
    return correct_digit(value_str)

def clean_df(df):
    """Return a cleaned copy of ``df``.

    For every column, each cell is literal-evaluated. Cells that are lists
    then have their 2-tuples cleaned (label via ``clean_labels``, value via
    ``clean_values``) and are sorted by label. Other cells and non-pair list
    items are passed through.

    The input frame is not modified: work happens on a copy, which also avoids
    chained-assignment problems when ``df`` is a filtered slice. Callers must
    use the return value (all existing callers already do).
    """
    def _clean_item(item, col):
        # Only (label, value) pairs are cleaned.
        if isinstance(item, tuple) and len(item) == 2:
            return (clean_labels(item[0], col), clean_values(item[1], col))
        return item

    def _sort_key(item):
        # Falsy/missing labels sort first. str() keeps the comparison
        # well-defined if a non-string label slips through, and the `item`
        # truthiness check protects against an empty tuple.
        if isinstance(item, tuple) and item and item[0]:
            return str(item[0])
        return ''

    def _clean_cell(cell, col):
        cell = evaluate_literal(cell)
        if not isinstance(cell, list):
            return cell
        return sorted((_clean_item(item, col) for item in cell), key=_sort_key)

    df = df.copy()
    for col in df.columns:
        df[col] = df[col].apply(lambda cell, col=col: _clean_cell(cell, col))
    return df

In [None]:
# Explode columns
# Load the elaborated contracts, clean them, keep only rows that pass
# validation, then turn every list-of-(label, value) column into one numeric
# column per label named '<column>_<label>'.
contracts_df = load('data/contract_elaborated.csv')
contracts_df = clean_df(contracts_df)
contracts_df = contracts_df[contracts_df.apply(is_row_valid, axis=1)]

# Initial asset-class flags: 'bond' when any bond-specific column is present;
# the remaining classes start as False.
contracts_df['bond'] = contracts_df[['debtors', 'maturity', 'debt_type']].notna().any(axis=1).astype(bool)
contracts_df[['equity', 'cash', 'other']] = False

# Per column, the labels that stand for "unclassified / other". The FIRST entry
# of each list is the bucket that receives any missing weight below.
empty_subcategories = {
'holding_types': ['Other'],
'countries': ['Unidentified'], 
'currencies': ['<NoCurrency>'],
'industries': ['NonClassifiedEquity', 'NotClassified-NonEquity'],
'top10': ['OtherAssets', 'AccountsPayable','AccountsReceivable','AccountsReceivable&Pay','AdministrationFees','CustodyFees','ManagementFees','OtherAssetsandLiabilities','OtherAssetslessLiabilities', 'OtherFees','OtherLiabilities','Tax','Tax--ManagementFees'],
'debtors': ['OTHER'],
'maturity': ['%MaturityOther'],
'debt_type': ['%QualityNotAvailable', '%QualityNotRated'],
}

# Snapshot of the columns before exploding; a later cell uses it to tell
# original columns from generated ones.
original_columns = contracts_df.columns
columns_to_explode = ['profile', 'holding_types', 'top10', 'countries', 'fundamentals', 'industries', 'currencies', 'style', 'debtors', 'maturity', 'debt_type', 'lipper', 'dividends']

# 'lipper' and 'dividends' are skipped here and dropped at the end of the cell.
columns_to_ignore = ['lipper', 'dividends']
columns_to_explode = [col for col in columns_to_explode if col not in columns_to_ignore]

# Every exploded column not listed here is treated as a set of weights
# (clipped at 0 and made to sum to 1 per row).
non_percentage_columns = ['profile', 'style', 'lipper', 'fundamentals', 'dividends']
percentage_columns = [col for col in columns_to_explode if col not in non_percentage_columns]
for col in columns_to_explode:
    print(col)
    # Missing cells become empty lists so explode() yields a single NaN row.
    contracts_df[col] = contracts_df[col].fillna('[]')
    contracts_df[col] = contracts_df[col].apply(evaluate_literal)

    # Explode and create pivot_df
    # After explode() the index repeats once per list item; that repeated index
    # is what groups the items of one contract back together in pivot_table.
    contracts_df = contracts_df.explode(col)
    contracts_df[col] = contracts_df[col].apply(lambda x: (None, None) if pd.isna(x) else x)
    contracts_df[['label', 'value']] = pd.DataFrame(contracts_df[col].tolist(), index=contracts_df.index)

    pivot_df = contracts_df.pivot_table(index=contracts_df.index, columns='label', values='value', aggfunc='first')
    pivot_df.rename(columns={label: f'{col}_{label}' for label in pivot_df.columns}, inplace=True)

    # Drop unnecessary columns and align pivot_df with contracts_df
    # drop_duplicates on conId collapses the exploded rows back to one per contract.
    contracts_df = contracts_df.drop(columns=[col, 'label', 'value'], axis=1).drop_duplicates(subset='conId')
    pivot_df = pivot_df.reindex(contracts_df.index)
    
    # Correct pivot_df values
    if col in percentage_columns:
        pivot_df = pivot_df.fillna(0.0).clip(lower=0)
        columns_to_drop = [f'{col}_{label}' for label in empty_subcategories[col]]

        # Scale value sum to be <= 1
        pivot_cols_sum = pivot_df.sum(axis=1)
        rows_greater_than = pivot_cols_sum > 1
        pivot_df.loc[rows_greater_than] = pivot_df.loc[rows_greater_than].div(pivot_cols_sum[rows_greater_than], axis=0)

        # Guarantee value sum == 1
        pivot_cols_sum = pivot_df.sum(axis=1)
        rows_less_than = pivot_cols_sum < 1
        missing_value = (1 - pivot_cols_sum[rows_less_than])

        # The shortfall is added to the first "empty" bucket of this column,
        # creating that bucket if no contract reported it.
        empty_column = columns_to_drop[0]
        if empty_column not in pivot_df.columns:
                pivot_df[empty_column] = 0.0

        pivot_df.loc[rows_less_than, empty_column] = pivot_df.loc[rows_less_than, empty_column] + missing_value

        # Create variety columns
        # Row-wise sum of squared weights over every column currently in pivot_df.
        pivot_df[f'{col}_variety'] = pivot_df.pow(2).sum(axis=1)

        # # To avoid multicollinearity
        # pivot_df = pivot_df.drop(columns=columns_to_drop, axis=1, errors='ignore')

        # Drop top10 company columns
        # Only 'top10_variety' survives for top10.
        if col == 'top10':
            columns_to_drop = [column for column in pivot_df.columns if column != f'{col}_variety']
            pivot_df = pivot_df.drop(columns=columns_to_drop, axis=1)

    # NOTE(review): 'dividends' is in columns_to_ignore, so it is never in
    # columns_to_explode and this branch cannot run as the cell stands.
    if col == 'dividends':
        pivot_df = pivot_df.fillna(0.0).clip(lower=0)

    contracts_df = pd.concat([contracts_df, pivot_df], axis=1)

# Keep only contracts that reported total net assets, then remove the ignored
# raw columns.
contracts_df = contracts_df[~contracts_df['profile_TotalNetAssets'].isna()]
contracts_df = contracts_df.drop(columns=columns_to_ignore, errors='ignore')

In [None]:
# Drop ETF duplicates
# Contracts whose generated (non-original) columns are all identical are
# treated as the same fund listed several times. One listing is kept,
# preferring EUR currency, then a European exchange, then a European primary
# exchange, then tradable ones.
euro_mask = contracts_df['currency'] == 'EUR'
eur_exchanges = contracts_df[euro_mask].primaryExchange.unique()
remaining_columns = [col for col in contracts_df.columns if col not in original_columns]
og_len = len(contracts_df)

# Temporary ranking flags; removed again once the duplicates are dropped.
ranked = contracts_df.assign(
    currency_is_euro=euro_mask,
    exchange_is_european=contracts_df['exchange'].isin(eur_exchanges),
    primary_is_european=contracts_df['primaryExchange'].isin(eur_exchanges),
)
ranked = ranked.sort_values(
    by=['currency_is_euro', 'exchange_is_european', 'primary_is_european', 'tradable'],
    ascending=False,
)
ranked = ranked.drop_duplicates(subset=remaining_columns, keep='first')
contracts_df = ranked.drop(columns=['currency_is_euro', 'exchange_is_european', 'primary_is_european'])

# Number of listings removed (shown as the cell output).
og_len - len(contracts_df)

In [None]:
# Correct profile total net assets and TER
# Maps the currency prefix found in front of a scraped amount to a
# three-letter currency code. Prefixes missing from this table are passed
# through unchanged by standardize_currency.
symbol_mapping = {
    '$': 'USD',    # Default to USD
    '￥': 'JPY',    # Japanese Yen
    'Rs': 'INR',   # Rupee prefix mapped to the Indian Rupee code
    'CNH': 'CNY',  # CNH folded into CNY
    '€': 'EUR',    # Euro
    '¥': 'JPY',    # Alternative Yen symbol
    '£': 'GBP',    # British Pound
    'A$': 'AUD',   # Australian Dollar
    'C$': 'CAD',   # Canadian Dollar
    'HK$': 'HKD',  # Hong Kong Dollar
}

def standardize_currency(currency):
    """Translate a currency prefix to the code listed in ``symbol_mapping``.

    Args:
        currency: Prefix extracted from a total-net-assets string; may be
            NaN/None or the empty string.

    Returns:
        ``np.nan`` for missing input, the mapped code when the prefix is a key
        of ``symbol_mapping``, otherwise the input unchanged (including '').

    The former pycountry lookup was removed: both of its outcomes returned the
    input unchanged, so it only added a per-row lookup and a possible uncaught
    lookup error. Validation against pycountry is done by the caller.
    """
    if pd.isna(currency):
        return np.nan
    return symbol_mapping.get(currency, currency)

def clean_total_net_assets(value):
    """Split a total-net-assets string into ``(amount, currency_prefix)``.

    Args:
        value: Text such as '$1.5B' or '€2.5M asof 2024-01-01'. Non-string
            input is stringified first.

    Returns:
        ``(amount, prefix)`` where ``amount`` is a float scaled by the unit
        suffix (k, m, b or t, case-insensitive) and ``prefix`` is the
        non-digit, non-space text in front of the number ('' when absent).
        ``(np.nan, np.nan)`` when the input is missing, has no number, or the
        number cannot be parsed.
    """
    if pd.isna(value):
        return np.nan, np.nan

    # Drop a trailing "asof <date>" note when it is a separate word.
    text = re.sub(r'\basof\b.*', '', str(value), flags=re.IGNORECASE).strip()
    # 't'/'T' is part of the unit class so trillion-scale amounts are scaled;
    # without it the 't' multiplier could never be selected.
    match = re.match(r'([^0-9\s]+)?\s*([0-9.,]+)\s*([kKmMbBtT]?)', text)
    if not match:
        return np.nan, np.nan

    currency, num_str, unit = match.groups()
    try:
        num = float(num_str.replace(',', ''))
    except ValueError:
        # The number pattern also matches things like ',' or '1.2.3'.
        return np.nan, np.nan

    multipliers = {'k': 10**3, 'm': 10**6, 'b': 10**9, 't': 10**12}
    num *= multipliers.get(unit.lower(), 1)
    return num, currency or ''

def get_exchange_rates(currencies, to_currency='USD', timeout=10):
    """Fetch conversion rates from each currency into ``to_currency``.

    Args:
        currencies: Iterable of currency codes; missing values and codes
            unknown to pycountry are skipped.
        to_currency: Target currency; also the base requested from the API.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang the notebook.

    Returns:
        Dict ``{code: rate}`` where ``amount * rate`` is the amount expressed
        in ``to_currency``. Codes absent from the API response are left out; a
        quoted rate of 0 yields ``np.nan``. On any failure the error is
        printed and the rates gathered so far (usually ``{}``) are returned.
    """
    rates = {}
    valid_currencies = [
        c for c in currencies
        if pd.notna(c) and pycountry.currencies.get(alpha_3=c) is not None
    ]
    if not valid_currencies:
        return rates
    try:
        url = f"https://open.er-api.com/v6/latest/{to_currency}"
        response = requests.get(url, timeout=timeout)
        data = response.json()
        if 'rates' in data:
            for currency in valid_currencies:
                # The identity rate applies to the target currency, whatever
                # it is (it used to be hard-coded to 'USD').
                if currency == to_currency:
                    rates[currency] = 1.0
                elif currency in data['rates']:
                    # The API quotes units of `currency` per 1 `to_currency`;
                    # invert to get `to_currency` per unit of `currency`.
                    quoted = data['rates'][currency]
                    rates[currency] = 1 / quoted if quoted != 0 else np.nan
            return rates
        else:
            print(f"Error fetching rates: {data.get('error', 'Unknown error')}")
    except Exception as e:
        # Best effort: callers handle missing rates per currency.
        print(f"Exchange rate fetch failed: {e}")
    return rates

def convert_to_usd(row, rates):
    """Convert ``row['profile_cap']`` using the rate for ``row['profile_cap_currency']``.

    Returns ``np.nan`` when either field is missing, or (after printing a
    notice) when ``rates`` has no entry for the currency.
    """
    if any(pd.isna(row[field]) for field in ('profile_cap', 'profile_cap_currency')):
        return np.nan

    currency = row['profile_cap_currency']
    if currency not in rates:
        print(f"No rate available for {currency}")
        return np.nan
    return row['profile_cap'] * rates[currency]

# Parse 'profile_TotalNetAssets' into an amount and a currency prefix.
contracts_df[['profile_cap', 'profile_cap_currency']] = contracts_df['profile_TotalNetAssets'].apply(lambda x: pd.Series(clean_total_net_assets(x)))
# Map prefixes to codes; when no prefix was found ('') fall back to the
# contract's own trading currency.
contracts_df['profile_cap_currency'] = contracts_df['profile_cap_currency'].apply(standardize_currency)
contracts_df['profile_cap_currency'] = np.where(contracts_df['profile_cap_currency'] == '', contracts_df['currency'], contracts_df['profile_cap_currency'])
# Anything pycountry does not recognise as a currency code becomes NaN.
contracts_df['profile_cap_currency'] = contracts_df['profile_cap_currency'].apply(lambda x: x if (pd.isna(x) or pycountry.currencies.get(alpha_3=x) or x == '') else np.nan)

# One network call for all distinct currencies, then a row-wise conversion.
exchange_rates = get_exchange_rates(contracts_df['profile_cap_currency'].unique())
contracts_df['profile_cap_usd'] = contracts_df.apply(lambda row: convert_to_usd(row, exchange_rates),axis=1)
# Only the converted amount is kept; the raw and intermediate columns go.
contracts_df = contracts_df.drop(columns=['profile_TotalNetAssets', 'profile_cap', 'profile_cap_currency'], axis=1, errors='ignore')

# TER
# Empty strings become NaN so the column can be cast to float.
contracts_df['profile_TotalExpenseRatio'] = contracts_df['profile_TotalExpenseRatio'].replace('', np.nan).astype(float)

In [None]:
# Domicile dummies
# One 0/1 column per domicile ('domicile_<value>'); the source column is removed.
if 'profile_Domicile' in contracts_df.columns:
    dummies = pd.get_dummies(contracts_df['profile_Domicile'], prefix='domicile').astype(int)
    contracts_df = pd.concat([contracts_df, dummies], axis=1)
    contracts_df.drop('profile_Domicile', axis=1, inplace=True)

# Market cap dummies
# Values of 'profile_MarketCapFocus' not listed here become NaN after .map().
size_map = {
        'Small-cap': 'small',
        'Mid-cap': 'mid',
        'BroadMarket': 'multi',
        'Large-cap': 'large',
    }
contracts_df['profile_MarketCapFocus'] = contracts_df['profile_MarketCapFocus'].map(size_map)

# Get size columns from MarketCapFocus and style cols
# get_dummies only creates a column for sizes that actually occur in the data.
dummies = pd.get_dummies(contracts_df['profile_MarketCapFocus'], dtype=int)
contracts_df = pd.concat([contracts_df, dummies], axis=1)

style_groups = {
    'small': ['style_small-core', 'style_small-growth', 'style_small-value'],
    'mid': ['style_mid-core', 'style_mid-growth', 'style_mid-value'],
    'large': ['style_large-core', 'style_large-growth', 'style_large-value'],
    'multi': ['style_multi-core', 'style_multi-growth', 'style_multi-value']
}

# Update each size column by OR-ing with the style columns
# NOTE(review): this indexes every size column and every listed style column
# directly, so it raises KeyError if one of them is absent from the data;
# confirm all twelve style columns and four sizes are always present.
for size, cols in style_groups.items():
    contracts_df[size] = contracts_df[size] | contracts_df[cols].any(axis=1).astype(int)

# Remove the helpers from the notebook namespace.
del dummies, size_map, style_groups

In [None]:
# Search exchange verification
# Keep only the text inside the first pair of parentheses of 'search_exchange'
# (NaN when there is none), and split 'validExchanges' on commas into a list.
# NOTE(review): the split assumes every 'validExchanges' cell is a string;
# a NaN cell would raise AttributeError here. Confirm against the data.
contracts_df['search_exchange'] = contracts_df['search_exchange'].str.extract(r'\(([^()]*)\)', expand=False)
contracts_df['validExchanges'] = contracts_df['validExchanges'].apply(lambda x: x.split(','))

def validate_search_exchange(row: pd.Series) -> int:
    """Return 1 if the row's search exchange is one of its valid exchanges, else 0.

    A missing search exchange is not in the list and therefore yields 0.
    """
    # if pd.isna(row['search_exchange']):
    #     return np.nan
    return 1 if row['search_exchange'] in row['validExchanges'] else 0

contracts_df['valid_search_exchange'] = contracts_df.apply(validate_search_exchange, axis=1)
# The raw search columns are no longer needed once the flag exists.
contracts_df = contracts_df.drop(columns=['search_symbol', 'search_exchange', 'validExchanges'], axis=1, errors='ignore')

In [None]:
# Refine original asset class classifications with fundamentals and holding_type columns
# Fundamentals columns are split into bond-specific ones (listed below) and
# everything else starting with 'fundamentals_', which is treated as equity.
bond_fundamental_cols = ['fundamentals_AverageCoupon', 'fundamentals_AverageQuality', 'fundamentals_YieldtoMaturity', 'fundamentals_NominalMaturity', 'fundamentals_EffectiveMaturity']
equity_fundamental_cols = [col for col in contracts_df.columns if col.startswith('fundamentals_') if col not in bond_fundamental_cols]
bond_mask = contracts_df[bond_fundamental_cols].notna().any(axis=1)
equity_mask = contracts_df[equity_fundamental_cols].notna().any(axis=1)

# Order matters: 'bond' is widened first, 'other' is derived from it, and
# 'equity' is whatever is neither bond nor other.
contracts_df['bond'] = contracts_df['bond'] | bond_mask
# 'other' = no fundamentals of either kind ...
contracts_df['other'] = ~(equity_mask | contracts_df['bond'])
# ... or both equity fundamentals and a bond flag at once.
contracts_df['other'] = contracts_df['other'] | (equity_mask & contracts_df['bond'])
contracts_df['equity'] = ~(contracts_df['bond'] | contracts_df['other'])

# Align the holding-type column names with the asset-class flag names.
contracts_df = contracts_df.rename(columns={'holding_types_FixedIncome': 'holding_types_bond',
                                            'holding_types_Equity': 'holding_types_equity',
                                            'holding_types_Cash': 'holding_types_cash',
                                            'holding_types_Other': 'holding_types_other',
                                            })

def refine_classification(row: pd.Series) -> pd.Series:
    """Override the asset-class flags when one holding type exceeds 0.5.

    If the largest of the four 'holding_types_*' weights is above 0.5, the
    result has only that class set to True. Otherwise the row's existing
    bond/equity/cash/other flags are returned unchanged.
    """
    holding_types = ['holding_types_bond', 'holding_types_equity', 'holding_types_cash', 'holding_types_other']
    max_col = row[holding_types].idxmax()
    if row[max_col] > 0.5:
        type_name = max_col.replace('holding_types_', '')
        result = pd.Series([False, False, False, False], index=['bond', 'equity', 'cash', 'other'])
        result[type_name] = True
        return result
    else:
        return row[['bond', 'equity', 'cash', 'other']]

contracts_df[['bond', 'equity', 'cash', 'other']] = contracts_df.apply(refine_classification, axis=1)

In [None]:
# Final cleaning before imputation
# Letter ratings are replaced by an ordinal score (higher = better quality).
# Values not listed in the map are left as they are by .replace().
rating_map = {
    'AAA': 6,
    'AA': 5,
    'A': 4,
    'BBB': 3,
    'BB': 2,
    'B': 1,
}
contracts_df['fundamentals_AverageQuality'] = contracts_df['fundamentals_AverageQuality'].replace(rating_map)

# Convert bools to intbools
bool_map = {
    True: 1,
    False: 0,
    # np.nan: 0,
}
# Every bool-dtype column plus every 'style_*' column (whatever its dtype).
bool_cols = [col for col in contracts_df.columns if contracts_df[col].dtype == 'bool' or col.startswith('style_')]
for col in list(set(bool_cols)):
    contracts_df[col] = contracts_df[col].replace(bool_map).fillna(0) ## DONT FILLNA if you want to impute 

# Rows whose stockType is 'ETC' get a full weight in 'industries_BasicMaterials'.
if 'stockType' in contracts_df.columns:
    contracts_df.loc[contracts_df['stockType'] == 'ETC', 'industries_BasicMaterials'] = 1.0

# Remove unnecessary qual or empty columns, only keep key identifiers
qual_cols = ['primaryExchange', 'stockType', 'date_scraped', 'exchange_bug', 'exact_search', 'search_symbol', 'profile_MarketCapFocus', 'profile_MarketGeoFocus', 'profile_BenchmarkIndex', 'profile_FundCategory', 'dividends_PayoutRatio']
contracts_df = contracts_df.drop(columns=qual_cols, axis=1, errors='ignore')

# Clean remaining numerical columns
# In every non-identifier object column, string cells are replaced by NaN.
# NOTE(review): the column is not cast afterwards, so it may remain object
# dtype; the imputation cell selects numeric columns by dtype, so verify these
# columns are picked up there as intended.
identifier_cols = ['symbol', 'conId', 'longName', 'isin', 'exchange', 'currency', 'profile_MarketCapFocus']
for col in [c for c in contracts_df.columns if c not in identifier_cols]:
    temp_type = contracts_df[col].dtype
    if temp_type == 'object':
        contracts_df[col] = contracts_df[col].apply(lambda x: np.nan if isinstance(x, str) else x)

# Assign corresponding fundamentals
# A fund with zero bond (equity) holdings gets 0 for all bond (equity)
# fundamentals instead of a missing value.
contracts_df.loc[contracts_df['holding_types_bond'] == 0, bond_fundamental_cols] = 0
contracts_df.loc[contracts_df['holding_types_equity'] == 0, equity_fundamental_cols] = 0

contracts_df.reset_index(drop=True, inplace=True)

In [None]:
# check similarity between all top 10 columns
from fuzzywuzzy import fuzz
from itertools import combinations

# Suffixes of every 'top10_<name>' column, leaving out the variety column and
# the bare 'top10_' column.
columns = [col.split('top10_')[-1] for col in contracts_df.columns if col.startswith('top10_') and col not in ['top10_variety', 'top10_']]

similarity_threshold = 80
n = len(columns)

# Score every unordered pair of suffixes lazily and keep those whose
# token-set ratio reaches the threshold, as (first, second, score) triples.
pair_iterator = tqdm(combinations(columns, 2), total = n * (n - 1) // 2)
scored_pairs = (
    (left, right, fuzz.token_set_ratio(left, right))
    for left, right in pair_iterator
)
similar_pairs = [triple for triple in scored_pairs if triple[2] >= similarity_threshold]

In [None]:
# Merge similar top10
import pandas as pd
from fuzzywuzzy import fuzz
from itertools import combinations
from tqdm import tqdm
import networkx as nx

# Only runs when the previous cell found at least one top10 suffix.
if n:
    # Find connected components 
    # Suffixes are nodes and every similar pair is an edge, so each connected
    # component is a group of names considered the same holding.
    G = nx.Graph()
    for col1_suffix, col2_suffix, _ in similar_pairs:
        G.add_edge(col1_suffix, col2_suffix)

    # Suffixes without any similar partner are added as isolated nodes.
    for suffix in columns:
        G.add_node(suffix)

    connected_components = list(nx.connected_components(G))

    # 3. Prepare for DataFrame modification
    new_column_data_map = {}  # merged column name -> row-wise sum of its group
    original_cols_involved_in_merging = set()  # every column belonging to a multi-member group
    for group_of_suffixes in connected_components:
        if len(group_of_suffixes) > 1:
            sorted_suffixes_in_group = sorted(list(group_of_suffixes))
            
            # Determine the new representative column name (using the 'top10_' prefix)
            # The alphabetically first suffix names the merged column.
            representative_suffix = sorted_suffixes_in_group[0]
            merged_full_col_name = f"top10_{representative_suffix}"
            
            # Identify all original full column names in this group
            original_full_names_in_this_group = []
            for suffix in sorted_suffixes_in_group:
                full_name = f"top10_{suffix}"
                original_full_names_in_this_group.append(full_name)
                original_cols_involved_in_merging.add(full_name)
                
            # Sum the values of the original columns in this group
            existing_cols_to_sum = [col for col in original_full_names_in_this_group if col in contracts_df.columns]
            
            if existing_cols_to_sum:
                summed_series = contracts_df[existing_cols_to_sum].sum(axis=1)
                new_column_data_map[merged_full_col_name] = summed_series
            else:
                print(f"  Warning: No existing columns found for suffixes {group_of_suffixes} to sum.")
                
    # 5. Update the DataFrame
    # Work on a copy: drop every grouped column first, then add the merged
    # columns (the representative's name is among the dropped ones, so it is
    # re-created with the summed values).
    contracts_df_merged = contracts_df.copy()

    cols_to_drop_list = list(original_cols_involved_in_merging)
    if cols_to_drop_list:
        contracts_df_merged.drop(columns=cols_to_drop_list, inplace=True, errors='ignore')

    if new_column_data_map:
        for new_col_name, data_series in new_column_data_map.items():
            contracts_df_merged[new_col_name] = data_series

    contracts_df = contracts_df_merged

    del contracts_df_merged

In [None]:
# # Impute all values at once
# from sklearn.impute import KNNImputer
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# imputed_df = contracts_df.copy()

# numerical_cols = [col for col in imputed_df.columns if imputed_df[col].dtype in [float, int] and col not in ['conId', 'valid_search_exchange']]
# pipeline = Pipeline([
#     ('scaler', MinMaxScaler()),
#     ('imputer', KNNImputer())
# ])

# transformer = ColumnTransformer(
#     transformers=[('num', pipeline, numerical_cols)],
#     remainder='passthrough'
# )
# imputed_array = transformer.fit_transform(imputed_df)

# scaler = transformer.named_transformers_['num'].named_steps['scaler']
# numerical_imputed_scaled = imputed_array[:, :len(numerical_cols)]
# numerical_imputed_original = scaler.inverse_transform(numerical_imputed_scaled)

# imputed_array[:, :len(numerical_cols)] = numerical_imputed_original

# remaining_cols = [col for col in imputed_df.columns if col not in numerical_cols]
# output_cols = numerical_cols + remaining_cols

# imputed_df = pd.DataFrame(imputed_array, columns=output_cols, index=imputed_df.index)

# for col in numerical_cols:
#     if imputed_df[col].dtype != contracts_df[col].dtype:
#         imputed_df[col] = imputed_df[col].astype(contracts_df[col].dtype)

# imputed_df = imputed_df.drop(columns=[col for col in imputed_df.columns if col.startswith('top10_') and col != 'top10_variety'], axis=1)
# imputed_df = imputed_df.drop(columns=['dividends'], errors='ignore')

# imputed_df # 12 mins

In [None]:
# Impute values by asset class
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Work on a copy so contracts_df keeps the un-imputed values and dtypes.
imputed_df = contracts_df.copy()
# Interactive switch: 'y' imputes missing values per asset class; any other
# answer drops very sparse columns and then every row that still has a NaN.
if input('Impute values (y/n)').lower() == 'y': 

    class_cols = ['equity', 'bond', 'cash', 'other']
    # Identifiers and the class flags themselves are never imputed.
    cols_to_exclude = ['conId', 'valid_search_exchange'] + class_cols
    numerical_cols_global = [
        col for col in imputed_df.columns
        if imputed_df[col].dtype in [float, np.float64, int, np.int64]
        and col not in cols_to_exclude
    ]

    for asset_class in tqdm(class_cols, total=len(class_cols)):
        mask = imputed_df[asset_class] == 1
        if not mask.any():
            continue

        subset_numerical_data = imputed_df.loc[mask, numerical_cols_global]
        if subset_numerical_data.empty:
            continue

        # Only columns with at least two distinct non-null values inside this
        # class are used; constant or all-NaN columns are skipped.
        cols_for_imputation_in_subset = [col for col in subset_numerical_data.columns if subset_numerical_data[col].nunique(dropna=True) > 1]

        if not cols_for_imputation_in_subset:
            continue

        data_to_process = subset_numerical_data[cols_for_imputation_in_subset]
        # Scale before the KNN step, then undo the scaling on the result.
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('imputer', KNNImputer())
        ])

        imputed_scaled_subset = pipeline.fit_transform(data_to_process)
        
        scaler_fitted_on_subset = pipeline.named_steps['scaler']
        imputed_original_scale_subset = scaler_fitted_on_subset.inverse_transform(imputed_scaled_subset)

        # Write the imputed block back into the rows of this asset class.
        imputed_df.loc[mask, cols_for_imputation_in_subset] = imputed_original_scale_subset

    # Restore original data types for numerical columns where possible
    for col in numerical_cols_global:
        if col in imputed_df.columns and col in contracts_df.columns:
            if imputed_df[col].dtype != contracts_df[col].dtype:
                try:
                    imputed_df[col] = imputed_df[col].astype(contracts_df[col].dtype)
                except ValueError as e:
                    print(f"Warning: Could not convert column '{col}' to {contracts_df[col].dtype}. Error: {e}")

    # Drop specified columns
    # Individual top10 columns are removed; only 'top10_variety' is kept.
    columns_to_drop = [col for col in imputed_df.columns if col.startswith('top10_') and col != 'top10_variety']
    imputed_df = imputed_df.drop(columns=columns_to_drop, errors='ignore')
    imputed_df = imputed_df.drop(columns=['dividends'], errors='ignore')

    # Whatever was not imputed (skipped columns, excluded columns) becomes 0.
    imputed_df = imputed_df.fillna(0.0)
else:
    # Share of missing values per column.
    emptiness = imputed_df.isna().mean().copy()
    mean = emptiness.mean()
    std = emptiness.std()

    # Drop columns whose missing share is more than three standard deviations
    # above the average, then drop every row that still contains a NaN.
    columns_to_drop = emptiness[emptiness > mean + 3*std].index.to_list()
    imputed_df = imputed_df.drop(columns=columns_to_drop, errors='ignore').dropna()

In [None]:
# Save
# Write the final frame to 'data/fundamentals.csv' without the index column.
imputed_df.to_csv('data/fundamentals.csv', index=False)