In [None]:
import numpy as np
import pandas as pd
import re
import ast
import pycountry
import requests

In [None]:
# Prep functions
def evaluate_literal(val):
    """Parse ``val`` as a Python literal, returning it unchanged on failure.

    Used to turn CSV cells that hold the text of a list/tuple/number back
    into real objects.

    Args:
        val: Any cell value. Strings are parsed with ``ast.literal_eval``;
            values that cannot be parsed (plain text, NaN, None, ...) are
            handed back as-is.

    Returns:
        The evaluated literal, or ``val`` itself when it is not a literal.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input (e.g. TypeError for an unhashable dict key); none of them
        # should abort the load of a whole column.
        return val
    
def load(path):
    """Read the CSV at ``path`` and decode literal-looking cells.

    Every cell of every column is passed through ``evaluate_literal``, so
    cells holding the text of a Python literal come back as objects while
    all other cells stay as pandas read them.

    Args:
        path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with decoded cells.
    """
    frame = pd.read_csv(path)
    for column in frame.columns:
        decoded = frame[column].apply(evaluate_literal)
        frame[column] = decoded
    return frame

def save(df, path='data/contract_elaborated.csv'):
    """Merge ``df`` into the elaborated-contracts CSV and write it back.

    The new rows are cleaned with ``clean_df``, combined with whatever is
    already stored at ``path`` (if the file exists), de-duplicated on the
    search key columns, and written without the index.

    When a symbol still appears more than once after the merge, the rows
    whose ``exact_search`` is False are discarded.

    Args:
        df: DataFrame of freshly scraped rows. It is passed to ``clean_df``,
            which modifies it in place.
        path: CSV file to merge with and overwrite.
    """
    final_df = clean_df(df)
    try:
        stored_df = clean_df(load(path))
    except FileNotFoundError:
        # First run: nothing on disk to merge with.
        pass
    else:
        final_df = pd.concat([final_df, stored_df]).drop_duplicates(
            subset=['symbol', 'exact_search', 'search_exchange', 'search_symbol']
        )

    # Select rows with a boolean mask rather than dropping by index label:
    # after the concat the labels of both frames overlap, so a label-based
    # drop would also remove unrelated rows that share a label.
    repeated_symbol = final_df.duplicated(subset='symbol', keep=False)
    inexact = final_df['exact_search'] == False  # noqa: E712 (element-wise)
    final_df = final_df[~(repeated_symbol & inexact)]

    final_df.to_csv(path, index=False)

def is_numerical(val):
    """Tell whether ``val`` reads as a number once '%' signs are removed.

    The value is converted to text, every '%' is deleted, and the remainder
    is given to ``float``. Any failure along the way means "not numerical".

    Returns:
        True when the conversion succeeds, False otherwise.
    """
    try:
        float(str(val).replace('%', ''))
    except Exception:
        return False
    else:
        return True

def is_valid_tuple(tuple, column):
    """Check one ``(label, value)`` entry scraped for ``column``.

    An entry is valid when its label is a string and its value is either
    missing (None), numerical according to ``is_numerical``, or matches the
    column-specific rule:

    * ``profile``: both value and label are truthy
    * ``fundamentals``: the value is an upper-case string
    * ``dividends``: the value is the text ``'Unknown'``
    * ``style``: the value is a bool

    Args:
        tuple: The entry to check. (The name shadows the builtin but is
            kept so existing keyword callers keep working.)
        column: Name of the column the entry came from.

    Returns:
        True if the entry is acceptable, False otherwise. Items that are
        not a two-element pair are reported as invalid instead of raising.
    """
    try:
        label, value = tuple
    except (TypeError, ValueError):
        # Scalar, wrong length, ...: not a (label, value) pair at all.
        return False

    if not isinstance(label, str):
        return False
    if value is None:
        # Missing values are tolerated; return False here for a more
        # rigid filter.
        return True
    if is_numerical(value):
        return True

    if column == 'profile':
        return bool(value and label)
    if column == 'fundamentals':
        # Guard the type first: bools and other non-strings reach this
        # point and have no .isupper().
        return isinstance(value, str) and value.isupper()
    if column == 'dividends':
        return bool(value == 'Unknown')
    if column == 'style':
        return isinstance(value, bool)
    return False

def is_row_valid(row):
    """Report whether every list cell of ``row`` holds only valid entries.

    Each column whose cell is a list is scanned entry by entry with
    ``is_valid_tuple``; scanning stops at the first rejected entry.
    Columns with non-list cells are ignored.

    Args:
        row: A row object exposing ``.index`` (column names) and item
            access by column name, e.g. a pandas Series.

    Returns:
        False as soon as one entry is rejected, True otherwise.
    """
    return all(
        is_valid_tuple(entry, column)
        for column in row.index
        if isinstance(row[column], list)
        for entry in row[column]
    )

def has_bad_multiplier(long_name):
    """Detect a multiplier token above 1X (``2X``, ``-3X``, ...) in a name.

    '-' and '+' characters are removed first, then each whitespace-separated
    word is checked for the exact form ``<digits>X``.

    Args:
        long_name: The contract's long name. Non-string values (NaN, None)
            are treated as having no multiplier instead of raising.

    Returns:
        True if any word is a multiplier greater than 1, else False.
    """
    if not isinstance(long_name, str):
        return False
    cleaned = long_name.replace('-', '').replace('+', '')
    return any(
        re.fullmatch(r'\d+X', word) is not None and int(word[:-1]) > 1
        for word in cleaned.split()
    )

def get_remaining():
    """Return the contracts that still need to be elaborated.

    Starts from ``data/contract_details.csv`` and, when
    ``data/contract_elaborated.csv`` exists, removes every symbol whose
    valid elaborated row has ``exchange_bug`` set, ``exact_search`` set, or
    a non-missing ``profile``. Contracts whose long name carries a
    multiplier above 1X are removed as well.

    Returns:
        DataFrame restricted to the identifying contract columns.
    """
    output_columns = ['symbol', 'exchange', 'primaryExchange', 'validExchanges', 'currency', 'conId', 'longName', 'stockType', 'isin']

    contract_details = load('data/contract_details.csv')
    try:
        elaborated = load('data/contract_elaborated.csv')
        row_is_valid = elaborated.apply(is_row_valid, axis=1)
        elaborated = elaborated[row_is_valid]

        # A looser variant of this filter omits the profile test.
        already_done = (
            elaborated['exchange_bug'].eq(True)
            | elaborated['exact_search'].eq(True)
            | elaborated['profile'].notna()
        )
        finished_symbols = elaborated.loc[already_done, 'symbol']
        still_pending = ~contract_details['symbol'].isin(finished_symbols)
        remaining = contract_details[still_pending]
        # To debug invalid rows, return the rows of the elaborated file
        # that fail is_row_valid instead.
    except FileNotFoundError:
        # Nothing elaborated yet: every contract is still pending.
        remaining = contract_details.copy()

    leveraged = remaining['longName'].apply(has_bad_multiplier)
    remaining = remaining[~leveraged]
    return remaining[output_columns]

In [None]:
# Cleaning functions
def clean_labels(label, col):
    """Normalise a scraped label according to the column it belongs to.

    Rules (applied to string labels only; anything else is returned as-is):

    * ``industries``: strip the ``-Discontinuedeff09/19/2020`` suffix
    * ``holding_types``: drop a leading ``■`` or ``1`` character
      (presumably scrape artifacts; confirm against the source pages)
    * ``debtors``: replace the full-width ``（`` with an ASCII ``(``
    * ``fundamentals``: rename ``LTDebt/ShareholdersEquity`` to
      ``LTDebt/Shareholders``

    Args:
        label: The label to clean.
        col: Name of the column the label came from.

    Returns:
        The cleaned label.
    """
    if not isinstance(label, str):
        return label

    if col == 'industries':
        suffix = '-Discontinuedeff09/19/2020'
        if label.endswith(suffix):
            # Cut only the suffix; splitting on the first '-' would also
            # truncate industry names that contain a hyphen themselves.
            return label[:-len(suffix)]
    elif col == 'holding_types':
        if label.startswith(('■', '1')):
            return label[1:]
    elif col == 'debtors':
        return label.replace('（', '(')
    elif col == 'fundamentals':
        if label == 'LTDebt/ShareholdersEquity':
            return 'LTDebt/Shareholders'
    return label
    
def correct_digit(value_str):
    """Convert ``value_str`` to a float using only its numeric characters.

    Only digits, '.' and '-' are kept before conversion. Unit or magnitude
    letters are simply discarded, not applied.

    Returns:
        The parsed float, or ``value_str`` unchanged when the remaining
        characters do not form a number (or the input is not text).
    """
    try:
        kept = ''.join(re.findall(r'[\d.-]', value_str))
        return float(kept.strip())
    except Exception:
        return value_str

def clean_values(value_str, col):
    """Convert a scraped value to a number where possible.

    * ``profile`` values and non-string values are returned untouched.
    * Strings ending in '%' become a fraction (``'12%'`` -> ``0.12``).
    * Other strings go through ``correct_digit``, which returns a float or
      the original text.

    NOTE: magnitude suffixes (k/m/b/t) and trailing "asof" dates are not
    interpreted here; ``correct_digit`` keeps only digits, '.' and '-'.

    Args:
        value_str: The raw value.
        col: Name of the column the value came from.

    Returns:
        A float when the value could be parsed, otherwise the input.
    """
    if col == 'profile' or not isinstance(value_str, str):
        return value_str
    if value_str.endswith('%'):
        number = correct_digit(value_str.replace('%', ''))
        # correct_digit hands back its string input when nothing numeric is
        # left (e.g. 'N/A%'); dividing that by 100 would raise TypeError.
        return number / 100 if isinstance(number, float) else value_str
    return correct_digit(value_str)


def _label_sort_key(item):
    """Sort key: the label of a non-empty tuple if truthy, else ''."""
    if isinstance(item, tuple) and item and item[0]:
        return item[0]
    return ''


def _clean_cell(cell, col):
    """Decode one cell and, if it is a list, clean and sort its entries."""
    cell = evaluate_literal(cell)
    if not isinstance(cell, list):
        return cell
    cleaned = [
        (clean_labels(item[0], col), clean_values(item[1], col))
        if isinstance(item, tuple) and len(item) == 2 else item
        for item in cell
    ]
    return sorted(cleaned, key=_label_sort_key)


def clean_df(df):
    """Normalise every list-valued cell of ``df``.

    For each column, cells are decoded with ``evaluate_literal``; in list
    cells every two-element tuple gets its label cleaned by
    ``clean_labels`` and its value by ``clean_values``, and the list is
    sorted by label. Other cells and other list items are left unchanged.

    Args:
        df: The DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, for chaining.
    """
    for col in df.columns:
        df[col] = df[col].apply(lambda cell, col=col: _clean_cell(cell, col))
    return df

In [None]:
# Load the elaborated contracts, normalise their list cells and keep only
# the rows that pass is_row_valid.
contracts_df = load('data/contract_elaborated.csv')
contracts_df = clean_df(contracts_df)
contracts_df = contracts_df[contracts_df.apply(is_row_valid, axis=1)]
# Wider candidate list kept for reference; only 'holding_types' is
# processed by the loop below as written.
# columns_to_explode = ['profile', 'style', 'lipper', 'fundamentals', 'holding_types', 'dividends',
#                        'industries', 'countries', 'currencies', 'debtors', 'maturity', 'debt_type']#, 'top10']
columns_to_explode = ['holding_types']
for col in columns_to_explode:
    # Missing cells become the text '[]', which evaluate_literal turns into
    # an empty list so that explode() can handle every row.
    contracts_df[col] = contracts_df[col].fillna('[]')
    contracts_df[col] = contracts_df[col].apply(evaluate_literal)
    # One row per list entry; the original index label is repeated for each
    # entry of the same contract.
    contracts_df = contracts_df.explode(col)
    # Rows that had nothing to explode get a (None, None) placeholder so
    # every cell splits into exactly two columns below.
    contracts_df[col] = contracts_df[col].apply(lambda x: (None, None) if pd.isna(x) else x)
    contracts_df[['label', 'value']] = pd.DataFrame(contracts_df[col].tolist(), index=contracts_df.index)
    # Wide table: one row per original index label, one column per label,
    # taking the first value when a label repeats.
    pivot_df = contracts_df.pivot_table(index=contracts_df.index, columns='label', values='value', aggfunc='first')
    # Prefix the new columns with the source column name.
    pivot_df.rename(columns={label: f'{col}_{label}' for label in pivot_df.columns}, inplace=True)
    # NOTE(review): display is not defined in this file - presumably the
    # IPython/Jupyter built-in; confirm before running as a plain script.
    display(pivot_df)
    # Row-wise total across the pivoted columns.
    pivot_df[f'{col}_sum'] = pivot_df.sum(axis=1)
    display(pivot_df)

    # Drop the helper columns, collapse back to one row per symbol, then
    # attach the pivoted columns side by side (aligned on the index).
    contracts_df = contracts_df.drop(columns=[col, 'label', 'value'], axis=1).drop_duplicates(subset='symbol')
    contracts_df = pd.concat([contracts_df, pivot_df], axis=1)
# contracts_df

In [None]:
# Currency symbols as they appear in scraped amounts, mapped to the code
# they are standardised to.
symbol_mapping = {
    '$': 'USD',    # Default to USD
    '￥': 'JPY',    # Japanese Yen
    '€': 'EUR',    # Euro
    '£': 'GBP',    # British Pound
    'A$': 'AUD',   # Australian Dollar
    'C$': 'CAD',   # Canadian Dollar
    'HK$': 'HKD',  # Hong Kong Dollar
    '¥': 'JPY',    # Alternative Yen symbol
}


def standardize_currency(currency):
    """Translate a currency symbol to its code via ``symbol_mapping``.

    Args:
        currency: A currency symbol, a currency code, an empty string, or
            a missing value.

    Returns:
        ``np.nan`` for missing input, the mapped code for a known symbol,
        and the input unchanged otherwise (including ``''``). Whether the
        result is a real currency code is not checked here.
    """
    if pd.isna(currency):
        return np.nan
    # The former pycountry lookup returned `currency` whether or not the
    # code was known, so it never affected the result; it could only raise
    # on non-string input.
    return symbol_mapping.get(currency, currency)

def clean_total_net_assets(value):
    """Split a scraped "total net assets" text into amount and currency.

    Expected shape: optional currency prefix, a number with optional
    thousands separators, an optional magnitude letter (k, m, b, t) and an
    optional trailing "asof<date>" note, e.g. ``'$1.2Basof2024-01-31'``.

    Args:
        value: The scraped text. Plain numbers are passed through with an
            empty currency; missing or unsupported values are unparsable.

    Returns:
        ``(amount, currency)`` where ``amount`` is a float scaled by the
        magnitude letter and ``currency`` is the prefix as found ('' when
        absent), or ``(np.nan, np.nan)`` when nothing can be parsed.
    """
    if not isinstance(value, str):
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
        if is_number and not pd.isna(value):
            return float(value), ''
        return np.nan, np.nan

    # Drop the trailing "asof..." note. No word boundaries are required:
    # the scraped text is glued together ("1.2Basof2024"), so a '\b'
    # anchor never matches there.
    text = re.sub(r'\s*asof.*', '', value, flags=re.IGNORECASE).strip()
    match = re.match(r'([^0-9\s]+)?\s*([0-9.,]+)\s*([kKmMbBtT]?)', text)
    if not match:
        return np.nan, np.nan

    currency, num_str, unit = match.groups()
    try:
        num = float(num_str.replace(',', ''))
    except ValueError:
        # e.g. '1.2.3' or a lone separator.
        return np.nan, np.nan

    multipliers = {'k': 10**3, 'm': 10**6, 'b': 10**9, 't': 10**12}
    num *= multipliers.get(unit.lower(), 1)
    return num, currency or ''

def get_exchange_rates(currencies, to_currency='USD', timeout=10):
    """Fetch conversion factors from each currency into ``to_currency``.

    Queries ``https://api.exchangerate-api.com/v4/latest/<to_currency>``
    and inverts the returned quotes, so that
    ``amount_in_currency * rates[currency]`` is the amount in
    ``to_currency``.

    Args:
        currencies: Iterable of currency codes; entries that are not
            strings known to pycountry (NaN, None, unknown codes) are
            ignored.
        to_currency: Code of the target currency.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        Dict mapping currency code to conversion factor. Empty when there
        is nothing to look up or the request fails; currencies missing
        from the response are omitted, unusable quotes map to ``np.nan``.
    """
    rates = {}
    valid_currencies = [
        c for c in currencies
        if isinstance(c, str) and pycountry.currencies.get(alpha_3=c) is not None
    ]
    if not valid_currencies:
        return rates

    url = f"https://api.exchangerate-api.com/v4/latest/{to_currency}"
    try:
        # Without a timeout a stalled connection would hang the notebook.
        response = requests.get(url, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: report and carry on with no rates.
        print(f"Exchange rate fetch failed: {e}")
        return rates

    table = data.get('rates') if isinstance(data, dict) else None
    if not isinstance(table, dict):
        error = data.get('error', 'Unknown error') if isinstance(data, dict) else 'Unknown error'
        print(f"Error fetching rates: {error}")
        return rates

    for currency in valid_currencies:
        if currency == to_currency:
            rates[currency] = 1.0
        elif currency in table:
            quote = table[currency]
            # Quotes are units of `currency` per one `to_currency`.
            usable = isinstance(quote, (int, float)) and quote != 0
            rates[currency] = 1 / quote if usable else np.nan
    print(f"Fetched rates: {rates}")
    return rates

def convert_to_usd(row, rates):
    """Convert a row's total net assets using the supplied rates.

    Args:
        row: Mapping-like row with ``profile_TotalNetAssets`` (amount) and
            ``profile_cap_currency`` (currency code).
        rates: Mapping from currency code to conversion factor.

    Returns:
        Amount multiplied by the factor for the row's currency, or
        ``np.nan`` when the amount or currency is missing or no factor is
        available (the latter is also reported on stdout).
    """
    for field in ('profile_TotalNetAssets', 'profile_cap_currency'):
        if pd.isna(row[field]):
            return np.nan

    currency = row['profile_cap_currency']
    if currency not in rates:
        print(f"No rate available for {currency}")
        return np.nan
    return row['profile_TotalNetAssets'] * rates[currency]

# NOTE(review): this cell reads contracts_df['profile_TotalNetAssets'], which
# the explode loop only creates when 'profile' is in columns_to_explode -
# confirm the notebook state before running.
# Split the scraped text into a numeric amount and the currency prefix
# (clean_total_net_assets returns them in that order).
contracts_df[['profile_TotalNetAssets', 'profile_cap_currency']] = contracts_df['profile_TotalNetAssets'].apply(lambda x: pd.Series(clean_total_net_assets(x)))
# Map currency symbols to codes.
contracts_df['profile_cap_currency'] = contracts_df['profile_cap_currency'].apply(standardize_currency)
# Amounts without a currency prefix fall back to the contract's own currency.
contracts_df['profile_cap_currency'] = np.where(contracts_df['profile_cap_currency'] == '', contracts_df['currency'], contracts_df['profile_cap_currency'])
# Keep missing values, codes known to pycountry and empty strings; anything
# else is replaced by NaN.
contracts_df['profile_cap_currency'] = contracts_df['profile_cap_currency'].apply(lambda x: x if (pd.isna(x) or pycountry.currencies.get(alpha_3=x) or x == '') else np.nan)

# Look up one rate per distinct currency, then convert every row.
exchange_rates = get_exchange_rates(contracts_df['profile_cap_currency'].unique())
contracts_df['profile_TotalNetAssets_USD'] = contracts_df.apply(lambda row: convert_to_usd(row, exchange_rates),axis=1)

# contracts_df[['currency', 'profile_cap_currency', 'profile_TotalNetAssets_USD', 'profile_TotalNetAssets']]

In [None]:
# Identifier / classification columns that are excluded from the data view below.
classification_columns = ['symbol', 'exchange', 'primaryExchange', 'validExchanges', 'conId', 'longName', 'isin', 'date_scraped','exchange_bug', 'exact_search', 'search_exchange', 'search_symbol', 'profile_BenchmarkIndex', 'profile_FundCategory', 'profile_TotalExpenseRatio', 'dividends_PayoutRatio']

# Every remaining column is treated as data.
data_cols = contracts_df.columns[~contracts_df.columns.isin(classification_columns)]
# Bare expression: shows the data columns as the notebook cell's output.
contracts_df[data_cols]

In [None]:
'''

growth score = 2 * [ N_P/B + N_P/E + N_P/Cash + N_P/Sales + N_EPS_growth_1yr + N_EPS_growth_3yr + N_EPS_growth_5yr + 
              N_ReturnonAssets1Yr + N_ReturnonAssets3Yr + N_ReturnonCapital + N_ReturnonCapital3Yr + 
              N_ReturnonEquity1Yr + N_ReturnonEquity3Yr + N_ReturnonInvestment1Yr + N_ReturnonInvestment3Yr + 
              N_SalestoTotalAssets + N_EBITtoInterest + N_RelativeStrength + 
              (1 - N_LTDebt/ShareholdersEquity) + (1 - N_TotalAssets/TotalEquity) + 
              (1 - N_TotalDebt/TotalCapital) + (1 - N_TotalDebt/TotalEquity) ] / 22 - 1

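Here S is the bracketed sum divided by 22, so score = 2*S - 1. The "value indicators" are the four leverage ratios, which enter the sum as (1 - N_x).
Extreme Growth: If all growth indicators ≈ 1 and value indicators ≈ 0, then S = [18*1 + 4*(1 - 0)]/22 = 1, score = 2*1 - 1 = 1.
Extreme Value: If all growth indicators ≈ 0 and value indicators ≈ 1, then S = [18*0 + 4*(1 - 1)]/22 = 0, score = 2*0 - 1 = -1.
Neutral: If all ≈ 0.5, then S = [18*0.5 + 4*(1 - 0.5)]/22 = 0.5, score = 2*0.5 - 1 = 0.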


Step 4: Proposed Refined Model
Balancing your suggestions with practicality and Morningstar’s framework, I recommend:
Select Key Metrics: Use only the most relevant IBKR metrics.
Equal Weighting Within Categories: Follow Morningstar’s approach for simplicity and grounding.
Score Calculation: Compute a value-growth spectrum from -1 to 1.
Refined Model
Value Score = mean(1 - N_P/B, 1 - N_P/Sales, 1 - N_P/Cash, 1 - N_P/E) # Possibly add: LTDebt/ShareholdersEquity, TotalDebt/TotalEquity
Growth Score = mean(N_EPS_growth_1yr, N_EPS_growth_3yr, N_EPS_growth_5yr) # Possibly add: ReturnonAssets, SalestoTotalAssets

Why This Works
Relevance: Uses metrics tied to Morningstar’s historical measures and value investing principles.
Simplicity: Equal weighting avoids overcomplication while mirroring industry practice.
No Additional Standardization: Normalization suffices for comparability.
Flexibility: Captures the spectrum effectively with available data.

'''
