In [1]:
import pandas as pd
from time import sleep
from datetime import datetime, timedelta
import logging
import traceback
import ast
import random
from tqdm import tqdm
import pyautogui
import numpy as np
from paddleocr import PaddleOCR
from AppKit import NSWorkspace, NSApplicationActivateIgnoringOtherApps
from rapidfuzz import fuzz
import gc
import re
from PIL import Image

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
logging.disable(logging.CRITICAL)

In [2]:
# Set-up functions
def extract_text(ocr, screenshot):
    """Run OCR on an image and return the confidently recognised strings.

    Args:
        ocr: Object exposing ``ocr(image, cls=False)``.
        screenshot: Image handed straight through to ``ocr.ocr``.

    Returns:
        list[str]: Text of every detected line whose confidence is above 0.85,
        with all spaces removed. Empty when the OCR call raises or detects
        nothing.
    """
    try:
        result = ocr.ocr(screenshot, cls=False)
    except Exception as e:
        print(f"Error during OCR: {e}")
        result = [None]

    text = []
    # Guard against a None/empty result as well as an empty first page, the
    # same way extract_fundamentals_text() does.
    if result and result[0]:
        for page in result:
            if not page:
                # A later page without detections must not break the loop.
                continue
            for line in page:
                # line[-1] holds the (text, confidence) pair of the detection.
                recognised, confidence = line[-1][0], line[-1][-1]
                if confidence > 0.85:
                    text.append(recognised.replace(' ', ''))
    return text

def scroll(amount, attempts=2):
    """Scroll by `amount` and stop as soon as the screen content changes.

    A reference screenshot of the region (0, 25, 1919, 1054) is taken first.
    Each attempt scrolls with a random coordinate in [550, 600] used for both
    x and y, then compares a fresh screenshot against the reference.

    Args:
        amount: Value forwarded to ``pyautogui.scroll``.
        attempts: Maximum number of scroll attempts.

    Returns:
        None. The function returns silently even if no attempt changed the
        screen.
    """
    SCROLL_BOT, SCROLL_TOP = 550, 600
    region = (0, 25, 1919, 1054)

    reference = np.array(pyautogui.screenshot(region=region))
    for _ in range(attempts):
        coord = random.randint(SCROLL_BOT, SCROLL_TOP)
        pyautogui.scroll(amount, coord, coord)
        current = np.array(pyautogui.screenshot(region=region))
        if not np.array_equal(reference, current):
            return

    # Best effort: a failed scroll is deliberately not treated as an error.
    # raise Exception(f'scroll() failed')
        
def wait(seconds=5, interval=0.5):
    """Block until a pixel of colour (247, 247, 247) shows up in a fixed screen region.

    The region (10, 280, 440, 200) is re-captured every `interval` seconds and
    only its first three colour channels are compared.

    Args:
        seconds: Time budget, measured from the moment the function is entered.
        interval: Pause before the first check and between checks.

    Raises:
        Exception: If the colour never appears within `seconds`.
    """
    deadline = datetime.now() + timedelta(seconds=seconds)
    sleep(interval)
    while datetime.now() < deadline:
        rgb = np.array(pyautogui.screenshot(region=(10, 280, 440, 200)))[:, :, :3]
        text_visible = np.any(np.all(rgb == (247, 247, 247), axis=-1))
        if text_visible:
            return
        sleep(interval)
    raise Exception('wait() text never loaded')
    
def select_tab(name, seconds=5):
    """Scroll up, click the tab whose label image is `assets/{name}/tab_label.png`, then wait().

    Args:
        name: Asset sub-folder that holds the tab's `tab_label.png`.
        seconds: Timeout forwarded to wait().

    Returns:
        bool: True when every step succeeded, False if any step raised.
    """
    try:
        scroll(999)
        label_center = pyautogui.locateCenterOnScreen(f'assets/{name}/tab_label.png', confidence=0.9)
        pyautogui.click(label_center)
        wait(seconds)
    except Exception:
        return False
    return True

def clear_modals():
    """Click away the log-off timer, "agree" and search-error pop-ups while they are on screen.

    Each pop-up type is handled in turn: its image is located and clicked
    repeatedly until locating (or clicking) raises, which ends that pop-up's
    loop.
    """
    def _dismiss(asset, x_offset=None):
        # x_offset=None clicks the located box itself; otherwise the click goes
        # `x_offset` pixels right of the box's left edge, vertically centred.
        while True:
            try:
                box = pyautogui.locateOnScreen(asset, confidence=0.9)
                if x_offset is None:
                    pyautogui.click(box)
                else:
                    pyautogui.click(box.left + x_offset, box.top + box.height // 2)
            except Exception:
                return

    _dismiss("assets/fund_setup/log_off_timer.png", 34)
    _dismiss("assets/fund_setup/agree.png")
    _dismiss("assets/fund_setup/search_error.png", 25)

def select_exchange():
    """Confirm the contract-selection pop-up if it is on screen.

    When the pop-up image is located, "up" and then "enter" are pressed.

    Returns:
        True when the pop-up was handled; None when locating it (or pressing
        the keys) raised.
    """
    try:
        pyautogui.locateOnScreen("assets/fund_setup/contract_selection.png", confidence=0.9)
        pyautogui.press("up")
        pyautogui.press("enter")
    except Exception:
        return None
    return True

def switch_to_app(app_name='Trader Workstation'):
    """Bring the first running application named `app_name` to the foreground.

    Does nothing when no running application has that localized name.
    """
    running = NSWorkspace.sharedWorkspace().runningApplications()
    target = next((app for app in running if app.localizedName() == app_name), None)
    if target is not None:
        target.activateWithOptions_(NSApplicationActivateIgnoringOtherApps)

# def check_search_results(ocr, row, left, top, width=780, height=21):
#     screenshot = pyautogui.screenshot(region=(left, top, width, height))
#     screenshot = np.array(screenshot)

#     text_list = extract_text(ocr, screenshot)
#     if text_list:
#         if len(text_list) > 1:
#             if fuzz.partial_ratio(text_list[0], str(row['symbol'])) > 85:
#                 symbol = text_list[0]
#                 if fuzz.partial_ratio(row['exchange'], text_list[-1]) >= 80 or fuzz.partial_ratio(row['primaryExchange'], text_list[-1]) >= 80:
#                     exchange = text_list[-1]
#                     return (exchange, symbol, (left + (width/2), top + height/2))
#                 else:
#                     return check_search_results(ocr, row, left, top + 21 + (height - 21)/2)
#             else:
#                 return check_search_results(ocr, row, left, top + 21 + (height - 21)/2)
#         else:
#             return check_search_results(ocr, row, left, top - 3, height=height+6)

def check_search_results(ocr, row, screenshot, screenshot_left, screenshot_top, buffer, width=840):
    """Scan a search-results screenshot strip by strip and pick the entry matching `row`.

    The screenshot is read in horizontal strips of nominally 21 px. A strip
    that yields fewer than two OCR tokens is grown (1 px up, 1 px down) and
    retried until it exceeds 42 px, at which point scanning moves on. A strip
    with two or more tokens is treated as one result row: its first token is
    the symbol, its last token the exchange.

    Args:
        ocr: OCR engine forwarded to extract_text().
        row: Mapping with 'symbol', 'exchange', 'primaryExchange' and
            'validExchanges' (comma-separated string) entries.
        screenshot: Image of the results list.
        screenshot_left: Screen x of the screenshot's left edge.
        screenshot_top: Screen y of the screenshot's top edge.
        buffer: Pixel offset inside the screenshot where scanning starts.
        width: Width used to compute the horizontal click position.

    Returns:
        tuple | None: ``(exchange_text, symbol_text, (x, y))`` for the first
        candidate whose exchange matches row['exchange'], else
        row['primaryExchange'], else any entry of row['validExchanges'];
        None when nothing matches.
    """
    HEIGHT = 21                  # nominal strip height in pixels
    max_adjustable_height = 42   # give up growing a strip beyond this height
    matches = []
    adjustable_height = HEIGHT
    top = buffer
    text_detected = False

    # The screenshot does not change while scanning, so convert it only once.
    screenshot_array = np.array(screenshot)

    while True:
        text_list = extract_text(ocr, screenshot_array[top:top + adjustable_height])

        # An empty strip ends the scan unless we are in the middle of growing a
        # strip that previously showed partial text.
        if not text_list and not text_detected:
            break

        if len(text_list) > 1:
            search_symbol = text_list[0]
            row_symbol = str(row['symbol'])
            if (len(search_symbol) == len(row_symbol)) and fuzz.partial_ratio(search_symbol, row_symbol) >= 75:
                search_exchange = text_list[-1]
                position = (screenshot_left + (width / 2), (screenshot_top + top) + adjustable_height / 2)
                matches.append((search_exchange, search_symbol, position))
            # Move to the next row
            top += HEIGHT + (adjustable_height - HEIGHT)//2
            adjustable_height = HEIGHT
            text_detected = False
        else:
            # Adjust region for better OCR
            top -= 1
            adjustable_height += 2
            text_detected = True
            if adjustable_height > max_adjustable_height:
                top += HEIGHT + (adjustable_height - HEIGHT)//2
                adjustable_height = HEIGHT
                text_detected = False

    if not matches:
        return None

    # Preference order: listed exchange, then primary exchange.
    for key in ('exchange', 'primaryExchange'):
        for match in matches:
            if fuzz.partial_ratio(row[key], match[0]) >= 80:
                return match

    # A missing value read from a DataFrame is NaN (a truthy float), so only
    # split when the field really is a non-empty string.
    valid = row['validExchanges']
    valid_exchanges = valid.split(',') if isinstance(valid, str) and valid else []
    for match in matches:
        for valid_exchange in valid_exchanges:
            if fuzz.partial_ratio(valid_exchange.strip(), match[0]) >= 80:
                return match

    return None
        
def prepare_search_results(buffer, width=840, seconds=8):
    """Locate the search-results list on screen and capture it for OCR.

    Cycles through `assets/fund_setup/search0.png` .. `search2.png` until one
    is found or `seconds` elapse, then screenshots the area below it (down to
    `search_bottom.png` when that is visible, otherwise a fixed 327 px).

    Args:
        buffer: Extra pixels captured above the first result so that
            check_search_results() can grow its OCR strips upwards.
        width: Width of the captured region.
        seconds: Time budget for locating the search header image.

    Returns:
        tuple: ``(screenshot, left, top, buffer)``.

    Raises:
        Exception: When the header is not found in time, or when the probed
            pixel has the colour [109, 111, 113, 255] (presumably the
            "no result" state - TODO confirm).
    """
    deadline = datetime.now() + timedelta(seconds=seconds)
    search = None
    i = 0
    while search is None and datetime.now() < deadline:
        try:
            search = pyautogui.locateOnScreen(f'assets/fund_setup/search{i}.png', confidence=0.8)
        except Exception:
            search = None
        if search is None:
            # Not found (raised, or None on pyautogui versions that do not raise): try the next variant.
            i = (i + 1) % 3
    if search is None:
        # Previously this path crashed with an UnboundLocalError.
        raise Exception(f'prepare_search_results() search header not found in {seconds} seconds')

    left = search.left
    top = search.top + search.height - buffer # add some buffer for OCR region adjustments in check_search_results()
    try:
        search_bottom = pyautogui.locateOnScreen(f'assets/fund_setup/search_bottom.png', confidence=0.9)
        height = (search_bottom.top - top) + search_bottom.height + (buffer - 21)
    except Exception:
        height = 327

    screenshot = pyautogui.screenshot(region=(left, top, width, height))
    # Probe the right-most pixel of the first result line to decide the list's state.
    probe = np.array(screenshot)[buffer][-1].tolist()
    if probe == [64, 64, 64, 255]:
        return screenshot, left, top, buffer
    if probe == [109, 111, 113, 255]:
        raise Exception('search_etf() did not find the correct symbol')
    # Neither state yet: retry, keeping the caller's width (it used to be dropped here).
    return prepare_search_results(buffer, width=width, seconds=seconds)
        
def search_eft(ocr, row, wait_time=5):
    """Search for row['longName'] in the application and open the matching result.

    Args:
        ocr: OCR engine forwarded to check_search_results().
        row: Mapping describing the instrument; 'longName' is typed into the
            search box, the remaining fields are used for matching.
        wait_time: Timeout forwarded to prepare_search_results().

    Returns:
        tuple: ``(exchange, symbol, exchange_bug)`` where `exchange` and
        `symbol` are the OCR'd texts of the chosen result and `exchange_bug`
        tells whether the contract-selection pop-up appeared.

    Raises:
        Exception: When no search result matches `row`.
    """
    scroll(999)
    pyautogui.press("esc")
    pyautogui.click((1880,100), interval=0.2)
    pyautogui.click(positions['search_box'], interval=0.2)
    pyautogui.write(row['longName'])
    pyautogui.press("enter")

    buffer = 15
    screenshot, left, top, buffer = prepare_search_results(buffer, seconds=wait_time)

    # check_search_results() returns None when nothing matches; unpacking that
    # directly would raise a TypeError and skip the modal clean-up below.
    match = check_search_results(ocr, row, screenshot, left, top, buffer)
    if not match:
        clear_modals()
        raise Exception('search_etf() did not find the correct symbol')

    exchange, symbol, search_result = match
    pyautogui.click(search_result)
    pyautogui.press("enter", presses=3, interval=0.3)
    # Finding this pop-up triggers a bug for the following search, so the flag
    # tells the caller to beware on the next row.
    exchange_bug = bool(select_exchange())
    clear_modals()
    return exchange, symbol, exchange_bug
    
def quick_search_etf(row, count=None, name=None):
    """Type a search term and accept the first result without verifying it.

    Args:
        row: Mapping whose 'symbol' is typed when `name` is not given.
        count: When truthy, number of "delete" and then "backspace" presses
            sent to the search box before typing.
        name: Optional search term used instead of row['symbol'].

    Returns:
        bool: True when the contract-selection pop-up was handled by
        select_exchange(), False otherwise.
    """
    pyautogui.press("esc")
    pyautogui.click((1880,100), interval=0.2)
    pyautogui.click(positions['search_box'], interval=0.2)
    if count:
        for key in ("delete", "backspace"):
            pyautogui.press(key, presses=count)

    pyautogui.write(name if name else row['symbol'])
    pyautogui.press("enter", presses=3, interval=0.3)

    exchange_bug = True if select_exchange() else False
    clear_modals()
    return exchange_bug

In [3]:
# Overview functions
# def process_profile(text_list):
#     headings = ['TotalExpenseRatio', 'TotalNetAssets', 'BenchmarkIndex', 'Domicile', 'MarketGeoFocus', 'MarketCapFocus', 'FundCategory']
#     current_label, current_values, labels , values = None, [], [], []

#     for item in text_list:
#         if item in headings:
#             if current_label:
#                 labels.append(current_label)
#                 values.append(''.join(current_values))
#                 current_values = []
#             current_label = item
#         else:
#             current_values.append(item)
            
#     labels.append(current_label)
#     values.append(''.join(current_values))

#     return list(zip(labels, values))

def process_profile(text_list):
    """Group OCR tokens of the profile panel into (heading, value) pairs.

    Each token is fuzzy-matched against the known headings. A token scoring at
    least 80 against a heading different from the current one starts a new
    section; every other token is appended to the running value, and a
    section's tokens are joined with single spaces.

    Args:
        text_list: OCR tokens in reading order.

    Returns:
        list[tuple[str, str]]: One pair per detected heading, in order.
    """
    headings = ['TotalExpenseRatio', 'TotalNetAssets', 'BenchmarkIndex', 'Domicile', 'MarketGeoFocus', 'MarketCapFocus', 'FundCategory']
    threshold = 80
    pairs = []
    current_label, current_values = None, []

    for item in text_list:
        scored = [(heading, fuzz.partial_ratio(item, heading)) for heading in headings]
        heading, score = max(scored, key=lambda pair: pair[1])

        starts_new_section = score >= threshold and current_label != heading
        if not starts_new_section:
            current_values.append(item)
            continue

        if current_label:
            pairs.append((current_label, ' '.join(current_values)))
            current_values = []
        current_label = heading

    if current_label:
        pairs.append((current_label, ' '.join(current_values)))

    return pairs

def extract_profile(ocr):
    """OCR the area between the "profile" and "lipper" headers and parse it.

    The captured region starts directly below the profile header, is 600 px
    wide and ends where the lipper header begins. After the OCR pass the page
    is scrolled by ``-height/8``.

    Returns:
        list[tuple[str, str]]: Output of process_profile().

    Raises:
        Exception: ``'skip'`` when OCR finds no text.
    """
    header = pyautogui.locateOnScreen("assets/overview/profile.png", confidence=0.9)
    left, top = header.left, header.top + header.height
    next_header = pyautogui.locateOnScreen("assets/overview/lipper.png", confidence=0.9)
    width, height = 600, next_header.top - top

    capture = np.array(pyautogui.screenshot(region=(left, top, width, height)))
    text_list = extract_text(ocr, capture)
    scroll(-height/8)
    if not text_list:
        raise Exception('skip')
    return process_profile(text_list)
    
def check_title(ocr, title, seconds=5, interval=1):
    """Wait for the page title to render and verify it starts like `title`.

    Polls the region (25, 100, 65, 25) for a pixel of colour
    (247, 247, 247, 255). Once present, the wider region (25, 100, 300, 60)
    is OCR'd and the first recognised token, upper-cased, must start with the
    first four characters of `title`.

    Args:
        ocr: OCR engine forwarded to extract_text().
        title: Expected title; only ``title[:4]`` is compared.
        seconds: Time budget, measured from function entry.
        interval: Pause before the first check and between checks.

    Returns:
        bool: True when the title matches.

    Raises:
        Exception: When the recognised title does not match, or when no title
            could be read within `seconds`.
    """
    start = datetime.now()
    timeout = timedelta(seconds=seconds)
    sleep(interval)
    while datetime.now() - start < timeout:

        # Check for white text
        screenshot = pyautogui.screenshot(region=(25, 100, 65, 25))
        screenshot = np.array(screenshot)
        text_color = (247, 247, 247, 255)
        if np.any(np.all(screenshot == text_color, axis=-1)):

            # Check text match
            screenshot = pyautogui.screenshot(region=(25, 100, 300, 60))
            screenshot = np.array(screenshot)
            text = extract_text(ocr, screenshot)
            # OCR can come back empty even though white pixels are present;
            # keep polling in that case instead of failing on text[0].
            if text:
                if text[0].upper().startswith(title[:4]):
                    return True
                raise Exception('check_title() Incorrect title')
        sleep(interval)
    raise Exception(f'check_title() title did not load in {seconds} seconds')

def check_tradable(seconds=5, interval=0.5):
    """Report whether the loaded instrument shows the red marker below its title.

    Polls the region (25, 100, 65, 25) for a pixel of colour
    (247, 247, 247, 255). Once present, the region (25, 125, 200, 40) is
    searched for the colour (240, 71, 80, 255).

    Args:
        seconds: Time budget, measured from function entry.
        interval: Pause before the first check and between checks.

    Returns:
        bool | None: False when the red colour is found, True when it is not,
        None when the title text never appeared within `seconds`.
    """
    timeout = timedelta(seconds=seconds)
    start = datetime.now()
    sleep(interval)
    while datetime.now() - start < timeout:

        # Check for white text
        screenshot = pyautogui.screenshot(region=(25, 100, 65, 25))
        screenshot = np.array(screenshot)
        text_color = (247, 247, 247, 255)
        if np.any(np.all(screenshot == text_color, axis=-1)):

            # Check for nt sign
            screenshot = pyautogui.screenshot(region=(25, 125, 200, 40))
            screenshot = np.array(screenshot)
            nt_sign_color = (240, 71, 80, 255)
            return not np.any(np.all(screenshot == nt_sign_color, axis=-1))

        # Pause between polls (as wait() and check_title() do) instead of
        # capturing screenshots in a tight loop.
        sleep(interval)
    return None

# def process_holding_types(text_list):
#     for i, element in enumerate(text_list):
#         if element.strip().lower() == 'top3':
#             text_list = text_list[:i]

#     if text_list[0][-1] == '%':
#         tuples = [(text_list[i + 1], text_list[i]) for i in range(0, len(text_list), 2)]
#     else:
#         tuples = [(text_list[i], text_list[i + 1]) for i in range(0, len(text_list), 2)]
    
#     return tuples

def process_holding_types(text_list):
    """Pair holding-type labels with the numeric token that follows each of them.

    Assumes text is identified from left to right, and top to bottom.
    Everything from the first all-uppercase token onwards is discarded. A
    label immediately followed by another label is paired with NaN; a numeric
    token without a pending label is paired with None as its label.

    Args:
        text_list: OCR tokens in reading order.

    Returns:
        list[tuple]: ``(label, value)`` pairs.
    """
    cutoff = next((i for i, element in enumerate(text_list) if element.strip().isupper()), None)
    if cutoff is not None:
        text_list = text_list[:cutoff]

    pairs = []
    pending_label = None
    for token in text_list:
        if is_numerical(token):
            pairs.append((pending_label, token))
            pending_label = None
            continue
        if pending_label:
            pairs.append((pending_label, np.nan))
        pending_label = token

    return pairs

def extract_holding_types(ocr):
    """OCR the area between the "holdings" and "dividends" headers and parse it.

    The captured region starts directly below the holdings header, is 600 px
    wide and ends where the dividends header begins. After the OCR pass the
    page is scrolled by ``-height / 8``.

    Returns:
        list[tuple]: Output of process_holding_types().

    Raises:
        Exception: ``'skip'`` when OCR yields fewer than two tokens.
    """
    header = pyautogui.locateOnScreen("assets/overview/holdings.png", confidence=0.9)
    left, top = header.left, header.top + header.height
    next_header = pyautogui.locateOnScreen("assets/overview/dividends.png", confidence=0.9)
    width, height = 600, next_header.top - top

    capture = np.array(pyautogui.screenshot(region=(left, top, width, height)))
    text_list = extract_text(ocr, capture)
    scroll(-height / 8)
    if len(text_list) <= 1:
        raise Exception('skip')
    return process_holding_types(text_list)

def process_dividends(text_list):
    """Split dividend tokens into labels and values and pair them positionally.

    Assumes text is identified from left to right, and top to bottom. Every
    token before the first one ending in '%' is a label; that token and all
    following ones are values. Labels and values are zipped, so surplus items
    on either side are dropped.

    Args:
        text_list: OCR tokens in reading order.

    Returns:
        list[tuple[str, str]]: ``(label, value)`` pairs.
    """
    # endswith() instead of item[-1] so an empty token cannot raise IndexError.
    first_value = next(
        (i for i, item in enumerate(text_list) if item.endswith('%')),
        len(text_list),
    )
    return list(zip(text_list[:first_value], text_list[first_value:]))

def extract_dividends(ocr):
    """OCR the 600x200 area below the "dividends" header and parse it.

    Returns:
        list[tuple[str, str]] | None: Output of process_dividends(), or None
        when the token 'IndustryAverage' is among the OCR results.

    Raises:
        Exception: Any failure, re-raised with an ``extract_dividends()`` prefix.
    """
    try:
        header = pyautogui.locateOnScreen("assets/overview/dividends.png", confidence=0.9)
        region = (header.left, header.top + header.height, 600, 200)

        capture = np.array(pyautogui.screenshot(region=region))
        text_list = extract_text(ocr, capture)
        if 'IndustryAverage' in text_list:
            return None
        return process_dividends(text_list)
    except Exception as e:
        raise Exception(f'extract_dividends() {e}')
    
def extract_style():
    """Read which cells of the style matrix carry the highlight colour.

    A 280x172 region, offset from the `style_matrix.png` anchor, is captured.
    If the colour (29, 51, 88, 255) occurs anywhere in it, one pixel per
    (row, column) combination is sampled on a regular grid and compared with
    that colour.

    Returns:
        list[tuple[str, bool]] | None: ``('<row>-<col>', is_highlighted)`` for
        the 4x3 combinations in row-major order, or None when the highlight
        colour is absent from the region.
    """
    anchor = pyautogui.locateOnScreen("assets/overview/style_matrix.png", confidence=0.9)
    left = anchor.left + anchor.width + 70
    top = anchor.top + anchor.height + 39
    width, height = 280, 172

    grid = np.array(pyautogui.screenshot(region=(left, top, width, height)))
    highlight_color = (29, 51, 88, 255)

    if not np.any(np.all(grid == highlight_color, axis=-1)):
        return None

    rows = ['large', 'multi', 'mid', 'small']
    columns = ['value', 'core', 'growth']
    # First -1 to get the num of internal boundaries == num of areas - 1. Second -1 to avoid index overflow
    row_step_px = round(height / (len(rows) - 1)) - 1
    col_step_px = round(width / (len(columns) - 1)) - 1
    target = list(highlight_color)

    return [
        (f'{row}-{col}', grid[row_step_px * i][col_step_px * j].tolist() == target)
        for i, row in enumerate(rows)
        for j, col in enumerate(columns)
    ]

def process_lipper(text_index, screenshot, width, height):
    """Return the 1-based index of the first sampled pixel that is background or black.

    Five pixels are sampled on image row ``16 + round(35 * text_index)`` at
    columns ``k * (round(width / 5) - 1)`` for k = 1..5.

    Args:
        text_index: Index of the label whose row is inspected.
        screenshot: Pixel array indexed as ``screenshot[row][col]``.
        width: Width used to derive the column step.
        height: Unused.

    Returns:
        int | None: k of the first pixel equal to [24, 24, 24, 255] or
        [0, 0, 0, 255]; None when none of the five matches.
    """
    empty_colors = ([24, 24, 24, 255], [0, 0, 0, 255])
    y = 16 + round(35 * text_index)
    step = round(width / 5) - 1
    for rank in range(1, 6):
        if screenshot[y][step * rank].tolist() in empty_colors:
            return rank
    return None

def extract_lipper(ocr):
    """OCR the Lipper labels and read each label's rating from the bar next to it.

    The labels are read from a 300 px wide region between the "lipper" and
    "holdings" headers. When labels are found, a second 285 px wide region to
    their right is captured and process_lipper() is evaluated once per label.
    The page is scrolled by ``-height/10`` afterwards in either case.

    Returns:
        list[tuple[str, int | None]] | None: ``(label, rating)`` pairs, or
        None when OCR finds no labels.

    Raises:
        Exception: Any failure, re-raised with an ``extract_lipper()`` prefix.
    """
    try:
        header = pyautogui.locateOnScreen("assets/overview/lipper.png", confidence=0.9)
        left = header.left
        top = header.top + header.height
        holdings = pyautogui.locateOnScreen("assets/overview/holdings.png", confidence=0.9)
        lipper_width = 300
        height = holdings.top - top

        labels_img = np.array(pyautogui.screenshot(region=(left, top, lipper_width, height)))
        text_list = extract_text(ocr, labels_img)

        ratings = None
        if text_list:
            width = 285
            bars_img = np.array(
                pyautogui.screenshot(region=(left+lipper_width+24, top+12, width, height-12))
            )
            ratings = [
                (label, process_lipper(i, bars_img, width, 34))
                for i, label in enumerate(text_list)
            ]

        scroll(-height/10)
        return ratings
    except Exception as e:
        # The prefix used to say extract_dividends() (copy-paste slip), which
        # made failures here indistinguishable from dividend failures.
        raise Exception(f'extract_lipper() {e}')

In [4]:
# Holding functions
def show_more(type=1):
    """Click the "show more" control if it can be found; ignore any failure.

    Args:
        type: 1 selects `show_more.png`; any other value selects
            `show_more2.png`.
    """
    asset = "assets/holdings/show_more.png" if type == 1 else "assets/holdings/show_more2.png"
    try:
        button = pyautogui.locateCenterOnScreen(asset, confidence=0.9)
        pyautogui.click(button)
    except Exception:
        pass

def process_top10(text_list):
    """Turn the top-10 holdings tokens into (name, percentage) pairs.

    Assumes text is identified from left to right, and top to bottom. At the
    start of each entry, one token of at most two characters that does not end
    in '%' is skipped. Tokens are then collected as name parts until a
    numerical token ending in '%' closes the entry; the parts are joined with
    '-'.

    Args:
        text_list: OCR tokens in reading order.

    Returns:
        list[tuple[str, str]]: ``(name, percentage)`` pairs.
    """
    expecting_rank = True
    name_parts, rows = [], []

    for token in text_list:
        is_pct = token.endswith('%')
        if expecting_rank and len(token) <= 2 and not is_pct:
            expecting_rank = False
        elif is_numerical(token) and is_pct:
            rows.append(('-'.join(name_parts), token))
            name_parts = []
            expecting_rank = True
        else:
            name_parts.append(token)
            expecting_rank = False
    return rows

def extract_top10(ocr):
    """OCR the 626x455 area directly below the "top10" header via capture_text().

    Returns:
        Whatever capture_text() returns for process_top10.
    """
    header = pyautogui.locateOnScreen("assets/holdings/top10.png", confidence=0.9)
    region_left = header.left
    region_top = header.top + header.height
    return capture_text(ocr, process_top10, region_left, region_top, 626, 455)

def process_industry(text_list):
    """Pair each industry label with the numeric token that precedes it.

    A numeric token that is followed by another numeric token is emitted with
    NaN as its label. A label with no pending numeric token gets None as its
    value.

    Args:
        text_list: OCR tokens in reading order.

    Returns:
        list[tuple]: ``(label, value)`` pairs.
    """
    pairs = []
    pending_value = None
    for token in text_list:
        if not is_numerical(token):
            pairs.append((token, pending_value))
            pending_value = None
            continue
        if pending_value:
            pairs.append((np.nan, pending_value))
        pending_value = token

    return pairs

def process_holding_tables(text_list):
    """Pair each label with the numeric token that follows it.

    A label immediately followed by another label is paired with NaN; a
    numeric token without a pending label is paired with None as its label.

    Args:
        text_list: OCR tokens in reading order.

    Returns:
        list[tuple]: ``(label, value)`` pairs.
    """
    pairs = []
    pending_label = None
    for token in text_list:
        if is_numerical(token):
            pairs.append((pending_label, token))
            pending_label = None
            continue
        if pending_label:
            pairs.append((pending_label, np.nan))
        pending_label = token
    return pairs

def extract_industry(ocr):
    """Expand, scroll to and OCR the industry table.

    After show_more() and a scroll proportional to the header height, the
    header is located again. The captured region is 550 px wide, starts 40 px
    right of the header and ends at the "show less" control, or spans 450 px
    when that control cannot be located.

    Returns:
        Whatever capture_text() returns for process_industry.
    """
    show_more()
    header = pyautogui.locateOnScreen("assets/holdings/industry.png", confidence=0.9)
    scroll(-(header.height*3/4))

    # The scroll moved the header, so find it again before measuring.
    header = pyautogui.locateOnScreen("assets/holdings/industry.png", confidence=0.9)
    region_left = header.left + 40
    region_top = header.top + header.height
    try:
        region_height = pyautogui.locateOnScreen("assets/holdings/show_less.png", confidence=0.9).top - region_top
    except Exception:
        region_height = 450

    return capture_text(ocr, process_industry, region_left, region_top, 550, region_height)

def extract_country(ocr):
    """Scroll to, expand and OCR the country table.

    The captured region is 460 px wide, starts 50 px right of the "country"
    header and ends at the top of the "currency" header.

    Returns:
        Whatever capture_text() returns for process_holding_tables.
    """
    header = pyautogui.locateOnScreen("assets/holdings/country.png", confidence=0.9)
    scroll(-(header.top/15))
    show_more(2)

    # Locate the header again: scrolling and expanding moved it.
    header = pyautogui.locateOnScreen("assets/holdings/country.png", confidence=0.9)
    region_left = header.left + 50
    region_top = header.top + header.height
    next_header = pyautogui.locateOnScreen("assets/holdings/currency.png", confidence=0.9)
    region_height = next_header.top - region_top

    return capture_text(ocr, process_holding_tables, region_left, region_top, 460, region_height)

def extract_currency(ocr):
    """Scroll to, expand and OCR the currency table.

    The captured region is 460 px wide, starts 50 px right of the "currency"
    header and ends at the "debtor quality" header, or at BOTTOM when that
    header cannot be located.

    Returns:
        Whatever capture_text() returns for process_holding_tables.
    """
    header = pyautogui.locateOnScreen("assets/holdings/currency.png", confidence=0.9)
    scroll(-(header.top/20))
    show_more(2)
    scroll(-(header.top/50), 1)

    # Locate the header again: scrolling and expanding moved it.
    header = pyautogui.locateOnScreen("assets/holdings/currency.png", confidence=0.9)
    region_left = header.left + 50
    region_top = header.top + header.height
    try:
        region_height = pyautogui.locateOnScreen("assets/holdings/debtor_quality.png", confidence=0.9).top - region_top
    except Exception:
        region_height = BOTTOM - region_top

    return capture_text(ocr, process_holding_tables, region_left, region_top, 460, region_height)

# Resize function
def capture_text(ocr, function, left, top, width, height):
    """Screenshot a region, OCR it and hand the tokens to `function`.

    If `function` raises on the first attempt, the region is grown by 4 px on
    every side, captured and OCR'd again, and `function` is called once more
    (this time any exception propagates).

    Args:
        ocr: OCR engine forwarded to extract_text().
        function: Callable taking the list of OCR tokens.
        left, top, width, height: Screen region to capture.

    Returns:
        The result of `function`, or None when OCR finds no text.
    """
    screenshot = np.array(pyautogui.screenshot(region=(left, top, width, height)))
    text_list = extract_text(ocr, screenshot)
    if not text_list:
        return None

    try:
        return function(text_list)
    except Exception:
        # `except Exception` rather than a bare `except`, so Ctrl-C and
        # SystemExit are not swallowed by the retry.
        expand_px = 4
        screenshot = np.array(pyautogui.screenshot(
            region=(left - expand_px, top - expand_px, width + expand_px*2, height + expand_px*2)
        ))
        text_list = extract_text(ocr, screenshot)
        if text_list:
            return function(text_list)
        return None

In [5]:
# Bond functions
def extract_debtors(ocr, name):
    """Scroll down once and OCR the table below the `assets/holdings/{name}.png` header.

    The captured region is 405 px wide and reaches from just below the header
    down to BOTTOM.

    Args:
        ocr: OCR engine forwarded to capture_text().
        name: File stem of the header image inside `assets/holdings/`.

    Returns:
        Whatever capture_text() returns for process_holding_tables.
    """
    scroll(-99, 1)
    header = pyautogui.locateOnScreen(f"assets/holdings/{name}.png", confidence=0.9)
    region_top = header.top + header.height
    return capture_text(ocr, process_holding_tables, header.left, region_top, 405, BOTTOM - region_top)

In [6]:
# Fundamentals functions
def extract_fundamentals_text(ocr, screenshot):
    """Run OCR and return the confident detections together with their boxes.

    Args:
        ocr: Object exposing ``ocr(image, cls=False)``.
        screenshot: Image handed straight through to ``ocr.ocr``.

    Returns:
        list[dict]: One ``{'text', 'bbox', 'conf'}`` dict per detection whose
        confidence is above 0.85; 'text' has all spaces removed. Empty when
        the OCR call raises or detects nothing.
    """
    try:
        pages = ocr.ocr(screenshot, cls=False)
    except Exception as e:
        print(f"Error during OCR: {e}")
        pages = [None]

    if not (pages and pages[0]):
        return []

    detections = []
    for page in pages:
        # Each detection is (bbox, ..., (text, confidence)).
        for bbox, (text, conf) in ((line[0], line[-1]) for line in page):
            if conf > 0.85:
                detections.append({
                    'text': text.replace(' ', ''),
                    'bbox': bbox,
                    'conf': conf
                })
    return detections


def calculate_value_crop(label_bbox, screenshot, offset=5, expansion=0):
    """
    Given a label's bounding box, calculates the region where its value should be.

    The region starts `offset` pixels to the right of the label, runs to the
    right edge of the screenshot and covers the label's vertical extent. It is
    grown by `expansion` pixels upwards and downwards only (not horizontally)
    and clipped to the screenshot.

    Args:
        label_bbox: Sequence of (x, y) points outlining the label.
        screenshot: Array whose ``shape`` is (height, width, ...).
        offset: Horizontal gap between the label and the region.
        expansion: Vertical growth applied above and below.

    Returns:
        tuple[int, int, int, int]: ``(x, y, width, height)``; width and height
        are never negative.
    """
    x_coords = [pt[0] for pt in label_bbox]
    y_coords = [pt[1] for pt in label_bbox]
    label_top, label_bottom = min(y_coords), max(y_coords)
    label_right = max(x_coords)

    img_height, img_width = screenshot.shape[0], screenshot.shape[1]

    # Region to the right of the label, before clipping.
    value_left = label_right + offset
    new_x = max(value_left, 0)
    new_y = max(label_top - expansion, 0)
    new_width = img_width - value_left
    new_height = (label_bottom - label_top) + 2 * expansion

    # Ensure the region stays within the screenshot boundaries.
    if new_x + new_width > img_width:
        new_width = img_width - new_x
    if new_y + new_height > img_height:
        new_height = img_height - new_y

    # A label touching the right/bottom edge used to yield a negative size;
    # report an empty region instead.
    new_width = max(new_width, 0)
    new_height = max(new_height, 0)

    return (int(new_x), int(new_y), int(new_width), int(new_height))


def detect_value_with_expansion(ocr, screenshot, label_bbox, initial_offset=5, max_expansion=49):
    """
    Looks for a value next to a label, growing the crop one pixel at a time.

    For expansion = 0 .. max_expansion the crop from calculate_value_crop() is
    cut out of `screenshot` and OCR'd; the search stops at the first crop that
    yields any detection.

    Returns:
        The text of the first detection of that crop, or None if no crop up to
        `max_expansion` produced a detection.
    """
    for expansion in range(max_expansion + 1):
        x, y, w, h = calculate_value_crop(label_bbox, screenshot, offset=initial_offset, expansion=expansion)
        found = extract_fundamentals_text(ocr, screenshot[y:y + h, x:x + w])
        if found:
            return found[0]['text']
    return None


def process_fundamentals(detections, screenshot, ocr):
    """Pair fundamentals labels with their values.

    A detection counts as a value when is_numerical() accepts its text or the
    text is all uppercase; it is paired with the pending label, if any. Any
    other detection is a label, except the literal 'Equity', which is skipped.
    When a new label arrives while another is still pending, the pending
    label's value is looked up next to its bounding box with
    detect_value_with_expansion(). The same lookup is done for a label left
    over at the end.

    Args:
        detections: Dicts with 'text' and 'bbox' entries, in reading order.
        screenshot: Image the detections came from.
        ocr: OCR engine used for the value look-ups.

    Returns:
        list[tuple]: ``(label, value)`` pairs; value may be None.
    """
    pairs = []
    pending_label, pending_bbox = None, None

    def _lookup(bbox):
        return detect_value_with_expansion(ocr, screenshot, bbox, initial_offset=5)

    for det in detections:
        text, bbox = det['text'], det['bbox']

        if is_numerical(text) or text.isupper():
            if pending_label is not None:
                pairs.append((pending_label, text))
                pending_label = None  # reset after pairing
            continue

        if text == 'Equity':
            continue
        if pending_label:
            pairs.append((pending_label, _lookup(pending_bbox)))
        pending_label, pending_bbox = text, bbox

    # Handle a leftover label.
    if pending_label and pending_label != 'Equity':
        pairs.append((pending_label, _lookup(pending_bbox)))

    return pairs


def extract_fundamentals(ocr, prev_list=None):
    """OCR the fundamentals table, scrolling once when it does not fit on screen.

    The table is captured from below `metric.png` (or `top_border.png` when
    the former is not found) down to BOTTOM, 550 px wide. When the first pass
    yields more than 15 detections the page is scrolled by -999 and a second
    pass is made; the pairs of both passes are then merged without duplicates.

    Args:
        ocr: OCR engine.
        prev_list: Pairs from the previous pass; None on the first pass.

    Returns:
        list[tuple]: ``(label, value)`` pairs. After a merge, the previous
        pass's pairs come first.

    Raises:
        Exception: When OCR finds no text.
    """
    try:
        top_screenshot_boundary = pyautogui.locateOnScreen("assets/fundamentals/metric.png", confidence=0.9)
    except pyautogui.ImageNotFoundException:
        top_screenshot_boundary = pyautogui.locateOnScreen("assets/fundamentals/top_border.png", confidence=0.9)
    left = top_screenshot_boundary.left
    top = top_screenshot_boundary.top + top_screenshot_boundary.height
    width = 550
    height = BOTTOM - top

    screenshot = pyautogui.screenshot(region=(left, top, width, height))
    screenshot = np.array(screenshot)

    text_list = extract_fundamentals_text(ocr, screenshot)
    if not text_list:
        raise Exception('extract_fundamentals() error')

    current_list = process_fundamentals(text_list, screenshot, ocr)

    if prev_list is not None:
        # Second pass: always merge with the first pass. Previously a short
        # second screen (<= 15 detections) discarded the first pass, and an
        # empty first pass could recurse without end.
        if set(current_list) == set(prev_list):
            return current_list
        # dict.fromkeys de-duplicates like set() but keeps a stable order.
        return list(dict.fromkeys(prev_list + current_list))

    if len(text_list) <= 15:
        return current_list

    scroll(-999)
    return extract_fundamentals(ocr, current_list)

In [7]:
# Cleaning functions
def clean_labels(label, col):
    """Normalise an OCR'd label according to the column it belongs to.

    Args:
        label: The label; anything that is not a str is returned unchanged.
        col: Column name selecting the rule:
            'industries'    - strip a trailing '-Discontinuedeff09/19/2020';
            'holding_types' - drop one leading '■' or '1';
            'debtors'       - replace full-width '（' with '(';
            'fundamentals'  - map 'LTDebt/ShareholdersEquity' to
                              'LTDebt/Shareholders'.

    Returns:
        The cleaned label, or `label` itself when no rule applies.
    """
    if not isinstance(label, str):
        return label

    if col == 'industries':
        suffix = '-Discontinuedeff09/19/2020'
        if label.endswith(suffix):
            # Remove only the suffix; split('-')[0] also cut hyphenated names
            # down to their first part.
            return label[:-len(suffix)]
    elif col == 'holding_types':
        if label.startswith(('■', '1')):
            return label[1:]
    elif col == 'debtors':
        return label.replace('（', '(')
    elif col == 'fundamentals':
        if label == 'LTDebt/ShareholdersEquity':
            return 'LTDebt/Shareholders'
    return label
    
def correct_digit(value_str):
    """Parse a number out of an OCR'd string, ignoring stray characters.

    Everything except digits and '.' is removed before conversion; a leading
    '-' is kept as the sign.

    Returns:
        float when the remaining characters form a number, otherwise
        `value_str` unchanged (this includes non-string input).
    """
    try:
        stripped = value_str.strip()
        magnitude = float(re.sub(r'[^\d.]', '', stripped))
        # The regex above also removes '-', so restore the sign explicitly.
        return -magnitude if stripped.startswith('-') else magnitude
    except Exception:
        return value_str

def clean_values(value_str):
    """Convert an OCR'd value string to a float where possible.

    Rules, applied in order to str input:
      * contains 'of20': the part before 'asof20' is read as
        ``<number><k|m|b|t>`` and scaled by 1e3 / 1e6 / 1e9 / 1e12;
      * contains '%': the number is divided by 100;
      * otherwise: plain number via correct_digit().

    Returns:
        float on success. When the text cannot be parsed, a string is
        returned instead: the part before 'asof20' for the first rule (or the
        whole input when that part is empty), the original input otherwise.
        Non-string input is returned unchanged.
    """
    if not isinstance(value_str, str):
        return value_str

    if 'of20' in value_str:
        head = value_str.split('asof20')[0]
        if not head:
            # Nothing precedes the date marker, so there is no magnitude to read.
            return value_str
        multiplier = {'k': 10**3, 'm': 10**6, 'b': 10**9, 't': 10**12}.get(head[-1].lower())
        digit = correct_digit(head[:-1])
        # Only scale real numbers: multiplying an unparsed str by 10**n would
        # build a gigantic repeated string.
        if multiplier is not None and isinstance(digit, float):
            return digit * multiplier
        return head

    if '%' in value_str:
        digit = correct_digit(value_str.replace('%', ''))
        # An unparsed str cannot be divided; hand the input back instead.
        return digit / 100 if isinstance(digit, float) else value_str

    return correct_digit(value_str)

def clean_df(df):
    """Clean every column of `df` in place and return it.

    Per column, four element-wise passes are applied: evaluate_literal(),
    then - for cells that are lists - clean_labels() on the first element of
    each 2-tuple, clean_values() on the second element, and finally a sort by
    the first element (an empty string is used as the key for items that are
    not tuples or whose first element is falsy).

    Args:
        df: DataFrame to clean; its columns are overwritten.

    Returns:
        The same DataFrame object.
    """
    def _is_pair(item):
        return isinstance(item, tuple) and len(item) == 2

    def _relabel(cell, column):
        if not isinstance(cell, list):
            return cell
        return [(clean_labels(item[0], column), item[1]) if _is_pair(item) else item for item in cell]

    def _revalue(cell):
        if not isinstance(cell, list):
            return cell
        return [(item[0], clean_values(item[1])) if _is_pair(item) else item for item in cell]

    def _sort_key(item):
        return item[0] if isinstance(item, tuple) and item[0] else ''

    def _reorder(cell):
        return sorted(cell, key=_sort_key) if isinstance(cell, list) else cell

    for col in df.columns:
        df[col] = df[col].apply(evaluate_literal)
        df[col] = df[col].apply(lambda cell: _relabel(cell, col))
        df[col] = df[col].apply(_revalue)
        df[col] = df[col].apply(_reorder)
    return df

In [10]:
def main(remaining, dfs, wait_time):
    """Scrape each instrument in ``remaining`` and append one result frame per instrument to ``dfs``.

    Rows are visited in reverse order. For each row the instrument is looked
    up with ``search_eft``; then the overview fields, the holdings tab, the
    bond breakdowns and the fundamentals tab are read through the
    ``extract_*`` helpers. The scraped fields are combined with the input row
    into a single-row DataFrame that is appended to ``dfs``.

    Args:
        remaining: DataFrame of instruments still to scrape; ``symbol``,
            ``longName`` and ``exchange`` are read here for error reporting.
        dfs: list that receives one single-row DataFrame per scraped
            instrument (mutated in place).
        wait_time: wait value forwarded to ``search_eft`` and ``select_tab``.

    Returns:
        None. Returns early, after printing the traceback, when an
        unexpected error occurs while scraping an instrument.

    Raises:
        Exception('manual'): when the PyAutoGUI fail-safe message is seen, or
            when an inner step already raised ``'manual'``.
        Exception('bug found'): after recording an instrument for which
            ``search_eft`` reported ``exchange_bug``.
    """
    FAILSAFE_MESSAGE = 'PyAutoGUI fail-safe triggered from mouse moving to a corner of the screen. To disable this fail-safe, set pyautogui.FAILSAFE to False. DISABLING FAIL-SAFE IS NOT RECOMMENDED.'

    def _reason(exc):
        # First positional argument of the exception, or None when it has none.
        return exc.args[0] if exc.args else None

    exchange_bug = False

    # Alternative orders (sorted by conId, or not reversed) can be had by
    # changing the frame that is iterated here.
    for _, row in tqdm(remaining.iloc[::-1].iterrows(), total=len(remaining)):
        # A new OCR engine is built for every instrument and gc.collect() runs
        # after each one -- presumably to bound memory use; TODO confirm.
        ocr = PaddleOCR()
        profile = tradable = holding_types = dividends = top10 = None
        industries = countries = currencies = None
        debtors = maturity = debt_type = None
        fundamentals = lipper = style = None

        try:
            try:
                search_wait = 1 if exchange_bug else wait_time
                search_exchange, search_symbol, exchange_bug = search_eft(ocr, row, search_wait)
                exact_search = True
            except Exception as e:
                if _reason(e) == FAILSAFE_MESSAGE:
                    raise Exception('manual')
                # The quick-search fallback (quick_search_etf) is disabled:
                # an instrument that cannot be found exactly is skipped
                # without being recorded.
                continue

            # Overview
            tradable = check_tradable()
            style = extract_style()
            profile = extract_profile(ocr)
            lipper = extract_lipper(ocr)
            holding_types = extract_holding_types(ocr)
            dividends = extract_dividends(ocr)

            # Holdings tab
            if select_tab('holdings', wait_time):
                top10 = extract_top10(ocr)
                industries = extract_industry(ocr)
                countries = extract_country(ocr)
                currencies = extract_currency(ocr)

            # Bond data: best effort, any failure leaves the remaining bond
            # fields as None.
            try:
                debtors = extract_debtors(ocr, 'debtor_quality')
                maturity = extract_debtors(ocr, 'maturity')
                debt_type = extract_debtors(ocr, 'debt_type')
            except Exception:
                pass

            # Ratios and Fundamentals tab
            if select_tab('fundamentals', wait_time):
                fundamentals = extract_fundamentals(ocr)

        except Exception as e:
            reason = _reason(e)
            if exchange_bug or reason == 'skip':
                # Record whatever was collected so far.
                pass
            elif reason == FAILSAFE_MESSAGE:
                print(e)
                raise Exception('manual')
            elif reason == 'manual':
                raise Exception('manual')
            else:
                traceback.print_exc()
                print(f'\nmain() {e} - Symbol: {row["symbol"]} - Name: {row["longName"]} - Exchange: {row["exchange"]}\n')
                return

        df = pd.DataFrame({
            'date_scraped': [datetime.now().strftime('%Y-%m-%d')],
            'exchange_bug': [exchange_bug],
            'exact_search': [exact_search],
            'search_exchange': [search_exchange],
            'search_symbol': [search_symbol],
            'tradable': [tradable],
            'profile': [profile],
            'style': [style],
            'lipper': [lipper],
            'fundamentals': [fundamentals],
            'holding_types': [holding_types],
            'dividends': [dividends],
            'top10': [top10],
            'industries': [industries],
            'countries': [countries],
            'currencies': [currencies],
            'debtors': [debtors],
            'maturity': [maturity],
            'debt_type': [debt_type],
        })

        # Put the input row and the scraped fields side by side in one record.
        row = pd.DataFrame(row).T.reset_index(drop=True)
        df = pd.concat([row, df], axis=1)
        dfs.append(df)
        gc.collect()
        if exchange_bug:
            # The row is recorded first, then the run is stopped.
            raise Exception('bug found')

def evaluate_literal(val):
    """Decode a string holding a Python literal; return anything else unchanged.

    Args:
        val: a cell value. Strings such as ``"[('Cash', 0.01)]"`` are parsed
            with ``ast.literal_eval``.

    Returns:
        The parsed object, or ``val`` itself when it is not a string or is
        not a valid literal.
    """
    if not isinstance(val, str):
        # literal_eval rejects non-strings anyway; skip the parse attempt.
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # TypeError covers e.g. an unhashable dict key inside the literal;
        # MemoryError / RecursionError cover pathologically nested input.
        return val
    
def load(path):
    df = pd.read_csv(path)
    for col in df.columns:
        df[col] = df[col].apply(evaluate_literal)
    return df

def save(dfs):
    """Merge freshly scraped frames into ``data/contract_elaborated.csv``.

    The new frames are concatenated and cleaned with ``clean_df``. If the CSV
    already exists it is loaded, cleaned and appended after the new rows, and
    rows repeating the same (symbol, exact_search, search_exchange,
    search_symbol) are dropped, keeping the first occurrence (new rows come
    first). Finally, every row whose symbol appears more than once and whose
    ``exact_search`` is False is removed, and the result is written back.

    Args:
        dfs: non-empty list of DataFrames produced by ``main``.

    Raises:
        ValueError: from ``pd.concat`` when ``dfs`` is empty.
    """
    final_df = clean_df(pd.concat(dfs))
    try:
        temp_df = clean_df(load('data/contract_elaborated.csv'))
        final_df = pd.concat([final_df, temp_df]).drop_duplicates(
            subset=['symbol', 'exact_search', 'search_exchange', 'search_symbol']
        )
    except FileNotFoundError:
        # First run: there is no previous file to merge with.
        pass

    # The concatenated index is not unique (every scraped frame carries index
    # 0). Dropping by index label would therefore also remove unrelated rows
    # that share a label, so select the rows with a boolean mask instead.
    final_df = final_df.reset_index(drop=True)
    shares_symbol = final_df.duplicated(subset='symbol', keep=False)
    inexact = final_df['exact_search'] == False
    final_df = final_df[~(shares_symbol & inexact)]

    final_df.to_csv('data/contract_elaborated.csv', index=False)

def is_numerical(val):
    """Tell whether ``val``, rendered as text with every '%' removed, parses as a float."""
    try:
        float(str(val).replace('%', ''))
    except Exception:
        return False
    else:
        return True

def is_valid_tuple(tuple, column):
    """Decide whether one (label, value) entry of a list column looks well-formed.

    Args:
        tuple: the entry to check, expected to be a 2-item (label, value)
            sequence. (The parameter name shadows the builtin; it is kept so
            existing callers are unaffected.)
        column: name of the column the entry comes from; selects the
            column-specific rule for non-numeric values.

    Returns:
        ``False`` for anything that is not a 2-item sequence or whose label
        is not a string. ``True`` when the value is ``None`` or numeric.
        Otherwise the result depends on ``column``:

        * ``'profile'``: label and value are both truthy;
        * ``'fundamentals'``: the value is an upper-case string;
        * ``'dividends'``: the value equals ``'Unknown'``;
        * ``'style'``: the value is a bool;
        * any other column: ``False``.
    """
    # A 2-character string would unpack like a pair, so reject text first.
    if isinstance(tuple, (str, bytes)):
        return False
    try:
        label, value = tuple
    except (TypeError, ValueError):
        # Not iterable, or not exactly two items.
        return False

    if not isinstance(label, str):
        return False
    if value is None:
        # Lenient on purpose; return False here for a more rigid filter.
        return True
    if is_numerical(value):
        return True

    if column == 'profile':
        return bool(value and label)
    if column == 'fundamentals':
        # Guard the str method: the value may be a bool or another non-string.
        return isinstance(value, str) and value.isupper()
    if column == 'dividends':
        return bool(value == 'Unknown')
    if column == 'style':
        return isinstance(value, bool)
    return False

def is_row_valid(row):
    """Check every list-valued cell of ``row`` with ``is_valid_tuple``.

    Returns ``True`` when all entries of all list cells pass (cells that are
    not lists are ignored), ``False`` as soon as one entry fails.
    """
    return all(
        is_valid_tuple(entry, name)
        for name in row.index
        if isinstance(row[name], list)
        for entry in row[name]
    )

def has_bad_multiplier(long_name):
    """Detect a multiplier token greater than 1X in an instrument name.

    ``-`` and ``+`` are removed first, so ``-3X`` and ``+2X`` count as ``3X``
    and ``2X``. A token only counts when the whole whitespace-separated word
    is digits followed by an upper-case ``X``.

    Args:
        long_name: the instrument's long name. Values that are not strings
            (for example a missing name read as NaN) are treated as having
            no multiplier.

    Returns:
        ``True`` if any such token has a number above 1, else ``False``.
    """
    if not isinstance(long_name, str):
        return False
    cleaned = long_name.replace('-', '').replace('+', '')
    return any(
        re.fullmatch(r'\d+X', word) is not None and int(word[:-1]) > 1
        for word in cleaned.split()
    )

def get_remaining():
    """Return the contracts from ``data/contract_details.csv`` that still need scraping.

    When ``data/contract_elaborated.csv`` exists, its rows that pass
    ``is_row_valid`` are inspected and every symbol that has
    ``exchange_bug == True``, ``exact_search == True`` or a non-null
    ``profile`` is excluded. When the file is missing, all contracts remain.
    In both cases names flagged by ``has_bad_multiplier`` are removed and only
    the identifying columns are returned.

    To debug invalid rows instead, set ``remaining`` to the rows of the
    elaborated file for which ``is_row_valid`` is False.
    """
    output_columns = [
        'symbol', 'exchange', 'primaryExchange', 'validExchanges', 'currency',
        'conId', 'longName', 'stockType', 'isin',
    ]

    contract_details = load('data/contract_details.csv')
    try:
        scraped = load('data/contract_elaborated.csv')
        scraped = scraped[scraped.apply(is_row_valid, axis=1)]

        flagged_bug = scraped['exchange_bug'] == True
        found_exactly = scraped['exact_search'] == True
        has_profile = scraped['profile'].notna()
        # Drop `has_profile` from the mask to re-scrape instruments that were
        # only found through an inexact search.
        already_done = scraped.loc[flagged_bug | found_exactly | has_profile, 'symbol']

        remaining = contract_details[~contract_details['symbol'].isin(already_done)]
    except FileNotFoundError:
        remaining = contract_details.copy()

    multiplier_mask = remaining['longName'].apply(has_bad_multiplier)
    remaining = remaining[~multiplier_mask]
    return remaining[output_columns]

In [None]:
# Python 3.12.8 brew

---
### Run main
---
1. Start Trader Workstation
2. Set custom font size to 18 in settings
3. Restart TW and open fundamental explorer
4. Type in and load any instrument
5. Minimize and maximize fundamental explorer window to fill the window width.
6. Set the keyboard input source to US QWERTY (instrument names are typed through simulated key presses)
7. Now you can run the following

In [11]:
# In-memory copy of the frames that were pending whenever the run loop below
# hits an exception (it does `backup += dfs`).
backup = list()

In [13]:
# Run MAIN
# Repeats scrape -> save until nothing is left, no progress can be made, or
# main() signals 'bug found' / 'manual'.
while True:
    # Fresh batch per attempt, created before anything can fail, so the
    # handler never sees an undefined or stale list.
    dfs = []
    try:
        switch_to_app()
        # ocr = PaddleOCR()
        wait_time = 6
        BOTTOM = 1070

        positions = {
            'file': (82, 44),
            'file_fund_option': (143, 120),
            'maximize': (51, 40),
            'search_box': (100, 45),
        }
        remaining = get_remaining()
        if remaining.empty:
            print('nothing left to scrape')
            break

        main(remaining, dfs, wait_time)
        if not dfs:
            # main() came back without a single row; retrying would visit the
            # same instruments again, so stop instead of looping forever.
            print('no rows scraped in this pass - stopping')
            break
        save(dfs)

    except Exception as e:
        traceback.print_exc()
        if dfs:
            # Keep what was scraped before the failure, in memory and on disk.
            backup += dfs
            save(dfs)

        reason = e.args[0] if e.args else None
        if reason == 'bug found':
            print('bug found')
            break
        if reason == 'manual':
            print('manual')
            break
        if not dfs:
            # Failure before anything was scraped (e.g. while loading the
            # input files): do not retry endlessly.
            print('failed before scraping any row - stopping')
            break

  1%|          | 69/12030 [46:38<134:44:37, 40.55s/it]
Traceback (most recent call last):
  File "/var/folders/lc/8ly6wjg14dx9xm7yjml4w3z80000gn/T/ipykernel_65734/1739163630.py", line 16, in main
    search_exchange, search_symbol, exchange_bug = search_eft(ocr, row, wait_time)
                                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/lc/8ly6wjg14dx9xm7yjml4w3z80000gn/T/ipykernel_65734/2287154095.py", line 214, in search_eft
    pyautogui.write(row['longName'])
  File "/Users/alex/Documents/pystocks/venv/lib/python3.12/site-packages/pyautogui/__init__.py", line 594, in wrapper
    returnVal = wrappedFunction(*args, **kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/alex/Documents/pystocks/venv/lib/python3.12/site-packages/pyautogui/__init__.py", line 1687, in typewrite
    failSafeCheck()
  File "/Users/alex/Documents/pystocks/venv/lib/python3.12/site-packages/pyautogui/__init__.py", line 1734, in failSafeCheck
    ra

manual


---
### Clean and explode list columns
---

In [49]:
# Load the scraped contracts, normalise them, and keep only the rows whose
# list columns pass is_row_valid.
contracts_df = clean_df(load('data/contract_elaborated.csv'))
contracts_df = contracts_df[contracts_df.apply(is_row_valid, axis=1)]

# Optional manual filters:
# contracts_df = contracts_df[contracts_df['exact_search'] == False]
# contracts_df = contracts_df[~contracts_df['symbol'].isin(bad_symbols)]

# Append .to_csv('data/contract_elaborated.csv', index=False) to write the
# filtered frame back to disk.
contracts_df

Unnamed: 0,symbol,exchange,primaryExchange,validExchanges,currency,conId,longName,stockType,isin,date_scraped,exchange_bug,exact_search,search_exchange,search_symbol,tradable,profile,style,lipper,fundamentals,holding_types,dividends,top10,industries,countries,currencies,debtors,maturity,debt_type
0,159828,SEHKSZSE,SEHKSZSE,SEHKSZSE,CNH,728761081.0,GUOTAI CSI MED SERVICE ETF,ETF,CNE100005S04,2025-02-27,0.0,True,GUOTAICSIMEDSERVICEETF(SEHKSZSE),159828,True,"[(Domicile, China), (MarketCapFocus, BroadMarket), (MarketGeoFocus, China), (TotalNetAssets, CNY2.19BaSof2024/12/31)]","[(large-core, False), (large-growth, True), (large-value, False), (mid-core, False), (mid-growth, False), (mid-value, False), (multi-core, False), (multi-growth, True), (multi-value, False), (small-core, False), (small-growth, False), (small-value, False)]","[(ConsistentReturn, 1), (Preservation, 1), (TotalReturn, 1)]","[(EBITtoInterest, 704.62), (EPSGrowth-1yr, 9.17), (EPS_growth_3yr, 19.07), (EPS_growth_5yr, 27.14), (LTDebt/Shareholders, 0.1), (Price/Book, 4.46), (Price/Cash, 23.77), (Price/Earnings, 31.12), (Price/Sales, 7.41), (RelativeStrength, 4.91), (ReturnonAssets1Yr, 11.91), (ReturnonAssets3Yr, 15.12), (ReturnonCapital, 15.15), (ReturnonCapital3Yr, 19.0), (ReturnonEquity1Yr, 15.96), (ReturnonEquity3Yr, 19.08), (ReturnonInvestment1Yr, 14.55), (ReturnonInvestment3Yr, 18.41), (SalestoTotalAssets, 0.5), (TotalAssets/TotalEquity, 1.43), (TotalDebt/TotalCapital, 0.16), (TotalDebt/TotalEquity, 0.12)]","[(Cash, 0.009300000000000001), (Equity, 0.9898), (Other, 0.0009)]",,"[(002223-JIANGSUYUYUEMEDICAL..., 0.0332), (300003-LepuMedicalTechnology(.., 0.0288), (300015-AierEyeHospitalGroupC., 0.0712), (300347-HangzhouTigermedConsu..., 0.0374), (300760-ShenzhenMindrayBio-Me..., 0.1024), (300832-ShenzhenNewIndustries..., 0.0316), (300896-IMEIKTECHNOLOGYDEVE..., 0.039), (603259-WuXiAppTecCo.,Ltd., 0.09759999999999999), (688271-ShanghaiUnitedImaging..., 0.0538), (688617-APTMedicalInc, 0.0296)]","[(BasicMaterials, 0.025099999999999997), (ConsumerCyclicals, None), (Healthcare, 0.9384), (Industrials, 0.0), (NonClassifiedEquity, 0.0), (Technology, 0.0152), (TelecommunicationServices, 0.0111), (Utilities, 0.0)]","[(China, 0.9991), (Unidentified, 0.0009)]","[(<NoCurrency>, 0.0009), (ChineseYuan, 
0.9991)]",,,
1,159837,SEHKSZSE,SEHKSZSE,SEHKSZSE,CNH,571433109.0,EFD CSI BIOTECH THEMATIC ETF,ETF,CNE100004GF3,2025-02-27,0.0,True,EFDCSIBIOTECHTHEMATICETF(SEHKSZSE),159837,True,"[(Domicile, China), (MarketCapFocus, BroadMarket), (MarketGeoFocus, China), (TotalNetAssets, CNY1.28BaSof2024/12/31)]","[(large-core, False), (large-growth, True), (large-value, False), (mid-core, False), (mid-growth, False), (mid-value, False), (multi-core, False), (multi-growth, True), (multi-value, False), (small-core, False), (small-growth, False), (small-value, False)]","[(ConsistentReturn, 4), (Preservation, 1), (TotalReturn, 4)]","[(EBITtoInterest, 1358.31), (EPSGrowth-1yr, 5.39), (EPS_growth_3yr, 9.5), (EPS_growth_5yr, 15.88), (LTDebt/Shareholders, 0.06), (Price/Book, 4.13), (Price/Cash, 27.28), (Price/Earnings, 30.86), (Price/Sales, 8.77), (RelativeStrength, 5.14), (ReturnonAssets1Yr, 8.44), (ReturnonAssets3Yr, 12.17), (ReturnonCapital, 14.05), (ReturnonCapital3Yr, 17.46), (ReturnonEquity1Yr, 11.03), (ReturnonEquity3Yr, 13.85), (ReturnonInvestment1Yr, 10.18), (ReturnonInvestment3Yr, 15.16), (SalestoTotalAssets, 0.47), (TotalAssets/TotalEquity, 1.4), (TotalDebt/TotalCapital, 0.12), (TotalDebt/TotalEquity, 0.1)]","[(Cash, 0.0104), (Equity, 0.9887), (Other, 0.0009)]",,"[(000661-CHANGCHUNHIGH-TECHI.., 0.0373), (002252-ShanghaiRAASBloodProd..., 0.052199999999999996), (300122-ChongqingZhifeiBiologica..., 0.0421), (300347-HangzhouTigermedConsu.., 0.0317), (300760-ShenzhenMindrayBio-Me..., 0.10210000000000001), (600161-BeijingTiantanBiological..., 0.0363), (600196-ShanghaiFosunPharmace.., 0.0354), (600276-JiangsuHengruiPharmace..., 0.09609999999999999), (603259-WuXiAppTecCo.,Ltd., 0.0974), (603392-BeijingWantaiBiologicalP..., 0.0315)]","[(BasicMaterials, 0.0044), (ConsumerCyclicals, 0.0), (Healthcare, 0.9749), (Industrials, 0.0), (NonClassifiedEquity, 0.0), (Technology, 0.0003), (TelecommunicationServices, 0.0089), (Utilities, 0.0)]","[(CaymanIslands, 0.0167), (China, 0.9823999999999999), 
(Unidentified, 0.0009)]","[(<NoCurrency>, 0.0009), (ChineseYuan, 0.9991)]",,,
2,159840,SEHKSZSE,SEHKSZSE,SEHKSZSE,CNH,717319886.0,ICBC CNI NEV BATTERY ETF,ETF,CNE100004PT5,2025-02-27,0.0,True,ICBCCNINEVBATTERYETF(SEHKSZSE),159840,False,"[(Domicile, China), (MarketCapFocus, BroadMarket), (MarketGeoFocus, China), (TotalNetAssets, CNY1.14BaSof2024/12/31)]","[(large-core, False), (large-growth, True), (large-value, False), (mid-core, False), (mid-growth, False), (mid-value, False), (multi-core, False), (multi-growth, False), (multi-value, False), (small-core, False), (small-growth, False), (small-value, False)]","[(ConsistentReturn, 3), (Preservation, 1), (TotalReturn, 3)]","[(EBITtoInterest, 252.61), (EPSGrowth-1yr, 13.07), (EPS_growth_3yr, 58.61), (EPS_growth_5yr, 37.73), (LTDebt/Shareholders, 0.31), (Price/Book, 2.54), (Price/Cash, 10.61), (Price/Earnings, 21.16), (Price/Sales, 1.53), (RelativeStrength, 2.65), (ReturnonAssets1Yr, 5.44), (ReturnonAssets3Yr, 7.76), (ReturnonCapital, 8.08), (ReturnonCapital3Yr, 10.52), (ReturnonEquity1Yr, 13.82), (ReturnonEquity3Yr, 17.21), (ReturnonInvestment1Yr, 10.28), (ReturnonInvestment3Yr, 13.19), (SalestoTotalAssets, 0.68), (TotalAssets/TotalEquity, 2.9), (TotalDebt/TotalCapital, 0.84), (TotalDebt/TotalEquity, 0.44)]","[(Cash, 0.0070999999999999995), (Equity, 0.9922), (Other, 0.0007000000000000001)]",,"[(002050-ZHEJIANGSANHUAINTEL., 0.056100000000000004), (002074-GotionHigh-techCo.,Ltd., 0.029500000000000002), (002340-GEMCo.,Ltd., 0.0438), (002460-GanfengLithiumGroupCo..., 0.045), (002594-BYDCOMPANYLIMITED, 0.16940000000000002), (002709-GuangzhouTinciMaterials..., 0.029900000000000003), (300014-EVEEnergyCo.,Ltd., 0.0717), (300207-SUNWODAELECTRONICC..., 0.0297), (300750-ContemporaryAmperexTe..., 0.149), (603799-ZHEJIANGHUAYOUCOBAL.., 0.043)]","[(BasicMaterials, 0.2955), (ConsumerCyclicals, 0.16940000000000002), (Industrials, 0.5126), (NonClassifiedEquity, 0.0), (Technology, 0.0146)]","[(China, 0.9993000000000001), (Unidentified, 0.0007000000000000001)]","[(<NoCurrency>, 0.0007000000000000001), 
(ChineseYuan, 0.9993000000000001)]",,,
3,1615,JPNNEXT,TSEJ,"SMART,TSEJ,JPNNEXT,CBOE.JPN",JPY,39015931.0,NEXT FUNDS TOPIX BANKS ETF,ETF,JP3040170007,2025-02-27,0.0,True,NEXTFUNDSTOPIXBANKSETF(TSEJ),1615,False,"[(Domicile, Japan), (MarketCapFocus, BroadMarket), (MarketGeoFocus, Japan), (TotalNetAssets, 304350000000.0)]","[(large-core, False), (large-growth, False), (large-value, True), (mid-core, False), (mid-growth, False), (mid-value, False), (multi-core, False), (multi-growth, False), (multi-value, False), (small-core, False), (small-growth, False), (small-value, False)]","[(ConsistentReturn, 5), (Preservation, 3), (TotalReturn, 5)]","[(EBITtoInterest, 0.89), (EPSGrowth-1yr, 24.67), (EPS_growth_3yr, 19.88), (EPS_growth_5yr, 14.19), (LTDebt/Shareholders, 1.16), (Price/Book, 0.98), (Price/Cash, 10.24), (Price/Earnings, 12.69), (Price/Sales, 3.2), (RelativeStrength, 15.62), (ReturnonAssets1Yr, 0.38), (ReturnonAssets3Yr, 0.3), (ReturnonCapital, 1.54), (ReturnonCapital3Yr, 1.2), (ReturnonEquity1Yr, 8.07), (ReturnonEquity3Yr, 6.13), (ReturnonInvestment1Yr, 8.09), (ReturnonInvestment3Yr, 6.38), (SalestoTotalAssets, 0.02), (TotalAssets/TotalEquity, 22.1), (TotalDebt/TotalCapital, 4.44), (TotalDebt/TotalEquity, 0.8)]","[(Equity, 0.9984000000000001), (Other, 0.0016)]","[(Div.YieldTTM, 0.022400000000000003), (DividendTTM, 9.11), (PayoutRatio, 0.9981)]","[(107167-MebukiFinancialGroup,Inc., 0.009399999999999999), (7182-JAPANPOSTBANKCo.,Ltd., 0.0361), (7186-ConcordiaFinancialGroup...., 0.0131), (8306-MitsubishiUFJFinancialGr..., 0.345), (8308-ResonaHoldings,Inc., 0.038900000000000004), (8309-SumitomoMitsuiTrustGro..., 0.037599999999999995), (8316-SumitomoMitsuiFinancial.., 0.2194), (8331-TheChibaBank,Ltd, 0.011200000000000002), (8354-FukuokaFinancialGroup,I., 0.01), (8411-MizuhoFinancialGroup,Inc, 0.158)]","[(Financials, 0.9984000000000001)]","[(Japan, 0.9984000000000001), (Unidentified, 0.0016)]","[(<NoCurrency>, 0.0016), (JapaneseYen, 0.9984000000000001)]",,,
4,1617,TSEJ,TSEJ,"SMART,TSEJ,JPNNEXT,CBOE.JPN",JPY,55275720.0,NF TPX-17 FOODS,ETF,JP3046560003,2025-02-27,0.0,True,NFTPX-17FO0DS(TSEJ),1617,False,"[(Domicile, Japan), (MarketCapFocus, BroadMarket), (MarketGeoFocus, Japan), (TotalNetAssets, 3940000000.0)]","[(large-core, False), (large-growth, False), (large-value, False), (mid-core, False), (mid-growth, False), (mid-value, False), (multi-core, False), (multi-growth, False), (multi-value, True), (small-core, False), (small-growth, False), (small-value, False)]","[(ConsistentReturn, 3), (Preservation, 5), (TotalReturn, 4)]","[(EBITtoInterest, 2089.34), (EPSGrowth-1yr, 21.21), (EPS_growth_3yr, 15.37), (EPS_growth_5yr, 8.46), (LTDebt/Shareholders, 0.24), (Price/Book, 1.97), (Price/Cash, 10.93), (Price/Earnings, 19.89), (Price/Sales, 1.44), (RelativeStrength, 7.7), (ReturnonAssets1Yr, 5.75), (ReturnonAssets3Yr, 5.15), (ReturnonCapital, 7.73), (ReturnonCapital3Yr, 7.47), (ReturnonEquity1Yr, 10.22), (ReturnonEquity3Yr, 9.46), (ReturnonInvestment1Yr, 7.43), (ReturnonInvestment3Yr, 6.76), (SalestoTotalAssets, 0.84), (TotalAssets/TotalEquity, 1.94), (TotalDebt/TotalCapital, 0.35), (TotalDebt/TotalEquity, 0.23)]","[(Equity, 0.9892), (Other, 0.0108)]","[(Div.YieldTTM, 0.015600000000000001), (DividendTTM, 554.0), (PayoutRatio, 0.040999999999999995)]","[(2267-YAKULTHONSHACO.,LTD., 0.032799999999999996), (2269-MeijiHoldingsCo.,Ltd., 0.032400000000000005), (2502-AsahiGroupHoldings,Ltd., 0.10220000000000001), (2503-KirinHoldingsCompany,Li.., 0.0662), (2587-SuntoryBeverage&Food., 0.0275), (2801-KikkomanCorpORD, 0.0436), (2802-AjinomotoCo.,Inc., 0.11689999999999999), (2875-TOYOSUISANKAISHA,LTD., 0.0375), (2897-NISSINFOODSHOLDINGS..., 0.0355), (2914-JAPANTOBACCOINC, 0.19519999999999998)]","[(BasicMaterials, 0.0005), (ConsumerNon-Cyclicals, 0.9876), (Healthcare, 0.0011)]","[(Japan, 0.9892), (Unidentified, 0.0108)]","[(<NoCurrency>, 0.0108), (JapaneseYen, 0.9892)]",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3397,C078,SWB,FWB,"FWB,SWB",EUR,53766411.0,COMSTAGE ETF DJ ST 600 T&L-I,ETP,LU0378437254,2025-02-14,1.0,True,COMSTAGEETFDJST600T&L-I(FWB),C078,False,,,,,,,,,,,,,
3398,C011,SWB,FWB,"FWB,SWB",EUR,53766421.0,COMSTAGE ETF NASDAQ-100,ETP,LU0378449770,2025-02-14,1.0,True,COMSTAGEETFNASDAQ-100(FWB),C011,False,,,,,,,,,,,,,
3399,C100,FWB,FWB,"FWB,SWB",EUR,53766431.0,COMSTAGE ETF EONIA,ETP,LU0378437684,2025-02-14,1.0,True,COMSTAGEETFEONIA(FWB),C100,False,,,,,,,,,,,,,
3400,X508,FWB,FWB,"FWB,SWB",EUR,70600407.0,COMSTAGE EU LQ SV DV 15+TR-I,ETP,LU0444606536,2025-02-14,1.0,True,COMSTAGEEULQSVDV15+TR-I(FWB),X508,False,,,,,,,,,,,,,


In [None]:
# Manual checks: list the distinct labels or values stored in one list column.
contracts_df = load('data/contract_elaborated.csv')
contracts_df = clean_df(contracts_df)
contracts_df = contracts_df[contracts_df.apply(is_row_valid, axis=1)]

# Column to inspect, one of:
# 'profile', 'style', 'lipper', 'fundamentals', 'holding_types', 'dividends', 'top10', 'industries', 'countries', 'currencies', 'debtors', 'maturity', 'debt_type'
column = 'top10'
# Position inside each (label, value) tuple: 1 to see unique values, 0 to see unique labels
element_index = 1

filtered_column = contracts_df[column].dropna().tolist()

# Collect the chosen element from every tuple of every list cell
all_second_elements = [
    elem[element_index]
    for cell in filtered_column if isinstance(cell, list)
    for elem in cell if isinstance(elem, tuple) and len(elem) > 1
]


def _mixed_sort_key(item):
    """Sort key that keeps numbers, strings and everything else (e.g. None) apart.

    A plain sort raises TypeError as soon as the collected values mix types,
    which happens whenever a column holds both numbers and None.
    """
    if isinstance(item, (int, float)):
        return (0, item, '')
    if isinstance(item, str):
        return (1, 0, item)
    return (2, 0, repr(item))


# Get unique values
unique_labels = sorted(set(all_second_elements), key=_mixed_sort_key)
# unique_labels = list(set([elem if isinstance(elem, str) else 'NUMBSR' for elem in unique_labels]))
# unique_labels = list(set([elem if isinstance(elem, float) else np.nan for elem in unique_labels]))
unique_labels, len(unique_labels)

In [None]:
# Flag the symbols whose `column` cell contains a known-bad, missing or empty label.
bad_labels = ['Equity', 'SalestoTotalAssetsLTDebt/Shareholders'] # Change manually

# Start the accumulator on first use so this cell also runs on a fresh
# kernel; an existing list is kept and extended.
try:
    bad_symbols
except NameError:
    bad_symbols = []


def _has_bad_label(cell):
    """True when a list cell holds a tuple whose label is bad, None or empty."""
    if not isinstance(cell, list):
        return False
    return any(
        elem[0] in bad_labels or elem[0] is None or elem[0] == ''
        for elem in cell
    )


splice = contracts_df[contracts_df[column].apply(_has_bad_label)]
symbols = splice['symbol'].to_list()
# To restart the collection instead of extending it: bad_symbols = symbols
bad_symbols = list(set(bad_symbols) | set(symbols))
display(splice[['symbol','exchange', 'primaryExchange', 'search_exchange', column]])
print(len(bad_symbols))


In [None]:
# Flag symbols whose fundamentals list has an unexpected number of entries.
# Expected sizes are 22 or 5 (sometimes 4 occurs as well, and is flagged here).
print(len(bad_symbols))

contracts_df['lengths'] = contracts_df['fundamentals'].apply(
    lambda cell: len(cell) if isinstance(cell, list) else np.nan
)
unexpected_size = ~contracts_df['lengths'].isin([5, 22]) & contracts_df['lengths'].notna()
splice = contracts_df.loc[unexpected_size, ['symbol', 'exchange', 'primaryExchange', 'lengths']]
symbols = splice['symbol'].to_list()
bad_symbols = list(set(bad_symbols + symbols))

print(len(bad_symbols))

---
### Explode
---

In [None]:
# Spread the (holding type, value) pairs of `holding_types` into one column
# per holding type, next to the original contract columns.
contracts_df = load('data/contract_elaborated.csv')
columns_to_explode = ['profile', 'style', 'lipper', 'fundamentals', 'holding_types', 'dividends', 'industries', 'countries', 'currencies', 'debtors', 'maturity', 'debt_type', 'top10']
for col in columns_to_explode:
    # Missing cells become empty lists so every cell can be iterated.
    contracts_df[col] = contracts_df[col].fillna('[]').apply(evaluate_literal)


def _pairs_to_record(pairs):
    """Turn a list of (label, value) tuples into a dict, first value per label wins."""
    record = {}
    if isinstance(pairs, list):
        for item in pairs:
            if isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str):
                record.setdefault(item[0], item[1])
    return record


# One row per contract, built on the contract index itself. The previous
# explode/pivot route assigned onto a duplicated index and then replaced
# contracts_df with the pivot, which discarded every contract column.
holding_wide = pd.DataFrame(
    [_pairs_to_record(pairs) for pairs in contracts_df['holding_types']],
    index=contracts_df.index,
)
holding_wide = holding_wide[sorted(holding_wide.columns, key=str)]

contracts_df = pd.concat([contracts_df, holding_wide], axis=1)
# If a holding type is named like an existing column, the existing column wins.
contracts_df = contracts_df.loc[:, ~contracts_df.columns.duplicated()]
contracts_df

In [None]:
'''

growth score = 2 * [ N_P/B + N_P/E + N_P/Cash + N_P/Sales + N_EPS_growth_1yr + N_EPS_growth_3yr + N_EPS_growth_5yr + 
              N_ReturnonAssets1Yr + N_ReturnonAssets3Yr + N_ReturnonCapital + N_ReturnonCapital3Yr + 
              N_ReturnonEquity1Yr + N_ReturnonEquity3Yr + N_ReturnonInvestment1Yr + N_ReturnonInvestment3Yr + 
              N_SalestoTotalAssets + N_EBITtoInterest + N_RelativeStrength + 
              (1 - N_LTDebt/ShareholdersEquity) + (1 - N_TotalAssets/TotalEquity) + 
              (1 - N_TotalDebt/TotalCapital) + (1 - N_TotalDebt/TotalEquity) ] / 22 - 1

Extreme Growth: If all growth indicators ≈ 1 and value indicators ≈ 0, then S = [18*1 + 4*1]/22 = 1, score = 2*1 - 1 = 1.
Extreme Value: If all growth indicators ≈ 0 and value indicators ≈ 1, then S = [18*0 + 4*0]/22 = 0, score = 2*0 - 1 = -1.
Neutral: If all ≈ 0.5, then S = [18*0.5 + 4*0.5]/22 = 0.5, score = 2*0.5 - 1 = 0.


Step 4: Proposed Refined Model
Balancing your suggestions with practicality and Morningstar’s framework, I recommend:
Select Key Metrics: Use only the most relevant IBKR metrics.
Equal Weighting Within Categories: Follow Morningstar’s approach for simplicity and grounding.
Score Calculation: Compute a value-growth spectrum from -1 to 1.
Refined Model
Value Score = mean((1 - N_P/B), (1 - N_P/Sales), (1 - N_P/Cash), (1 - N_P/E)) # Possibly add: LTDebt/ShareholdersEquity, TotalDebt/Equity
Growth Score = mean(N_EPS_growth_1yr, N_EPS_growth_3yr, N_EPS_growth_5yr) # Possibly add: ReturnonAssets, SalestoTotalAssets

Why This Works
Relevance: Uses metrics tied to Morningstar’s historical measures and value investing principles.
Simplicity: Equal weighting avoids overcomplication while mirroring industry practice.
No Additional Standardization: Normalization suffices for comparability.
Flexibility: Captures the spectrum effectively with available data.

'''
