In [None]:
import os
from ib_async import *
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm.auto import tqdm
from time import sleep
import math
import re
import traceback


# Notebook-wide pandas display settings: never truncate cell text or hide
# columns, but cap rendered frames at 50 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

In [None]:
def has_multiplier(long_name):
    """Tell whether an instrument name carries a leverage token other than 1X.

    Hyphens and plus signs are treated as word separators. A word counts as a
    leverage token when it consists solely of digits followed by an upper-case
    ``X`` (``3X``) or an upper-case ``X`` followed by digits (``X2``).

    Args:
        long_name: Instrument name to inspect.

    Returns:
        True if at least one leverage token has a numeric part different
        from 1, otherwise False.
    """
    token_re = re.compile(r'\d+X|X\d+')
    words = long_name.replace('-', ' ').replace('+', ' ').split()

    for token in words:
        if token_re.fullmatch(token) is None:
            continue
        # Strip the single 'X' to get the multiplier; a 1X product is unleveraged.
        if int(token.replace('X', '', 1)) != 1:
            return True
    return False

# Bar type handled by this notebook run; switch by (un)commenting.
# kind = 'midpoint'
kind = 'trades'

# Root folder for each supported bar type. Looking the folder up (instead of an
# if/elif without else) makes an unsupported `kind` fail here with a clear
# message rather than later with a NameError on `root`.
_ROOT_BY_KIND = {
    'midpoint': 'data/daily-midpoint/',
    'trades': 'data/daily-trades/',
}
if kind not in _ROOT_BY_KIND:
    raise ValueError(f"Unsupported kind {kind!r}; expected one of {sorted(_ROOT_BY_KIND)}")
root = _ROOT_BY_KIND[kind]

# Folder holding one CSV per instrument, and the CSV listing verified files.
data_path = root + 'series/'
verified_path = root + 'verified_files.csv'

# Fundamentals table consulted by the verification pass below.
fund_df = pd.read_csv('data/fundamentals.csv')

# `error` is True when no usable verified-files list could be loaded, which
# forces the verification pass to run without asking the user.
error = False
try:
    verified_df = pd.read_csv(verified_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # A missing file and an empty (header-less) file both mean there is no
    # usable list yet; rebuild it instead of aborting the cell.
    error = True

# Rebuild the verified-files list when none could be loaded, or on request.
# Each series file is checked against the fundamentals table and the contract
# details returned by IB; files that fail a check are deleted from disk.
if error or input('Update verified files? (y/n)').lower().strip() == 'y':
    util.startLoop()
    ib = IB()
    ib.connect('127.0.0.1', 7497, clientId=4)

    try:
        # <symbol>-<exchange>-<currency>.csv. The exchange part may contain a
        # dot (IB exchange codes such as 'BVME.ETF'); without allowing it those
        # files would be treated as malformed and deleted.
        file_name_pattern = re.compile(r'^(.*?)-([A-Z0-9.]+)-([A-Z]{3})\.csv$')

        file_list = os.listdir(data_path)
        verified_files = []

        for file_name in tqdm(file_list, total=len(file_list), desc="Verifying files"):
            if not file_name.endswith('.csv'):
                continue

            file_path = os.path.join(data_path, file_name)
            try:
                match = file_name_pattern.match(file_name)
                if not match:
                    print(f"Deleting malformed filename: {file_name}")
                    os.remove(file_path)
                    continue
                symbol, exchange, currency = match.groups()

                symbol_data = fund_df[(fund_df['symbol'] == symbol) & (fund_df['currency'] == currency)]
                if symbol_data.empty:
                    print(f"No fundamental data for {symbol}. Deleting file.")
                    os.remove(file_path)
                    continue

                contract_details = ib.reqContractDetails(Stock(symbol, exchange, currency))
                if not contract_details:
                    print(f"No contract details from IBKR for {symbol}. Deleting file.")
                    os.remove(file_path)
                    continue

                conid = contract_details[0].contract.conId
                if conid not in symbol_data['conId'].values:
                    print("conId mismatch")
                    # Only delete when the conId is unknown to the whole
                    # fundamentals table; otherwise keep the file unverified.
                    if conid not in fund_df['conId'].values:
                        os.remove(file_path)
                    continue

                instrument_name = symbol_data['longName'].iloc[0]
                if has_multiplier(instrument_name):
                    print(f"Leveraged instrument detected: {instrument_name}. Deleting file.")
                    os.remove(file_path)
                    continue

                verified_files.append({'symbol': symbol, 'currency': currency, 'exchange': exchange, 'conId': conid})
            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"Error processing {file_name}: {e}")

        # Fixed columns keep the CSV header intact even when nothing was verified.
        new_rows = pd.DataFrame(verified_files, columns=['symbol', 'currency', 'exchange', 'conId'])
        if error:
            # No previous list was loaded in this run; do not merge with a
            # `verified_df` that may be left over from an earlier kernel state.
            verified_df = new_rows
        elif verified_files:
            verified_df = pd.concat([verified_df, new_rows], ignore_index=True)
        verified_df = verified_df.drop_duplicates()
        verified_df.to_csv(verified_path, index=False)
    finally:
        # Always release the IB client slot, even if the pass failed midway.
        ib.disconnect()

In [None]:
def get_historical(symbol, exchange, currency, duration='40 Y', kind=None):
    """Request daily bars for one stock from IB via the module-level ``ib``.

    Args:
        symbol: Ticker symbol passed to ``Stock``.
        exchange: Exchange code passed to ``Stock``.
        currency: Currency code passed to ``Stock``.
        duration: Value for the request's ``durationStr`` (e.g. ``'40 Y'``).
        kind: ``'midpoint'`` or ``'trades'``; selects the ``whatToShow`` value.

    Returns:
        A tuple ``(data, length, exchange)``. ``data`` is whatever
        ``ib.reqHistoricalData`` returned. ``length`` is the number of bars,
        reduced by one when ``exchange`` is ``'SMART'`` and data was returned,
        so that on an equal bar count a specific exchange ranks above SMART
        when callers sort by ``length``.

    Raises:
        ValueError: If ``kind`` is not one of the supported values. This is
            raised before any contract is built or request is sent.
    """
    what_to_show = {'midpoint': 'MIDPOINT', 'trades': 'TRADES'}.get(kind)
    if what_to_show is None:
        raise ValueError(f'Incorrect kind: {kind!r}')

    contract = Stock(symbol, exchange, currency)
    data = ib.reqHistoricalData(
        contract,
        endDateTime='',
        durationStr=duration,
        barSizeSetting='1 day',
        whatToShow=what_to_show,
        useRTH=True,
    )

    # An empty (or missing) reply counts as zero bars.
    length = len(data) if data else 0
    if data and exchange == 'SMART':
        length -= 1
    return data, length, exchange

def save_data(data_path, data, symbol, exchange, currency):
    """Write downloaded bars to ``<data_path><symbol>-<exchange>-<currency>.csv``.

    Nothing is written when ``data`` is empty. Otherwise the bars are turned
    into a DataFrame with ``util.df``, the ``date`` column is reduced to plain
    dates, and the rows are stored in ascending date order without an index
    column. ``data_path`` is used as a plain string prefix of the file name.
    """
    if not data:
        return

    target = f'{data_path}{symbol}-{exchange}-{currency}.csv'
    bars = util.df(data)
    bars['date'] = pd.to_datetime(bars['date']).dt.date
    bars.sort_values(by='date').reset_index(drop=True).to_csv(target, index=False)

In [None]:
def fill_internal_gaps(df, symbol, exchange, currency):
    """Try to fill runs of missing business days inside a daily series.

    The series is compared against the business-day calendar between its first
    and last date. Every run of at least ``MIN_GAP_SIZE_TO_FILL`` consecutive
    days without an ``open`` value is requested once from IB, and any bars
    returned are merged in without overwriting existing values. Scanning
    repeats until no untried qualifying gap remains.

    Reads the module-level ``ib``, ``kind`` and ``MIN_GAP_SIZE_TO_FILL``.

    Args:
        df: Frame with at least ``date`` and ``open`` columns. It is not
            modified; a copy is processed.
        symbol: Ticker symbol passed to ``Stock``.
        exchange: Exchange code passed to ``Stock``.
        currency: Currency code passed to ``Stock``.

    Returns:
        ``df`` itself when it is empty or has no ``date`` column; otherwise a
        new frame with ``date`` as a column of plain dates, sorted ascending.
    """
    if df.empty or 'date' not in df.columns:
        return df

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df['date'] = pd.to_datetime(df['date']).dt.date
    df = df.set_index('date').sort_index()

    contract = Stock(symbol, exchange, currency)
    # Same upper-case whatToShow values that get_historical sends.
    what_to_show = kind.upper()
    # (gap_start, gap_end) pairs already requested, so that a gap the API
    # cannot fill is not requested again and the loop terminates.
    attempted_gaps = set()

    while True:
        first_date, last_date = df.index.min(), df.index.max()
        business_days = pd.date_range(start=first_date, end=last_date, freq='B')
        # Convert the date index explicitly before aligning it with the
        # datetime-based business-day calendar.
        full_df = df.set_axis(pd.to_datetime(df.index)).reindex(business_days)
        is_missing = full_df['open'].isnull()
        if not is_missing.any():
            break

        # Consecutive days with the same missing/present state share an id.
        gap_id = is_missing.ne(is_missing.shift()).cumsum()
        mask = is_missing.to_numpy()
        gap_info = (
            pd.DataFrame({'day': full_df.index[mask], 'gap_id': gap_id.to_numpy()[mask]})
            .groupby('gap_id')['day']
            .agg(gap_start='min', gap_end='max', gap_size='size')
        )
        significant_gaps = gap_info[gap_info['gap_size'] >= MIN_GAP_SIZE_TO_FILL]
        gaps_to_try = [
            gap
            for gap in zip(significant_gaps['gap_start'], significant_gaps['gap_end'])
            if gap not in attempted_gaps
        ]
        if not gaps_to_try:
            break

        # Earliest untried gap first.
        gap_start, gap_end = gaps_to_try[0]
        attempted_gaps.add((gap_start, gap_end))

        # Request one extra calendar day on each side of the gap.
        start_fill_date = gap_start - timedelta(days=1)
        end_fill_date = gap_end + timedelta(days=1)
        days_to_request = (end_fill_date - start_fill_date).days + 1

        duration_str = f'{math.ceil(days_to_request / 365)} Y' if days_to_request >= 365 else f'{days_to_request} D'
        end_date_str = end_fill_date.strftime('%Y%m%d 00:00:00')

        gap_data = ib.reqHistoricalData(
            contract,
            endDateTime=end_date_str,
            durationStr=duration_str,
            barSizeSetting='1 day',
            whatToShow=what_to_show,
            useRTH=True,
            timeout=30
        )

        if gap_data:
            gap_df = util.df(gap_data)
            gap_df['date'] = pd.to_datetime(gap_df['date']).dt.date
            gap_df = gap_df.set_index('date')
            # Existing values win; only missing cells/rows come from gap_df.
            df = df.combine_first(gap_df)
        else:
            print(f"[{symbol}] API returned no data for gap. Marked as attempted, continuing scan.")

    return df.rename_axis('date').reset_index().sort_values(by='date')

In [None]:
# Open the IB session stored in the module-level `ib`, which get_historical
# and fill_internal_gaps read. Connects to 127.0.0.1:7497 with client id 3
# (the verification cell uses client id 4).
# NOTE(review): util.startLoop() presumably prepares the notebook's event loop
# for ib_async's blocking calls — confirm against the ib_async docs.
util.startLoop()
ib = IB()
ib.connect('127.0.0.1', 7497, clientId=3)

In [None]:
# Get missing historical series
# For each duration (longest first), request every instrument that has no
# series file yet from its candidate exchanges and keep the longest reply.
years = ['40 Y', '20 Y', '10 Y', '5 Y', '3 Y', '2 Y', '1 Y']
# years = ['40 Y']

# The fundamentals table does not change between passes, so load it once.
fund_df = pd.read_csv('data/fundamentals.csv')
# Keep only the text inside the parentheses; rows without parentheses become ''.
fund_df['search_exchange'] = fund_df['search_exchange'].str.extract(r'\((.*?)\)', expand=False).fillna('')

for duration in years:
    # Files saved during earlier passes count as present, so rescan every pass.
    file_list = os.listdir(data_path)
    file_keys = set()
    for name in file_list:
        if not name.endswith('.csv'):
            continue
        try:
            # Split from the right so hyphens inside a symbol stay in the
            # symbol instead of making the name look malformed (which would
            # cause the series to be downloaded again on every pass).
            symbol, exchange, currency = name[:-len('.csv')].rsplit('-', 2)
        except ValueError:
            print(f"Skipping malformed filename: {name}")
            continue
        file_keys.add((symbol, currency))

    # Identify missing symbols based on symbol and currency
    is_missing = pd.Series(
        [key not in file_keys for key in zip(fund_df['symbol'], fund_df['currency'])],
        index=fund_df.index,
        dtype=bool,
    )
    missing_symbols = fund_df[is_missing].copy()

    count = 0
    for _, row in tqdm(missing_symbols.iterrows(), total=len(missing_symbols), desc=f"Getting {duration} series"):
        symbol = row['symbol']
        currency = row['currency']

        # Candidate exchanges in priority order. Blank/NaN entries are dropped
        # and duplicates removed so each exchange is queried at most once.
        candidates = [row['search_exchange'], row['exchange'], row['primaryExchange'], 'SMART']
        exchanges = list(dict.fromkeys(e for e in candidates if isinstance(e, str) and e))

        results = [
            get_historical(symbol, candidate, currency, duration=duration, kind=kind)
            for candidate in exchanges
        ]

        # Sort by data length and save the best result. The sort is stable, so
        # on equal length the higher-priority exchange is kept.
        results.sort(key=lambda result: result[1], reverse=True)
        best_data, best_length, best_exchange = results[0]
        if best_length:
            save_data(data_path, best_data, symbol, best_exchange, currency)
            count += 1

    print(f'{duration}: {count} scraped')

In [None]:
# Update historical series
# Extra days requested before the last stored date so the new bars overlap the
# stored ones; overlapping dates take the newly downloaded values.
OVERLAP_BUFFER_DAYS = 5
# Minimum run of missing business days that fill_internal_gaps tries to fill.
MIN_GAP_SIZE_TO_FILL = 5

# <symbol>-<exchange>-<currency>.csv. The exchange part may contain a dot
# (IB exchange codes such as 'BVME.ETF'); without allowing it those series
# would be skipped as malformed and never updated.
SERIES_FILE_PATTERN = re.compile(r'^(.*?)-([A-Z0-9.]+)-([A-Z]{3})\.csv$')

fill_internal = input('fill internal gaps?(y/n)').lower().strip() == 'y'
file_list = os.listdir(data_path)
for file_name in tqdm(file_list, total=len(file_list), desc="Updating series"):
    if not file_name.endswith('.csv'):
        print(f'skipping {file_name}')
        continue

    match = SERIES_FILE_PATTERN.match(file_name)
    if not match:
        print(f"Skipping malformed filename: {file_name}")
        continue
    symbol, exchange, currency = match.groups()

    file_path = os.path.join(data_path, file_name)
    temp_file_path = file_path + '.tmp'

    try:
        if os.path.getsize(file_path) == 0:
            print(f"Skipping empty file: {file_name}")
            continue

        data_df = pd.read_csv(file_path)
        if 'date' not in data_df.columns or data_df.empty:
            print(f"Skipping file with no 'date' column or no data: {file_name}")
            continue
        data_df['date'] = pd.to_datetime(data_df['date']).dt.date

        if fill_internal:
            data_df = fill_internal_gaps(data_df, symbol, exchange, currency)

        last_date = data_df['date'].max()
        days_missing = (datetime.now().date() - last_date).days
        if days_missing > 1:
            days_to_request = days_missing + OVERLAP_BUFFER_DAYS
            duration = f'{math.ceil(days_to_request / 365)} Y' if days_to_request >= 365 else f'{days_to_request} D'
            new_data, _, _ = get_historical(symbol, exchange, currency, duration=duration, kind=kind)

            if new_data:
                new_data_df = util.df(new_data)
                new_data_df['date'] = pd.to_datetime(new_data_df['date']).dt.date

                # New rows come last, so keep='last' prefers the fresh download.
                updated_data_df = pd.concat([data_df, new_data_df], ignore_index=True)
                updated_data_df = updated_data_df.drop_duplicates(subset='date', keep='last')
            else:
                updated_data_df = data_df
        else:
            updated_data_df = data_df

        updated_data_df = updated_data_df.sort_values(by='date').reset_index(drop=True)
        # Write to a temporary file first, then swap it in. os.replace
        # overwrites an existing destination on every platform, whereas
        # os.rename raises on Windows when the target already exists.
        updated_data_df.to_csv(temp_file_path, index=False)
        os.replace(temp_file_path, file_path)

    except pd.errors.EmptyDataError:
        print(f"Error: CSV file is empty. Skipping {file_name}")
        continue
    except Exception as e:
        print(f"An unexpected error occurred while processing {file_name}: {e}")
        traceback.print_exc()
        # Do not leave a half-written temporary file behind.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        continue

In [None]:
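# # Update historical series OLD (legacy version, superseded by the "Update historical series" cell above; kept commented out for reference only)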
# file_list = os.listdir(data_path)

# for file_name in tqdm(file_list, total=len(file_list), desc=f"Updating series"):
#     try:
#         if file_name.endswith('.csv'):
#             try:
#                 symbol, exchange, currency = file_name.replace('.csv', '').split('-')
#                 file_keys.add((symbol, currency))
#             except ValueError:
#                 continue
        
#         file_path = os.path.join(data_path, file_name)
#         data_df = pd.read_csv(file_path)
#         data_df['date'] = pd.to_datetime(data_df['date'])
#         last_date = data_df['date'].max()
#         time_missing = (datetime.now() - last_date).days
#         if time_missing > 364:
#             time_missing = math.ceil(time_missing / 364)
#             duration = f'{time_missing} Y'
#         else:
#             duration = f'{time_missing} D'
        
#         if time_missing:
#             new_data,_,_ = get_historical(symbol, exchange, currency, duration=duration, kind=kind)
#             if new_data:
#                 new_data_df = util.df(new_data)
#                 new_data_df['date'] = pd.to_datetime(new_data_df['date'])
#                 updated_data_df = pd.concat([new_data_df, data_df]).drop_duplicates(subset='date').sort_values(by='date').reset_index(drop=True)
#                 updated_data_df.to_csv(file_path, index=False)
#     except Exception as e:
#         print(f"Error processing {file_name}: {e}")
#         continue

In [None]:
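# # Define verified files (legacy version, superseded by the verification cell near the top of the notebook; kept commented out for reference only)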
# try:
#     with open(verified_path, 'r') as f:
#         lines = f.readlines()
#         verified_files = [line.strip() for line in lines]
#     file_list = os.listdir(data_path)

# except FileNotFoundError:
#     util.startLoop()
#     ib = IB()
#     ib.connect('127.0.0.1', 7497, clientId=1)

#     file_list = os.listdir(data_path)
#     contracts_df = pd.read_csv('data/contract_elaborated.csv')

#     verified_files = []
#     for file_name in tqdm(file_list, total=len(file_list)):
#         symbol, exchange, currency = file_name.replace('.csv', '').split('-')
#         try:
#             contract_details = ib.reqContractDetails(Stock(symbol, exchange, currency))
#             if not contract_details:
#                 continue
#             id = contract_details[0].secIdList[0].value

#             if contracts_df[contracts_df['symbol'] == symbol]['isin'].iloc[0] == id:
#                 instrument_name = contracts_df[contracts_df['symbol'] == symbol]['longName'].iloc[0]
#                 instrument_name = instrument_name.replace('-', '').replace('+', '')
#                 for word in instrument_name.split():
#                     if re.fullmatch(r'\d+X', word):
#                         if int(word[:-1]) > 1:
#                             continue
#                         if word.startswith(('LV', 'LEV')):
#                             print(f'    {instrument_name}')
                            
#                 verified_files.append(file_name.split('-')[0])
#         except Exception as e:
#             # if e.args and len(e.args) > 0 and e.args[0] != 'open orders request timed out':
#             print(e)

#     with open(verified_path, 'w') as f:
#         for item in verified_files:
#             f.write(str(item) + '\n')

#     ib.disconnect()