In [None]:
import os
from ib_async import *
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
from time import sleep
import math
import re


# Notebook-wide pandas display settings: never truncate cell contents or hide
# columns, but cap the number of rows rendered per frame.
display_options = {
    'display.max_colwidth': None,
    'display.max_columns': None,
    'display.max_rows': 50,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Which series family this run works on. Switch by (un)commenting one line.
kind = 'midpoint'
# kind = 'trades'
# kind = 'indices'

# Storage root for every supported kind.
roots_by_kind = {
    'midpoint': 'data/daily-midpoint/',
    'trades': 'data/daily-trades/',
    'indices': 'data/indices/',
}

# Fail here, with a readable message, instead of leaving `root` undefined and
# hitting a NameError a few lines further down when `kind` is mistyped.
if kind not in roots_by_kind:
    raise ValueError(f'Unknown kind {kind!r}; expected one of {sorted(roots_by_kind)}')
root = roots_by_kind[kind]

data_path = root + 'series/'                 # one CSV per instrument
verified_path = root + 'verified_files.txt'  # list of verified symbols, one per line

In [None]:
# Open the IBKR API session that the cells below use through the global `ib`.
# NOTE(review): 7497 is presumably the local TWS paper-trading port - confirm
# against the gateway/TWS configuration before running against a live account.
IB_HOST, IB_PORT, IB_CLIENT_ID = '127.0.0.1', 7497, 1

# ib_async helper; per its documentation it lets blocking ib calls run inside
# the notebook's already-running event loop.
util.startLoop()

ib = IB()
ib.connect(IB_HOST, IB_PORT, clientId=IB_CLIENT_ID)

In [None]:
def get_historical(symbol, exchange, currency, duration='40 Y', kind=None):
    """Request daily bars for one stock contract from the global `ib` session.

    Args:
        symbol: Ticker passed to `Stock`.
        exchange: Exchange passed to `Stock` (e.g. 'SMART' or a specific venue).
        currency: Currency passed to `Stock`.
        duration: Look-back window handed to `reqHistoricalData` as `durationStr`.
        kind: 'midpoint' requests MIDPOINT bars; 'trades' and 'indices' both
            request TRADES bars.

    Returns:
        Tuple `(data, length, exchange)`. `length` is `len(data)`, except that
        one is subtracted for a non-empty SMART result, so that a caller ranking
        results by `length` (as the missing-series cell does) prefers a specific
        exchange over SMART when both return the same number of bars.

    Raises:
        ValueError: if `kind` is not one of the supported values. Previously an
            unsupported kind (including the default None) fell through both
            branches and failed with an UnboundLocalError on `data`.
    """
    what_to_show_by_kind = {
        'midpoint': 'MIDPOINT',
        'trades': 'TRADES',
        'indices': 'TRADES',
    }
    # Validate before building the contract so no request is attempted for a bad kind.
    if kind not in what_to_show_by_kind:
        raise ValueError(
            f'Unsupported kind {kind!r}; expected one of {sorted(what_to_show_by_kind)}'
        )

    contract = Stock(symbol, exchange, currency)
    data = ib.reqHistoricalData(
        contract,
        endDateTime='',
        durationStr=duration,
        barSizeSetting='1 day',
        whatToShow=what_to_show_by_kind[kind],
        useRTH=True,
    )

    length = len(data)
    if data and exchange == 'SMART':
        length -= 1  # tie-break penalty, see Returns
    return data, length, exchange

def save_data(data_path, data, symbol, exchange, currency):
    """Write a bar series to `<data_path>/<symbol>-<exchange>-<currency>.csv`.

    Does nothing when `data` is empty. Otherwise the bars are converted with
    `util.df`, the `date` column is parsed to datetimes, rows are sorted by
    date, and the frame is written without its index.

    Args:
        data_path: Target directory, with or without a trailing separator. It
            is created if it does not exist yet.
        data: Bar collection accepted by `util.df`.
        symbol, exchange, currency: Components of the output file name.
    """
    if not data:
        return

    bars_df = util.df(data)
    bars_df['date'] = pd.to_datetime(bars_df['date'])
    bars_df = bars_df.sort_values(by='date').reset_index(drop=True)

    # An empty data_path means "current directory"; makedirs('') would raise.
    if data_path:
        os.makedirs(data_path, exist_ok=True)
    # os.path.join instead of string concatenation, so a directory given without
    # a trailing slash no longer yields a mangled sibling file name.
    target_file = os.path.join(data_path, f'{symbol}-{exchange}-{currency}.csv')
    bars_df.to_csv(target_file, index=False)

In [None]:
# Get missing historical series
#
# For each look-back window (longest first), every contract that has no CSV in
# `data_path` yet is requested on each of its candidate exchanges plus SMART;
# the candidate returning the most bars is saved. Symbols for which every
# candidate comes back empty are retried on the next, shorter window.
if kind == 'indices':
    raise ValueError('Incorrect kind for this. Needs to be trades or midpoint')

years = ['40 Y', '20 Y', '10 Y', '5 Y', '3 Y', '2 Y', '1 Y']
# years = ['1 Y']

# The contract list does not change between passes, so load and normalise it once.
contracts_df = pd.read_csv('data/contract_elaborated.csv')
# Keep only the text inside the first pair of parentheses; rows without one become ''.
contracts_df['search_exchange'] = contracts_df['search_exchange'].str.extract(r'\((.*?)\)').fillna('')

for duration in years:
    # Re-scan on every pass so symbols saved by an earlier (longer) window are skipped.
    file_list = [name.split('-')[0] for name in os.listdir(data_path)]
    missing_symbols = contracts_df[~contracts_df['symbol'].isin(file_list)].copy()

    count = 0
    for _, row in tqdm(missing_symbols.iterrows(), total=len(missing_symbols), desc=f"Getting {duration} series"):
        symbol = row['symbol']
        currency = row['currency']

        # Candidate exchanges in priority order. Empty/NaN values are dropped, and
        # each exchange is queried once only: this also avoids asking for SMART
        # twice when the contract's own exchange already is SMART (IBKR documents
        # identical historical requests in quick succession as a pacing violation).
        exchanges = []
        for candidate in (row['search_exchange'], row['exchange'], row['primaryExchange'], 'SMART'):
            if isinstance(candidate, str) and candidate and candidate not in exchanges:
                exchanges.append(candidate)

        results = [
            get_historical(symbol, candidate, currency, duration=duration, kind=kind)
            for candidate in exchanges
        ]
        # Longest series first; the sort is stable, so earlier candidates win ties.
        results = sorted(results, key=lambda x: x[1], reverse=True)
        best_data, best_length, best_exchange = results[0]
        if best_length:
            save_data(data_path, best_data, symbol, best_exchange, currency)
            count += 1

    print(f'{duration}: {count} scraped')

In [None]:
# # Check scraping differences
# def get_csv_lengths(directory):
#     """Returns a dictionary mapping (symbol, exchange, currency) to their row counts."""
#     csv_lengths = {}
#     for file in os.listdir(directory):
#         if file.endswith(".csv"):
#             file_path = os.path.join(directory, file)
#             try:
#                 df = pd.read_csv(file_path)
#                 file_key = os.path.splitext(file)[0]  # Remove .csv extension
#                 parts = file_key.split('-')
#                 if len(parts) == 3:
#                     symbol, exchange, currency = parts
#                     csv_lengths[(symbol, exchange, currency)] = len(df)
#             except Exception as e:
#                 print(f"Error reading {file_path}: {e}")
#                 csv_lengths[(symbol, exchange, currency)] = None
#     return csv_lengths

# def main(dir1, dir2, dir3):
#     """Generates a DataFrame with symbol, exchange, currency, and row counts from three directories."""
#     lengths1 = get_csv_lengths(dir1)
#     lengths2 = get_csv_lengths(dir2)
#     lengths3 = get_csv_lengths(dir3)
    
#     # Collect all unique (symbol, exchange, currency) combinations
#     all_keys = set(lengths1.keys()) | set(lengths2.keys()) | set(lengths3.keys())
    
#     data = []
#     for key in sorted(all_keys):
#         symbol, exchange, currency = key
#         data.append([symbol, exchange, currency, lengths1.get(key, 'N/A'), lengths2.get(key, 'N/A'), lengths3.get(key, 'N/A')])
    
#     return pd.DataFrame(data, columns=["symbol", "exchange", "currency", dir1, dir2, dir3])

# # Example usage
# dir1 = "data/indices/series/"
# dir2 = "data/daily-midpoint/series/"
# dir3 = "data/daily-trades/series/"
# df = main(dir1, dir2, dir3)
# # df[df.duplicated(subset='symbol', keep=False)]

In [None]:
# # Get indices
# '''
# SPY - SnP 500 -- US centric
# VTI - Vanguard Total Stock Market -- US centric
# VEU - Vanguard All-World Ex-US -- Global
# VXUS - Vanguard Total International -- Global
# BND - Vanguard Total Bond -- US
# BNDX - Vanguard Total International Bond -- Global
# '''
# indices = [Stock('SPY', 'SMART', 'USD'), Stock('VTI', 'SMART', 'USD'), Stock('VEU', 'SMART', 'USD'), Stock('VXUS', 'SMART', 'USD'), Stock('BND', 'SMART', 'USD'), Stock('BNDX', 'SMART', 'USD')]
# index_path = 'data/indices/series/'
# for contract in tqdm(indices, total=len(indices), desc=f"Getting index series"):
#     data,_,_ = get_historical(data, contract.symbol, contract.exchange, contract.currency, kind=kind)
#     save_data(index_path, data, contract.symbol, contract.exchange, contract.currency)

In [None]:
# Update historical series
#
# For every saved CSV, request the bars since its latest stored date and merge
# them into the file. Freshly downloaded rows win over stored rows on the same date.

# Only series files: stray entries such as `.ipynb_checkpoints` or `.DS_Store`
# would otherwise break the three-way file-name unpack below.
file_list = [name for name in os.listdir(data_path) if name.endswith('.csv')]

for file_name in tqdm(file_list, total=len(file_list), desc=f"Updating {data_path}"):
    # <symbol>-<exchange>-<currency>.csv; split from the right so only the last
    # two dashes are treated as separators.
    name_parts = os.path.splitext(file_name)[0].rsplit('-', 2)
    if len(name_parts) != 3:
        continue
    symbol, exchange, currency = name_parts

    file_path = os.path.join(data_path, file_name)
    data_df = pd.read_csv(file_path)
    data_df['date'] = pd.to_datetime(data_df['date'])
    last_date = data_df['date'].max()
    if pd.isna(last_date):
        # No dated rows, so there is no date to compute a look-back window from.
        continue

    days_missing = (datetime.now() - last_date).days
    if days_missing <= 0:
        # Already up to date (or dated in the future): nothing to request.
        continue

    # Gaps longer than 364 days are requested in whole years, rounded up.
    if days_missing > 364:
        duration = f'{math.ceil(days_missing / 364)} Y'
    else:
        duration = f'{days_missing} D'

    new_data, _, _ = get_historical(symbol, exchange, currency, duration=duration, kind=kind)
    if not new_data:
        continue

    new_data_df = util.df(new_data)
    new_data_df['date'] = pd.to_datetime(new_data_df['date'])
    # New rows are listed first so drop_duplicates keeps them over stored rows.
    updated_data_df = (
        pd.concat([new_data_df, data_df])
        .drop_duplicates(subset='date')
        .sort_values(by='date')
        .reset_index(drop=True)
    )
    updated_data_df.to_csv(file_path, index=False)

In [None]:
# Load and prepare indices
def melt(data_df):
    """Reshape a bar frame so each date yields an 'open' row followed by a 'close' row.

    The `open` and `close` columns are unpivoted into a `kind` label column and
    a `value` column; every other column is carried along unchanged. The result
    is ordered by ascending `date` and, within a date, descending `kind` (so
    'open' precedes 'close'), with a fresh 0..n-1 index.
    """
    price_fields = ['open', 'close']
    carried_fields = [name for name in data_df.columns if name not in price_fields]
    long_df = pd.melt(
        data_df,
        id_vars=carried_fields,
        value_vars=price_fields,
        var_name='kind',
        value_name='value',
    )
    long_df = long_df.sort_values(by=['date', 'kind'], ascending=[True, False])
    return long_df.reset_index(drop=True)

# Load indices and merge them all into one df
index_dir = 'data/indices/series/'
# Only series files; other directory entries cannot be parsed by read_csv.
file_list = [name for name in os.listdir(index_dir) if name.endswith('.csv')]
if not file_list:
    # Without this guard the summary print below divides by zero.
    raise FileNotFoundError(f'No index series (*.csv) found in {index_dir}')

indices = {}
for file in file_list:
    # File names look like <symbol>-<exchange>-<currency>.csv; key by symbol.
    symbol = os.path.splitext(file)[0].split('-')[0]
    indices[symbol] = pd.read_csv(os.path.join(index_dir, file))

# Melt indices and calc pct_change.
# NOTE(review): the three settings below are not used in this cell and no gap
# filtering is applied here; they are kept because later cells may read them.
training_start_date = pd.to_datetime('2020-02-01')
month_ago = datetime.today() - timedelta(days=31)

day_gap = 6 # SET ACCEPTABLE DAY GAP

melted_indices, index_returns = [], {}
for symbol, df in tqdm(indices.items(), total=len(indices), desc=f'Melting and filtering {kind} indices'):
    # Parse dates before melting so melt() orders rows chronologically rather
    # than by raw string; assign() leaves the frame stored in `indices` untouched.
    df = melt(df.assign(date=pd.to_datetime(df['date'])))
    df['symbol'] = symbol
    # Row-to-row change over the interleaved open/close sequence produced by melt().
    df['pct_change'] = df['value'].pct_change()
    index_returns[symbol] = df['pct_change'].mean()
    melted_indices.append(df)
print(f'Loaded {len(melted_indices)} out of {len(file_list)} series ({round(len(melted_indices)/len(file_list)*100, 4)}%)')

# Concatenate and pivot data: one row per (date, kind), one pct_change column per symbol
index_df = pd.concat(melted_indices, ignore_index=True)
index_df = index_df.pivot(index=['date', 'kind'], columns='symbol', values='pct_change')
index_df = index_df.sort_values(by=['date', 'kind'], ascending=[True, False]).reset_index()#.dropna()

In [None]:
# Define verified files
#
# `verified_files` holds the symbols whose saved series were confirmed against
# the contract list. It is read from `verified_path` when that file exists and
# is otherwise rebuilt through the IB API and written to `verified_path`.

# Only series files: other directory entries do not follow the
# <symbol>-<exchange>-<currency>.csv naming scheme.
file_list = [name for name in os.listdir(data_path) if name.endswith('.csv')]

try:
    with open(verified_path, 'r') as f:
        verified_files = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    verified_files = None  # no cached list yet: build it below

if verified_files is None:
    # Reuse the notebook's session when it is still connected. Opening a second
    # connection here would use clientId=1 again, the same id as the connect
    # cell near the top of the notebook.
    if 'ib' not in globals() or not ib.isConnected():
        util.startLoop()
        ib = IB()
        ib.connect('127.0.0.1', 7497, clientId=1)

    try:
        contracts_df = pd.read_csv('data/contract_elaborated.csv')

        verified_files = []
        for file_name in tqdm(file_list, total=len(file_list)):
            try:
                symbol, exchange, currency = os.path.splitext(file_name)[0].rsplit('-', 2)

                contract_details = ib.reqContractDetails(Stock(symbol, exchange, currency))
                if not contract_details:
                    continue
                sec_id = contract_details[0].secIdList[0].value

                # Verified only when the first security id reported by IB equals
                # the `isin` recorded for this symbol in the contract list.
                contract_row = contracts_df[contracts_df['symbol'] == symbol].iloc[0]
                if contract_row['isin'] != sec_id:
                    continue

                instrument_name = contract_row['longName'].replace('-', '').replace('+', '')
                words = instrument_name.split()

                # NOTE(review): previously the `continue` for an "<n>X" word with
                # n > 1 only advanced the inner word loop, so such instruments were
                # still verified. They are now excluded, which is the presumed
                # intent - confirm.
                if any(re.fullmatch(r'\d+X', word) and int(word[:-1]) > 1 for word in words):
                    continue
                # This check used to be nested under the "<n>X" match and could
                # never fire. It only reports the name; the instrument is kept.
                if any(word.startswith(('LV', 'LEV')) for word in words):
                    print(f'    {instrument_name}')

                verified_files.append(symbol)
            except Exception as e:
                # Best effort: report the file that failed and carry on with the rest.
                print(f'{file_name}: {e}')

        with open(verified_path, 'w') as f:
            f.writelines(f'{item}\n' for item in verified_files)
    finally:
        # Always release the API session, even if the rebuild fails midway.
        ib.disconnect()