In [102]:
from IPython.display import clear_output
import pandas as pd
import pprint
import PyPDF2 as pdf
import random
import re
import requests
import textract
import uuid
import time

# Config

In [90]:
# Folder that holds the raw input files and receives the generated CSVs;
# every generated file name starts with `prefix`.
data_directory, prefix = "data/", "out_"
# When True, get_name / get_exchange_online return canned values instead of calling websites.
mock = False

# Helper Functions

In [4]:
def extract_from_page(url, pattern, timeout=30):
    """Download ``url`` and return the first capture group of ``pattern`` found in it.

    Args:
        url: Address of the page to fetch with an HTTP GET.
        pattern: Regular expression with at least one capture group; it is
            applied to the response body with ``re.search``.
        timeout: Seconds passed to ``requests.get`` as ``timeout``. Without
            it a single unresponsive host would block the notebook forever.

    Returns:
        The text of capture group 1, or ``None`` when the status code is
        above 299 or the pattern does not match.
    """
    headers = {
        'accept': '*/*',
        'accept-language': 'en,de-DE;q=0.9,de;q=0.8,en-US;q=0.7',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Brave Chrome/83.0.4103.116 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code > 299:
        print(f'Error extracting from {url}: Status {response.status_code} Reason {response.reason}')
        return None
    match = re.search(pattern, response.text)
    return match.group(1) if match else None

In [5]:
def get_name(ticker):
    """Look up the company name for ``ticker`` on several finance websites.

    The sources are tried in order (barchart, yahoo, marketwatch) and the
    first non-empty result wins. When the global ``mock`` flag is set, no
    request is made and a fixed placeholder name is returned.

    Args:
        ticker: Ticker symbol that is inserted into the page URLs.

    Returns:
        The extracted company name, or ``None`` (after printing
        ``Invalid ticker: ...``) when no source yields a name.
    """
    if mock:
        return "Mocked Company Name"

    # Try barchart
    barchart_result = extract_from_page(
        f'https://www.barchart.com/stocks/quotes/{ticker}/overview',
        r'<span class="symbol">([^<]+)</span>')
    if barchart_result:
        return barchart_result

    # Try yahoo (raw string: the pattern contains regex backslash escapes)
    yahoo_result = extract_from_page(
        f'https://finance.yahoo.com/quote/{ticker}',
        r'<h1 class="D\(ib\) Fz\(18px\)">([^<]+) \([A-Z]+\)\</h1>')
    # The yahoo result itself must be checked here; testing the (already
    # empty) barchart result would discard every successful yahoo lookup.
    if yahoo_result:
        return yahoo_result

    # if still no result try marketwatch (takes around 10s)
    marketwatch_result = extract_from_page(
        f'https://www.marketwatch.com/investing/stock/{ticker}',
        r'.*<h1 class="company__name">([^<]+)</h1>.*')
    if marketwatch_result:
        return marketwatch_result

    print(f'Invalid ticker: {ticker}')
    return None


In [91]:
# Spot-check the barchart pattern used by get_name against a known ticker.
extract_from_page(f'https://www.barchart.com/stocks/quotes/TSLA/overview', 
                                        '<span class="symbol">([^<]+)</span>')

'Tesla Inc'

In [92]:
# Spot-check the yahoo pattern used by get_name against a known ticker.
# Raw string: the regex contains backslash escapes that are not valid Python string escapes.
extract_from_page('https://finance.yahoo.com/quote/TSLA',
                  r'<h1 class="D\(ib\) Fz\(18px\)">([^<]+) \([A-Z]+\)\</h1>')

'Tesla, Inc.'

In [93]:
# Spot-check the marketwatch pattern used by get_name against a known ticker.
extract_from_page(f'https://www.marketwatch.com/investing/stock/TSLA',
                                           '.*<h1 class="company__name">([^<]+)</h1>.*')

'Tesla Inc.'

In [94]:
def get_exchange_online(ticker):
    """Look up the exchange a ticker trades on by scraping barchart.

    When the global ``mock`` flag is set, no request is made and a fixed
    placeholder is returned.

    Args:
        ticker: Ticker symbol that is inserted into the barchart URL.

    Returns:
        The exchange name. Short codes listed in ``exchange_alias`` are
        expanded to their long form. ``None`` (after printing
        ``No exchange found for: ...``) when nothing could be extracted.
    """
    if mock:
        return "Mock Exchange"

    # Maps the short code shown on the website to the long name used in the input files.
    exchange_alias = {
        'NYSE': 'New York Stock Exchange Inc.'
    }
    # Try barchart (raw string: the pattern contains regex backslash escapes)
    exchange = extract_from_page(
        f'https://www.barchart.com/stocks/quotes/{ticker}/overview',
        r'<span class="symbol-trade-time">\[([^\]]+)\]<\/span>')
    if not exchange:
        print(f'No exchange found for: {ticker}')
        return None
    # Fall back to the scraped value when there is no alias for it.
    return exchange_alias.get(exchange, exchange)

In [95]:
# Ticker whose scraped exchange code is expanded through exchange_alias.
get_exchange_online('KO')

'New York Stock Exchange Inc.'

In [96]:
# Ticker whose scraped exchange has no alias and is returned unchanged.
get_exchange_online('TSLA')

'NASDAQ'

In [103]:
def track_progress(function, arguments_list):
    """Call ``function`` once per argument and print the percentage done after each call.

    Args:
        function: Callable taking a single argument.
        arguments_list: Any iterable of arguments (list, set, generator, ...).

    Returns:
        list: The return values of ``function`` in iteration order.
    """
    # Materialise first: the percentage needs len(), which generators and
    # other one-shot iterables do not support.
    arguments = list(arguments_list)
    n = len(arguments)
    return_values = []
    for i, a in enumerate(arguments):
        return_values.append(function(a))
        # Replace the previous progress line instead of appending a new one.
        clear_output(wait=True)
        print('%.2f' % (100*(i+1)/n), '% done', end='\r', flush=True)
    return return_values

In [105]:
# Demo of the progress display: 100 dummy tasks of 10 ms each.
res = track_progress(lambda _: time.sleep(0.01), list(range(100)))

100.00 % done

# Type Tables

In [106]:
# Mnemonics of the financial instrument types; the list position becomes the fi_type_code.
fi_types = [
    'stock',
    'option',
    'swap',
    'future',
    'forward',
    'equity-linked security',
]

In [107]:
# One row per instrument type; the default integer index serves as the type code.
fi_types_df = pd.DataFrame({'mnemonic': fi_types})
fi_types_df.index.name = 'fi_type_code'
fi_types_df

Unnamed: 0_level_0,mnemonic
fi_type_code,Unnamed: 1_level_1
0,stock
1,option
2,swap
3,future
4,forward
5,equity-linked security


In [108]:
# Write without a header row; the index (fi_type_code) is kept as the first column.
fi_types_df.to_csv(f'{data_directory}{prefix}fi_type.csv', header=False)

In [109]:
# Mnemonics of the option styles; the list position becomes the option_style_code.
option_styles = [
    'european',
    'american',
    'asian',
]

In [110]:
# One row per option style; the default integer index serves as the style code.
option_styles_df = pd.DataFrame({'mnemonic': option_styles})
option_styles_df.index.name = 'option_style_code'
option_styles_df

Unnamed: 0_level_0,mnemonic
option_style_code,Unnamed: 1_level_1
0,european
1,american
2,asian


In [111]:
# Write without a header row; the index (option_style_code) is kept as the first column.
option_styles_df.to_csv(f'{data_directory}{prefix}option_style.csv', header=False)

# Trader

In [112]:
# Raw trader list: a tab-separated text file.
in_trader_df = pd.read_csv(f'{data_directory}15153231.txt', sep='\t')
in_trader_df.head()

Unnamed: 0,FIRST,LAST,NICKNAME,EMP-NO,2019 BONUS,2019 BASE,2019 TOTAL COMP,2019 PERFORMANCE,DECISION
0,Madyson,Holden,Admiral,4234,248478,78490,326968,4,keep
1,Rowan,Owens,Amazon,2344,227649,3895,231544,7,keep
2,Bridger,Mcclain,Bean,2122,53720,84662,138382,5,keep
3,Maren,Adkins,Beast,3221,242405,90037,332442,2,FIRE
4,Amira,Colon,Big Nasty,2366,46265,77211,123476,6,keep


Pad employee id with zeroes to allow for more than 9999 employees (high ids indicate that an overflow is close):

In [113]:
# Employee number as a string, left-padded with zeroes to 8 characters.
in_trader_df['emp_id'] = list(map(lambda emp_no: str(emp_no).zfill(8), in_trader_df['EMP-NO']))

Rename column to lowercase naming convention

In [114]:
# Adds a lowercase-named copy; the original NICKNAME column stays in the frame.
in_trader_df['nickname'] = in_trader_df['NICKNAME']

Identify duplicate values, keep first occurrence

In [115]:
# Flag every repeated emp_id except its first occurrence, then show the flagged rows.
in_trader_df['duplicated'] = in_trader_df.duplicated(subset='emp_id', keep='first')
in_trader_df[in_trader_df['duplicated']]

Unnamed: 0,FIRST,LAST,NICKNAME,EMP-NO,2019 BONUS,2019 BASE,2019 TOTAL COMP,2019 PERFORMANCE,DECISION,emp_id,nickname,duplicated
11,Sanai,Mccann,Chuckles,4234,65174,40579,105753,6,keep,4234,Chuckles,True


Remove duplicates and create table ready for import

In [116]:
# Keep only the non-duplicated rows and the two columns that are exported.
trader_df = in_trader_df.loc[~in_trader_df['duplicated'], ['emp_id', 'nickname']].copy()
trader_df.head()

Unnamed: 0,emp_id,nickname
0,4234,Admiral
1,2344,Amazon
2,2122,Bean
3,3221,Beast
4,2366,Big Nasty


In [117]:
# Export emp_id and nickname only, without header row or index column.
trader_df.to_csv(f'{data_directory}{prefix}trader.csv', header=False, index=False)

# Counterparty

In [118]:
# Raw counterparty list.
in_cp_df = pd.read_csv(f'{data_directory}COUNTERPARTY.csv')
in_cp_df.head()

Unnamed: 0,NAME,ASSETS,PARENT
0,China Industrial and Commercial Bank of China,4009.26,-1
1,China China Construction Bank Corporation,3400.25,-1
2,China Agricultural Bank of China,3235.65,-1
3,China Bank of China,2991.9,-1
4,Japan Mitsubishi UFJ Financial Group,2784.74,-1


The counterparties in this file are all external as opposed to internal counterparties

In [119]:
# Every counterparty from this file is marked as external.
in_cp_df['cp_type'] = ['external'] * len(in_cp_df)

Generate GUID for counterparties:

In [120]:
# One freshly generated random UUID string per counterparty row.
in_cp_df['guid'] = [str(uuid.uuid4()) for _row in in_cp_df.index]

In [121]:
# Adds a lowercase-named copy; the original NAME column stays in the frame.
in_cp_df['name'] = in_cp_df['NAME']

Inspect for duplicates:

In [122]:
# Flag every repeated name except its first occurrence, then show the flagged rows.
in_cp_df['duplicated'] = in_cp_df.duplicated(subset='name', keep='first')
in_cp_df[in_cp_df['duplicated']]

Unnamed: 0,NAME,ASSETS,PARENT,cp_type,guid,name,duplicated


In [123]:
# Select the exported columns in output order. No rows are dropped here.
cp_df = in_cp_df[['guid', 'cp_type', 'name']]
cp_df.head()

Unnamed: 0,guid,cp_type,name
0,9671a05f-9011-4fe6-be85-d61ce293c8a7,external,China Industrial and Commercial Bank of China
1,383d5009-3e0f-4973-ae6c-2d04f3f6c6cd,external,China China Construction Bank Corporation
2,9b9a81d2-f9b1-440d-8a7c-a166c34f337a,external,China Agricultural Bank of China
3,f1bda210-8626-4d9d-8217-3ca41de36f38,external,China Bank of China
4,e48611b9-0577-4edb-840b-4ee660f5d882,external,Japan Mitsubishi UFJ Financial Group


In [124]:
# Export guid, cp_type and name without header row or index column.
cp_df.to_csv(f'{data_directory}{prefix}counterparty.csv', header=False, index=False)

# Stock

Collect stocks from all files that are to be loaded

## FIRSTPRICES.pdf

**MANUAL STEP**: Copy content of FIRSTPRICES.pdf to text editor and save as FIRSTPRICES.txt

In [125]:
# Space-separated text copy of FIRSTPRICES.pdf (created in the manual step).
in_trades_df = pd.read_csv(f'{data_directory}FIRSTPRICES.txt', sep=' ')
in_trades_df.head()

Unnamed: 0,TICKER,ACTION,SHARES,LEVERAGE,PRICE
0,META,B,20001,1.0,912.11
1,DHY,B,401,2.1,2.14
2,FAX,B,2440,3.3,3.28
3,DSU,S,99393,4.0,92.11
4,XOM,B,266,1.0,213.22


Check for missing values:

In [126]:
# The non-null count per column reveals missing values.
in_trades_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TICKER    15 non-null     object 
 1   ACTION    15 non-null     object 
 2   SHARES    15 non-null     int64  
 3   LEVERAGE  13 non-null     float64
 4   PRICE     15 non-null     float64
dtypes: float64(2), int64(1), object(2)
memory usage: 728.0+ bytes


Two missing values in LEVERAGE. Replace with 1:

In [127]:
# Missing leverage is replaced with 1.
in_trades_df['LEVERAGE'] = in_trades_df['LEVERAGE'].fillna(1)

In [128]:
# Unique tickers referenced by the trades file.
# drop_duplicates() returns a new Series; the in-place variant would operate on a
# column taken from in_trades_df, which is fragile and re-run dependent.
tickers_from_trades = in_trades_df['TICKER'].drop_duplicates()
len(tickers_from_trades)

15

## US-SHARES-ISN.csv

In [129]:
# Raw list of US shares with name, ticker, SEDOL, ISIN and exchange.
in_stocks_df = pd.read_csv(f'{data_directory}US-SHARES-ISN.csv')
in_stocks_df.head()

Unnamed: 0,Name,Ticker,SEDOL,ISIN,Exchange
0,1-800-FLOWERS.COM INC. CL A,FLWS,2444123,US68243Q1067,NASDAQ
1,1ST SOURCE CORP.,SRCE,2341848,US3369011032,NASDAQ
2,2U INC,TWOU,BKWBZZ0,US90214J1016,NASDAQ
3,3D SYSTEMS CORP,DDD,2889768,US88554D2053,New York Stock Exchange Inc.
4,3M CO,MMM,2595708,US88579Y1010,New York Stock Exchange Inc.


In [130]:
# Unique tickers listed in the shares file.
# drop_duplicates() returns a new Series; the in-place variant would operate on a
# column taken from in_stocks_df, which is fragile and re-run dependent.
tickers_from_shares = in_stocks_df['Ticker'].drop_duplicates()
len(tickers_from_shares)

770

## AllowedProducts.xlsx

In [131]:
# Raw list of allowed products from the Excel workbook.
in_allowed_df = pd.read_excel(f'{data_directory}AllowedProducts.xlsx')
in_allowed_df.head()

Unnamed: 0,NAME,TICKER,SEDOL,ISIN,EXCHANGE
0,AMGEN INC,AMGN,2023607,US0311621009,NASDAQ
1,COCA-COLA,KO,2206657,US1912161007,New York Stock Exchange Inc.
2,DOW CHEMICAL,DOW,2278719,US2605431038,New York Stock Exchange Inc.
3,DUKE ENERGY CORP,DUK,B7VD3F2,US26441C2044,New York Stock Exchange Inc.
4,JOHNSON & JOHNSON,JNJ,2475833,US4781601046,New York Stock Exchange Inc.


In [132]:
# Not de-duplicated here; the set union in the stock-table step collapses repeats.
tickers_from_allowed = in_allowed_df['TICKER']

## Build Stock table

based on tickers referenced in data

In [133]:
# Union of the tickers seen in any of the three input files (a set, so duplicates collapse).
all_tickers = set(tickers_from_trades).union(tickers_from_shares).union(tickers_from_allowed)
len(all_tickers)

785

Get name for tickers (this step can take a while. restart kernel and set global variable mock = True to use mock connection)  
Invalid tickers will have value None

In [135]:
# One name lookup per ticker (get_name returns None for tickers it cannot resolve).
ticker_names = track_progress(get_name, all_tickers)
# Pairing relies on the unmodified set being iterated in the same order both times.
tickers = dict(zip(all_tickers, ticker_names))

100.00 % done

Identify invalid tickers:

In [148]:
# List every ticker for which no company name was found.
for bad_ticker in (ticker for ticker, name in tickers.items() if not name):
    print(f'Invalid ticker: {bad_ticker}')

Invalid ticker: YHOO
Invalid ticker: FCEA


Remove invalid tickers:

In [149]:
# Keep only the ticker -> name pairs with a non-empty name.
cleaned_tickers = { t:n for t,n in tickers.items() if n }

Create GUIDs for tickers:

In [150]:
# ticker -> [fi_id, name]; the fi_id is a freshly generated random UUID string.
tickers_with_id = {t:[str(uuid.uuid4()), n] for t,n in cleaned_tickers.items()}

Create table:

In [151]:
# orient='index' turns each dict key (ticker) into a row label and its list into the columns.
stocks_df = pd.DataFrame.from_dict(tickers_with_id, columns=['fi_id', 'name'], orient='index')
# Also expose the ticker as a regular column, because the CSV export drops the index.
stocks_df['ticker'] = stocks_df.index
stocks_df.head()

Unnamed: 0,fi_id,name,ticker
LLY,302e7ebe-d337-49fb-92b8-cef71bf18f2b,Eli Lilly and Company,LLY
TRIP,cbdfb4ac-59c6-4911-af70-598c20202971,Tripadvisor Inc,TRIP
LIND,cb5dd952-a33f-4860-87e7-22f4ce7c078f,Lindblad Expd Holdings,LIND
SGA,cdcfcf87-8a94-4735-9cb5-387c8de262be,Saga Communications,SGA
TWOU,1b58b775-040a-4d62-8a20-7a4a21946306,2U Inc,TWOU


In [152]:
# Export fi_id, name and ticker without header row or index column.
stocks_df.to_csv(f'{data_directory}{prefix}stock.csv', header=False, index=False)

# Option

Data is generated to demonstrate how that table works. This data is completely made up.

Seed random function to get reproducible results:

In [153]:
# Seeds Python's `random` module only; the uuid.uuid4() ids still differ between runs.
random.seed(666)

In [154]:
def generate_option():
    """Build one made-up option row.

    The underlying stock, option type, style, strike price and maturity are
    drawn from the ``random`` module, so they follow its current seed; the
    id is a fresh random UUID.

    Returns:
        list: ``[fi_id, underlying_stock, option_type, style_code,
        strike_price, maturity]`` where ``underlying_stock`` is an ``fi_id``
        taken from ``stocks_df`` and ``maturity`` is formatted as
        ``YYYY-MM-DD hh:mm:ss``.
    """
    fi_id = str(uuid.uuid4())
    # stocks_df is labelled by ticker strings, so the row must be picked by
    # position with .iat; plain [...] with an integer only works through the
    # positional fallback of Series indexing.
    underlying_stock = stocks_df['fi_id'].iat[random.randrange(len(stocks_df))]
    option_type = 'B' if random.randrange(2) else 'S'
    style_code = random.randrange(len(option_styles_df))
    strike_price = random.randrange(100, 200)
    # Month 5-12 and day 1-29; drawn in this order to keep the seeded sequence unchanged.
    month = random.randrange(5, 13)
    day = random.randrange(1, 30)
    # maturity format YYYY-MM-DD hh:mm:ss
    maturity = f'2022-{month:02d}-{day:02d} 18:00:00'
    return [fi_id, underlying_stock, option_type, style_code, strike_price, maturity]

generate some options and store in table:

In [155]:
# 50 made-up options; each list follows the column order given below.
options = [generate_option() for _ in range(50)]
options_df = pd.DataFrame(options, columns=['fi_id', 'underlying_stock', 'option_type', 'style_code', 'strike_price', 'maturity'])
options_df.head()

Unnamed: 0,fi_id,underlying_stock,option_type,style_code,strike_price,maturity
0,3ead2ab3-94ac-4897-961f-23eeab453704,862e5e6e-acc1-4f6f-ad45-819a8d5823ac,B,1,136,2022-05-26 18:00:00
1,87bf449f-9a29-412a-8ba9-d45b3dec10dc,8c1f5b12-a5df-4d91-9f07-3e58fe30a586,B,0,106,2022-07-13 18:00:00
2,2aa9c6cd-2988-4d35-805e-c7ba9884414a,e85887d4-a350-4673-9449-f003734919bb,B,2,131,2022-05-04 18:00:00
3,55e8a198-68a0-4a35-95cb-38d47d98f20b,5a0c7c2c-7ad7-48a4-b54b-33d6e83e71d1,S,1,187,2022-07-06 18:00:00
4,84383bec-b43b-4110-85eb-e3e47f995627,8650e3f8-9534-44e3-87b0-c80488b84727,B,1,189,2022-09-17 18:00:00


In [156]:
# Export all option columns without header row or index column.
options_df.to_csv(f'{data_directory}{prefix}option.csv', header=False, index=False)

# financial_instrument

Create financial instruments table. Current value is completely made up

Build table from stocks and options

In [157]:
def get_exchange(ticker):
    """Return the exchange for ``ticker``, preferring the local input files.

    Lookup order: ``in_stocks_df`` (US shares file), then ``in_allowed_df``
    (allowed products file), then the online lookup via
    ``get_exchange_online``.

    Args:
        ticker: Ticker symbol to look up.

    Returns:
        The exchange name of the first matching source, or ``None`` (after
        printing ``No exchange found for: ...``) when no source knows the ticker.
    """
    shares_result = in_stocks_df.loc[in_stocks_df['Ticker'] == ticker, 'Exchange']
    if len(shares_result):
        return shares_result.iat[0]

    allowed_result = in_allowed_df.loc[in_allowed_df['TICKER'] == ticker, 'EXCHANGE']
    if len(allowed_result):
        return allowed_result.iat[0]

    # Last resort: scrape the exchange from the web.
    online_result = get_exchange_online(ticker)
    if online_result:
        return online_result

    print(f'No exchange found for: {ticker}')
    return None
        

In [158]:
# Spot-check the exchange lookup with a known ticker.
get_exchange('TSLA')

'NASDAQ'

In [159]:
# ticker -> exchange (None when no source knows the ticker).
# The cells below read `ticker_exchange_map`, so the mapping must be bound to that name.
ticker_exchange_map = {t: get_exchange(t) for t in tickers.keys()}
# Former name, kept as an alias of the same dict.
stock_exchange_map = ticker_exchange_map

No exchange found for:

In [160]:
# List every ticker for which no exchange could be determined.
for missing_ticker in (ticker for ticker, exchange in ticker_exchange_map.items() if not exchange):
    print(missing_ticker)

Add stocks to financial instruments:

In [161]:
# Rows of [fi_id, type_code, current_value, currency_iso, exchange].
# Stocks get type code 0 and a random made-up value between 40 and 250 (inclusive).
financial_instruments = []
for stock_ticker, (stock_fi_id, _stock_name) in tickers_with_id.items():
    financial_instruments.append(
        [stock_fi_id, 0, random.randint(40, 250), 'USD', ticker_exchange_map[stock_ticker]]
    )
financial_instruments[:2]

[['302e7ebe-d337-49fb-92b8-cef71bf18f2b',
  0,
  208,
  'USD',
  'New York Stock Exchange Inc.'],
 ['cbdfb4ac-59c6-4911-af70-598c20202971', 0, 111, 'USD', 'NASDAQ']]

Add options to financial instruments:

In [162]:
# Append one row per generated option (type code 1) with a random made-up value.
for o in options:
    fi_id, underlying_stock, option_type, style_code, strike_price, maturity = o
    value = random.randint(40, 250)
    # set exchange to exchange of underlying stock
    underlying_ticker = stocks_df[stocks_df['fi_id'] == underlying_stock]['ticker'].iat[0]
    # use USD as ccy for all. It is deemed reasonable to do that as the list of stocks was labeled as US shares
    financial_instruments.append([fi_id, 1, value, 'USD', ticker_exchange_map[underlying_ticker]])
financial_instruments[-2:]

[['cb81f30c-d4fc-4323-906e-0d465ebabf37',
  1,
  209,
  'USD',
  'New York Stock Exchange Inc.'],
 ['42924330-04a2-485f-9fc1-d142b4ab8630',
  1,
  184,
  'USD',
  'New York Stock Exchange Inc.']]

In [163]:
# Stocks first, then options, in the order they were appended above.
fi_df = pd.DataFrame(financial_instruments, columns=['fi_id', 'type_code', 'current_value', 'currency_iso', 'exchange'])
fi_df.head()

Unnamed: 0,fi_id,type_code,current_value,currency_iso,exchange
0,302e7ebe-d337-49fb-92b8-cef71bf18f2b,0,208,USD,New York Stock Exchange Inc.
1,cbdfb4ac-59c6-4911-af70-598c20202971,0,111,USD,NASDAQ
2,cb5dd952-a33f-4860-87e7-22f4ce7c078f,0,44,USD,NASDAQ
3,cdcfcf87-8a94-4735-9cb5-387c8de262be,0,146,USD,Nyse Mkt Llc
4,1b58b775-040a-4d62-8a20-7a4a21946306,0,120,USD,NASDAQ


In [164]:
# Export all financial instrument columns without header row or index column.
fi_df.to_csv(f'{data_directory}{prefix}financial_instrument.csv', header=False, index=False)

# allowed_fi

Generate permissions for financial instruments and derivatives. Completely made up for demonstration purposes

In [165]:
# Made-up permissions: rows of [emp_id, fi_id, type_code].
allowed_fis = []
# NOTE(review): only the first two traders receive permissions — confirm the [:2] slice is intentional.
for emp_id in trader_df['emp_id'][:2]:
    # Number of draws for this trader (0-7); repeated picks are skipped, so fewer stocks may result.
    stocks_allowed = random.randrange(8)
    stocks_picked = []
    for _ in range(stocks_allowed):
        # pick random stock
        stock = stocks_df['fi_id'].iat[random.randrange(len(stocks_df))]
        if stock not in stocks_picked: # prevent duplicates
            stocks_picked.append(stock)
            allowed_fis.append([emp_id, stock, 0]) # allow trade of base symbol
            # Any type code except 0, i.e. one of the non-stock rows of fi_types_df.
            derivative_type = random.randrange(1, len(fi_types_df)) 
            allowed_fis.append([emp_id, stock, derivative_type]) # allow trade of one additional derivative

In [166]:
# Two rows per permitted stock: the stock itself (type_code 0) and one derivative type.
allowed_fi_df = pd.DataFrame(allowed_fis, columns=['emp_id', 'fi_id', 'type_code'])
allowed_fi_df.head()

Unnamed: 0,emp_id,fi_id,type_code
0,4234,2e8b0410-a561-4914-9185-9ef49ff57067,0
1,4234,2e8b0410-a561-4914-9185-9ef49ff57067,5
2,4234,2870e8d7-8449-47a1-8224-747cdc105b74,0
3,4234,2870e8d7-8449-47a1-8224-747cdc105b74,1
4,4234,b3c5a7a6-6572-4a24-b6bf-4b4760241254,0


In [167]:
# Export emp_id, fi_id and type_code without header row or index column.
allowed_fi_df.to_csv(f'{data_directory}{prefix}allowed_fi.csv', header=False, index=False)

# Trades

Add trades from FIRSTPRICES.txt and generate the missing data randomly

In [177]:
# Reminder of the raw trade columns that are unpacked in the loop below.
in_trades_df.head()

Unnamed: 0,TICKER,ACTION,SHARES,LEVERAGE,PRICE
0,META,B,20001,1.0,912.11
1,DHY,B,401,2.1,2.14
2,FAX,B,2440,3.3,3.28
3,DSU,S,99393,4.0,92.11
4,XOM,B,266,1.0,213.22


In [174]:
# Build one output row per raw trade. Values that the input file does not
# provide (trader, counterparty, trade date) are chosen at random.
trades = []
for i in range(len(in_trades_df)):
    row = in_trades_df.iloc[i]
    # Relies on the column order TICKER, ACTION, SHARES, LEVERAGE, PRICE.
    ticker, action, shares, leverage, price = row
    trades.append([
        str(uuid.uuid4())                                            #trade_id (generate new id)
        ,trader_df['emp_id'].iat[random.randrange(len(trader_df))] #emp_id (choose random)
        ,stocks_df[stocks_df['ticker'] == ticker]['fi_id'].iat[0]    #fi_id (lookup fi_id from ticker)
        ,cp_df['guid'].iat[random.randrange(len(cp_df))]             #counterparty_id (choose random from available)
        ,f'2022-{str(random.randrange(1, 4)).zfill(2)}-{str(random.randrange(1, 29)).zfill(2)} 14:30:00' #trade_dt (generate randomly) format YYYY-MM-DD hh:mm:ss
        ,action                                                      #action
        ,shares                                                      #quantity
        ,leverage                                                    #leverage
        ,price                                                       #price
        ,'USD'                                                       #currency_iso
        ,0                                                           #margin 
        ,0                                                           #fees
        ,"2022-04-14 16:00:00"                                       #insert_dt (will be auto generated by mysql)
        ,None                                                        #amended_dt (optional)
    ])

In [175]:
# Column names follow the order in which the values were appended in the loop above.
trades_df = pd.DataFrame(trades, columns=['trade_id', 'emp_id', 'fi_id', 'counterparty_id', 'trade_dt', 'action', 'quantity', 'leverage', 'price', 'currency_iso', 'margin', 'fees', 'insert_dt', 'amended_dt'])
trades_df.head()

Unnamed: 0,trade_id,emp_id,fi_id,counterparty_id,trade_dt,action,quantity,leverage,price,currency_iso,margin,fees,insert_dt,amended_dt
0,d15fb6b8-56d0-41bb-9536-a7ae400170f4,6203,91e798c3-7194-42dc-b594-9b62715d4fa6,9670d333-1ad5-4c42-b2bf-9dd4f62606ac,2022-02-15 14:30:00,B,20001,1.0,912.11,USD,0,0,2022-04-14 16:00:00,
1,89ea4947-f5d1-442b-a2a5-1aa9f25091e7,2122,8839e632-e62e-44d4-bfb8-8a5fec7b0e92,642afcee-e264-4005-834b-c8dbeb71b588,2022-02-22 14:30:00,B,401,2.1,2.14,USD,0,0,2022-04-14 16:00:00,
2,e732fcf3-b4bc-45ce-8afd-b7ad9c143d27,7534,38e31afa-2218-4e61-88f2-f9d9ae5f98ee,14305fb8-63d6-4133-b04b-f2bb18dde8d2,2022-02-15 14:30:00,B,2440,3.3,3.28,USD,0,0,2022-04-14 16:00:00,
3,85d0c0ec-dca7-4a14-bff3-a51373b024be,9313,d6437e9c-c7fb-4549-a1dd-13232d684db0,642afcee-e264-4005-834b-c8dbeb71b588,2022-03-17 14:30:00,S,99393,4.0,92.11,USD,0,0,2022-04-14 16:00:00,
4,f597643c-aa81-44b7-ac0a-15d949abe232,6068,1d075d23-2b3d-4375-8f7a-ecfabb3aa4cd,76e2aa4f-f11c-4cff-a602-b5fa92301aef,2022-02-24 14:30:00,B,266,1.0,213.22,USD,0,0,2022-04-14 16:00:00,


In [176]:
# Missing values (the empty amended_dt) are written as the literal text NULL.
trades_df.to_csv(f'{data_directory}{prefix}trades.csv', header=False, index=False, na_rep='NULL')