In [1]:
!pip install fuzzywuzzy
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from fuzzywuzzy import fuzz
from collections import Counter
from google.colab import drive
import re
from tqdm import tqdm

drive.mount('/gdrive')

Collecting fuzzywuzzy
  Downloading https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




Mounted at /gdrive


In [2]:
import pandas as pd
# 18Q3 data with cluster assignments (provides the INSTRID / COMNAME columns used below)
data_18Q3 = pd.read_csv(
  filepath_or_buffer='/gdrive/My Drive/capstone/full database/data_18Q3_with_cluster.csv'
)
# crawled ETF reference table (name / segment / ticker / issuer columns are used below)
etf_info = pd.read_csv(
  filepath_or_buffer='/gdrive/My Drive/capstone/full database/crawled_ETF_info.csv'
)

In [3]:
# number of distinct INSTRID values in the 18Q3 data
len(set(data_18Q3['INSTRID']))

10863

In [4]:
# number of rows in the crawled ETF table
etf_info.shape[0]

2357

In [5]:
# sanity check: similarity score (0-100) for two cleaned names of different funds
fuzz.ratio(
  'vanguardemergingmarketsstock',
  'firsttrustnasdaq100techsector',
)

14

In [6]:
def clean_text(text):
  """Normalize a fund/ETF name into a compact, lowercase alphanumeric key.

  Steps, in order:
    1. lowercase the text;
    2. tokenize with ``word_tokenize`` and drop the tokens 'fund', 'etf'
       and 'index';
    3. expand the abbreviations 'tr' -> 'trust' and 'achvrs' -> 'achievers'
       (whole words only) and replace '$' with 'usd';
    4. remove every character that is not an ASCII letter or digit, which
       also removes the spaces between tokens.

  Args:
    text: the raw name as a str.

  Returns:
    The cleaned name as one str without spaces or punctuation.
  """
  remove_list = ['fund', 'etf', 'index']
  text = text.lower()
  tokens = word_tokenize(text)
  text = ' '.join([token for token in tokens if token not in remove_list])

  # replace some abbreviations
  text = re.sub(r"\btr\b", 'trust', text)
  text = re.sub(r"\bachvrs\b", 'achievers', text)
  # raw string: a plain '\$' is an invalid string escape and triggers a
  # DeprecationWarning/SyntaxWarning on recent Python versions
  text = re.sub(r'\$', 'usd', text)

  # remove all punc
  text = re.sub('[^A-Za-z0-9]+', '', text)
  return text

print(clean_text('Vanguard Emerging Markets Stock Index Fund;ETF'))

vanguardemergingmarketsstock


In [7]:
# keep one (INSTRID, COMNAME) row per instrument id
instr_df = data_18Q3[['INSTRID', 'COMNAME']].drop_duplicates(subset='INSTRID')
print(len(instr_df))
# discard rows where either the id or the name is missing
instr_df = instr_df.dropna()
print(len(instr_df))

# parallel lists: position i in both refers to the same instrument
instr_id_list = instr_df['INSTRID'].tolist()
instr_name_list = instr_df['COMNAME'].tolist()

# ETF names in etf_info row order, raw and cleaned
etf_name_list = etf_info['name'].tolist()
etf_clean_name_list = list(map(clean_text, etf_name_list))

10863
10862


In [121]:
# match the ETF names with the instrument names
#
# For every instrument name, find the ETF whose cleaned name has the highest
# fuzz.ratio against the cleaned instrument name, and accept it as a match if
# that ratio is at least MIN_MATCH_RATIO. Results are collected in four lists
# that run parallel to instr_id_list / instr_name_list.
MIN_MATCH_RATIO = 90  # minimum fuzz.ratio (0-100) for a pair to count as a match

match_flag_list = []
match_segment_list = []
match_ticker_list = []
match_issuer_list = []
for instr_name in tqdm(instr_name_list):
  instr_name_clean = clean_text(instr_name)

  # The running best must be initialised once per instrument, outside the
  # candidate loop; resetting it per candidate would let the LAST candidate
  # above the threshold win instead of the best-scoring one.
  best_ratio = 0
  best_index = None
  # enumerate gives the candidate's position directly (no list.index scan,
  # and no ambiguity when two ETFs share the same cleaned name)
  for etf_index, etf_name_clean in enumerate(etf_clean_name_list):
    fuzz_ratio = fuzz.ratio(etf_name_clean, instr_name_clean)
    # strict '>' keeps the first candidate on ties
    if fuzz_ratio >= MIN_MATCH_RATIO and fuzz_ratio > best_ratio:
      best_ratio = fuzz_ratio
      best_index = etf_index

  if best_index is not None:
    # if instrument matched in etf list
    # etf_clean_name_list is in etf_info row order, so look up by position
    match_flag_list.append(True)
    match_segment_list.append(etf_info['segment'].iat[best_index])
    match_ticker_list.append(etf_info['ticker'].iat[best_index])
    match_issuer_list.append(etf_info['issuer'].iat[best_index])
  else:
    # if instrument not matched in etf list
    match_flag_list.append(False)
    match_segment_list.append(None)
    match_ticker_list.append(None)
    match_issuer_list.append(None)

instr_match_ETF_df = pd.DataFrame({
    'instr_id': instr_id_list,
    'instr_name': instr_name_list,
    'ETF_flag': match_flag_list,
    'ETF_segment': match_segment_list,
    'ETF_ticker': match_ticker_list,
    'ETF_issuer': match_issuer_list
})
# True counts as 1, so the sum is the number of matched instruments
print('number of instruments matched as ETF:', sum(match_flag_list))

100%|██████████| 10862/10862 [36:24<00:00,  4.97it/s]


number of instruments matched as ETF: 1663


In [122]:
len(instr_name_list)

10862

In [123]:
# save csv to drive
# Pass the path rather than a text handle opened without newline='': pandas
# then opens the file itself with the newline/encoding settings its CSV writer
# expects. The DataFrame index is still written as the first column.
instr_match_ETF_df.to_csv('/gdrive/My Drive/capstone/full database/instr_match_ETF.csv')