In [9]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tslearn.metrics as tm
from tslearn.utils import to_time_series
from tslearn.utils import to_time_series_dataset

In [2]:
# Holdings export: both ID columns are cast to str and the ' 00:00:00' text
# is removed from HOLDDATE.
df = pd.read_csv('Results_20160630_20200630.csv').astype(
    {'LGCYINVESTORID': str, 'INSTRID': str}
)
df = df.assign(HOLDDATE=df['HOLDDATE'].str.replace(r' 00:00:00', ''))

In [56]:
# Target investor list; the ID column is cast to str (it is the merge key
# against df's LGCYINVESTORID further down).
df_target = pd.read_csv('ListOfInvestor.csv').astype({'LgcyInvestorID': str})

In [82]:
# Distinct (investor ID, full name) pairs found in the holdings data.
df_comp = df.loc[:, ['LGCYINVESTORID', 'FULLNAME']].drop_duplicates()

# Attach those pairs to the target list with a left join, then drop the
# target-side key so only the holdings-side LGCYINVESTORID column remains.
df_target_comp = df_target.merge(
    df_comp,
    how='left',
    left_on='LgcyInvestorID',
    right_on='LGCYINVESTORID',
).drop(columns=['LgcyInvestorID'])

In [4]:
# Distinct HOLDDATE values across the whole holdings frame.
quaters = set(df['HOLDDATE'].tolist())

In [21]:
def get_time_series(comp_id, holdings=None, all_quarters=None):
    """Build one PCTPORTFOLIO series per instrument for a single investor.

    Parameters
    ----------
    comp_id
        Value matched against the ``LGCYINVESTORID`` column.
    holdings : pandas.DataFrame, optional
        Frame with ``LGCYINVESTORID``, ``HOLDDATE``, ``INSTRID`` and
        ``PCTPORTFOLIO`` columns. Defaults to the notebook-level ``df``.
    all_quarters : iterable, optional
        Every HOLDDATE value each series must cover. Defaults to the
        notebook-level ``quaters``.

    Returns
    -------
    collections.defaultdict
        ``{INSTRID: [value per HOLDDATE, in sorted HOLDDATE order]}``. Dates
        with no value for an instrument are filled with 0. Empty when the
        investor has no rows in ``holdings``.
    """
    # Defaults are resolved at call time so the function keeps working with
    # the notebook globals when called with a single argument.
    if holdings is None:
        holdings = df
    if all_quarters is None:
        all_quarters = quaters

    res = defaultdict(list)
    investor_rows = holdings[holdings['LGCYINVESTORID'] == comp_id]
    if investor_rows.empty:
        return res

    weights = pd.pivot_table(
        investor_rows,
        index=['HOLDDATE'],
        columns=['INSTRID'],
        values=['PCTPORTFOLIO'],
    )

    # Align on the complete, sorted date list instead of patching in the dates
    # missing from the raw rows: the pivot itself can drop a date (e.g. when
    # all of its PCTPORTFOLIO values are NaN), and reindexing restores those
    # too, so every series has exactly one entry per date.
    weights = weights.reindex(sorted(all_quarters)).fillna(0)

    # Columns are (value name, INSTRID) pairs; key the result by INSTRID.
    for col in weights.columns:
        res[col[1]] = weights[col].to_list()
    return res

In [23]:
# investor ID -> {instrument ID: series}, one entry per row of df_target
time_series_dict = defaultdict(dict)
for comp_id in df_target['LgcyInvestorID']:
    time_series_dict[comp_id] = get_time_series(comp_id)

In [None]:
# Empty module-level DataFrame. Nothing in the cells visible here reads it;
# the functions defined below bind their own local `res`.
res = pd.DataFrame()

In [135]:
def get_ts_sim(ts1, ts2):
    """Similarity as the inverse of the summed DTW distance over shared keys.

    Parameters
    ----------
    ts1, ts2 : dict
        Key -> sequence of values. Only keys present in both are compared.

    Returns
    -------
    float
        ``1.0 / total`` where ``total`` is the sum of ``tm.dtw`` over the
        shared keys; ``0.0`` when the two mappings share no key;
        ``float('inf')`` when keys are shared and ``total`` is zero.
    """
    common_keys = ts1.keys() & ts2.keys()
    if not common_keys:
        # Nothing to compare: no evidence of similarity.
        return 0.0

    total = 0.0
    for key in common_keys:
        total += tm.dtw(ts1[key], ts2[key])

    # A zero total means every shared series matched exactly. That is the
    # closest possible outcome, so it must not collapse to the same 0.0 that
    # signals "no overlap".
    return 1.0 / total if total != 0.0 else float('inf')

def get_ts_sim_2(ts1, ts2):
    """Summed DTW distance from ``ts1`` to ``ts2`` (smaller means closer).

    A key present in both mappings adds the DTW distance between its two
    series. A key present only in ``ts1`` adds the DTW distance between the
    ``ts1`` series and a list of ``len(quaters)`` zeros. A key present only in
    ``ts2`` adds nothing, so swapping the arguments can change the result.
    """
    shared = set(ts1.keys() & ts2.keys())
    total = 0.0

    for k in shared:
        total = total + tm.dtw(ts1[k], ts2[k])

    for k in ts1.keys() - shared:
        zeros = [0] * len(quaters)
        total = total + tm.dtw(ts1[k], zeros)

    return total

def get_similar_comp(target, time_series_dict, candidates=None):
    """Rank the other investors by ``get_ts_sim_2`` distance from ``target``.

    Parameters
    ----------
    target
        Investor ID whose series are the reference.
    time_series_dict : dict
        Investor ID -> series mapping, indexed here with ``[id]``.
    candidates : iterable, optional
        Investor IDs to rank. Defaults to ``df_target['LgcyInvestorID']``.
        Duplicates and ``target`` itself are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``LgcyInvestorID`` and ``Distance``, ascending by distance,
        index reset. Has zero rows (but both columns) when there is no peer.
    """
    if candidates is None:
        candidates = df_target['LgcyInvestorID']

    target_ts = time_series_dict[target]

    # dict.fromkeys de-duplicates while keeping listing order, so the output
    # is reproducible between runs (iterating a set of strings is not).
    peers = [cp for cp in dict.fromkeys(candidates) if cp != target]
    rows = [(cp, get_ts_sim_2(target_ts, time_series_dict[cp])) for cp in peers]

    # Naming the columns at construction keeps the frame valid when `rows` is
    # empty; assigning two names to a zero-column frame would raise.
    ranked = pd.DataFrame(rows, columns=['LgcyInvestorID', 'Distance'])

    # mergesort is stable: equal distances keep their listing order.
    ranked = ranked.sort_values(by=['Distance'], ascending=True, kind='mergesort')
    return ranked.reset_index(drop=True)

In [147]:
# Write one ranked-peer CSV per target investor into ./sim_res/.
output_dir = './sim_res/'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories

for investor_id, full_name in zip(df_target_comp['LGCYINVESTORID'], df_target_comp['FULLNAME']):
    # The left merge that built df_target_comp leaves NaN in both columns for
    # a target with no match in df_comp: there is nothing to rank and no name
    # to build a file path from.
    if pd.isna(investor_id) or pd.isna(full_name):
        continue

    company = get_similar_comp(investor_id, time_series_dict)
    company_res = pd.merge(
        company,
        df_comp,
        how='left',
        left_on='LgcyInvestorID',
        right_on='LGCYINVESTORID',
    ).drop(columns=['LgcyInvestorID'])

    # A path separator inside the name would be read as a directory boundary.
    file_stem = str(full_name).replace('/', '_').replace(os.sep, '_')
    company_res.to_csv(output_dir + file_stem + '.csv')

In [139]:
# vanguard = get_similar_comp('2004260', time_series_dict)
# vanguard_res = pd.merge(vanguard, df_comp, how='left', left_on='LgcyInvestorID', right_on='LGCYINVESTORID').drop(columns=['LgcyInvestorID'])
# vanguard_res.to_csv('vangurad_time_sim.csv')

In [122]:
# citadel = get_similar_comp('2006452', time_series_dict)
# citadel_res = pd.merge(citadel, df_comp, how='left', left_on='LgcyInvestorID', right_on='LGCYINVESTORID').drop(columns=['LgcyInvestorID'])
# citadel_res.to_csv('citadel_time_sim.csv')