# Description

- The port_IDs of the large sparse matrix are converted into an info table that contains all relevant flags / obj_cds for each individual portfolio

In [85]:
import feather

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import sparse

# Functions

In [86]:
def port_ID_split(port_ID):
    """Split an 18-character port_ID into portfolio number and report date.

    The first 7 characters are parsed as the portfolio number and everything
    from position 8 onward as the report date, e.g. '1028236-2010-09-30'.

    Returns
    -------
    tuple or None
        (port_no, report_dt) when port_ID has exactly 18 characters,
        otherwise None.
    """
    if len(port_ID) != 18:
        return None

    number_part, date_part = port_ID[:7], port_ID[8:]
    parsed_date = pd.Timestamp(date_part)
    parsed_number = pd.to_numeric(number_part)
    return (parsed_number, parsed_date)

In [371]:
def port_ID_to_port_info(port_ID,info_df):
    """Look up the fund flags and objective code that apply to one port_ID.

    Parameters
    ----------
    port_ID : str
        Identifier of 18 characters, e.g. '1028236-2010-09-30'
        (portfolio number, dash, report date).
    info_df : pandas.DataFrame
        Table providing the columns crsp_portno, begdt, enddt, crsp_obj_cd,
        index_fund_flag and et_flag.

    Returns
    -------
    tuple
        (port_ID, port_no, report_dt, index_fund_flag, et_flag, crsp_obj_cd).
        The three flag/code entries are NaN when no row of info_df matches
        the portfolio number with begdt <= report_dt <= enddt, or when the
        lookup fails. All six entries are NaN when port_ID does not have
        18 characters.
    """
    if len(port_ID) != 18:
        return(np.nan,np.nan,np.nan,np.nan,np.nan,np.nan)

    port_no, report_dt = port_ID_split(port_ID)
    not_found = (port_ID,port_no,report_dt,np.nan,np.nan,np.nan)

    try:
        same_port = info_df.loc[info_df['crsp_portno'].values == port_no]

        # Keep only the rows whose validity window contains the report date
        valid = same_port.loc[
            (same_port.begdt <= report_dt) &
            (same_port.enddt >= report_dt)]

        # Handle "no matching row" explicitly instead of relying on an IndexError
        if len(valid) == 0:
            return not_found

        # If several rows match, the first one wins
        crsp_obj_cd = valid['crsp_obj_cd'].values[0]
        index_fund_flag = valid['index_fund_flag'].values[0]
        et_flag = valid['et_flag'].values[0]

        return(port_ID, port_no, report_dt, index_fund_flag, et_flag, crsp_obj_cd)

    # Best-effort lookup: any ordinary failure yields NaN placeholders, but
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except Exception:
        return not_found

In [8]:
def port_map_to_port_ID(port_map):
    """Convert the keys of a portfolio map into date-based port_IDs.

    Each key is expected to look like '<port_no>-<days>', where <days> is an
    integer day offset counted from 1960-01-01. The offset is replaced by the
    corresponding calendar date, giving '<port_no>-<YYYY-MM-DD>'.

    Parameters
    ----------
    port_map : dict
        Mapping whose keys are '<port_no>-<days>' strings. The values are
        not used.

    Returns
    -------
    list of str
        One port_ID per key, in the iteration order of port_map.
    """
    left = []
    right = []
    # Read the keys from the argument, not from a notebook-level global
    for key in list(port_map):
        left_temp, right_temp = key.split('-')
        left.append(left_temp)
        right.append(right_temp)

    # Nothing to convert
    if not left:
        return []

    int_date = pd.to_numeric(right)
    date = pd.to_timedelta(int_date, unit='D') + pd.Timestamp('1960-1-1')

    port_ID_new = [m + '-' + str(n.date()) for m,n in zip(left,date)]
    return(port_ID_new)

# Data

In [9]:
# Load the fund summary table from a feather file into `info`
path = 'data/total_summary.feather'
info = feather.read_dataframe(path)

In [10]:
# NOTE(review): if stock_map.npy also stores a pickled object it needs
# allow_pickle=True as well -- TODO confirm.
stock_map = np.load('data/stock_map.npy')
# port_no_map.npy holds a pickled Python object (unwrapped with .item() below);
# NumPy >= 1.16.3 refuses to load such files unless allow_pickle=True is given.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
port_no_map_a = np.load('data/port_no_map.npy', allow_pickle=True)
port_no_map = port_no_map_a.item()

In [11]:
# `port_no_map` changes from a dict to a list of port_ID strings here. Build it
# from the raw loaded array so that the cell can be re-run.
port_no_map = port_map_to_port_ID(port_no_map_a.item())

In [12]:
# Report rows / columns of the raw table with thousands separators
print('Shape before cleaning is: {:,} / {:,}'.format(*info.shape))

Shape before cleaning is: 166,461 / 12


In [136]:
# Keep only rows that have a portfolio number, then drop rows that are
# duplicates in every column except crsp_fundno / fund_name.
info_clean = (
    info[info['crsp_portno'].notna()]
    .drop_duplicates(subset=info.columns.difference(['crsp_fundno','fund_name']))
    # Replace the column through assign: `.loc[:, col] = ...` on a filtered
    # frame raises SettingWithCopyWarning and, in pandas >= 2.0, writes in
    # place and keeps the old dtype, which undoes the integer downcast.
    .assign(crsp_portno=lambda df: pd.to_numeric(df['crsp_portno'], downcast='integer'))
)

In [133]:
# Report rows / columns of the cleaned table with thousands separators
print('Shape after cleaning is: {:,} / {:,}'.format(*info_clean.shape))

Shape after cleaning is: 95,614 / 12


# Match obj code to port_ID

In [398]:
%%time

# One lookup per port_ID, restricted to the last 200,000 entries of port_no_map
test = [
    port_ID_to_port_info(port_ID, info_clean)
    for port_ID in port_no_map[-200_000:]
]

CPU times: user 6min 21s, sys: 1.29 s, total: 6min 22s
Wall time: 6min 23s


In [545]:
# Column names, in the order of the tuples returned by port_ID_to_port_info
labels = ['port_ID','port_no','report_dt','index_fund_flag','et_flag','crsp_obj_cd']

In [546]:
# Build one row per lookup result stored in `test`
info_df = pd.DataFrame.from_records(test, columns=labels)

In [547]:
# Preview the first rows of the lookup table
info_df.head()

Unnamed: 0,port_ID,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd
0,1028236-2010-09-30,1028236.0,2010-09-30,D,F,EDCM
1,1028236-2010-10-31,1028236.0,2010-10-31,D,F,EDCM
2,1028236-2010-11-30,1028236.0,2010-11-30,D,F,EDCM
3,1028236-2010-12-31,1028236.0,2010-12-31,D,F,EDCM
4,1028236-2011-01-31,1028236.0,2011-01-31,D,F,EDCM


In [548]:
# Shape of the subset of rows that have no objective code
info_df[info_df['crsp_obj_cd'].isna()].shape # -> For some portfolios there is simply no row in fund_header

(1143, 6)

# Clean data

In [549]:
# A missing et_flag is encoded as 'N' (not an ETF)
et_flag_missing = info_df['et_flag'].isna()
info_df.loc[et_flag_missing, 'et_flag'] = 'N'

In [550]:
# A missing index_fund_flag is encoded as 'N' (not an index fund)
index_flag_missing = info_df['index_fund_flag'].isna()
info_df.loc[index_flag_missing, 'index_fund_flag'] = 'N'

In [551]:
# Create new flag variable 'mutual_fund':
# 'Y' when the row is neither an index fund nor an ETF, otherwise 'N'
neither_index_nor_etf = (info_df['index_fund_flag'] == 'N') & (info_df['et_flag'] == 'N')
info_df.loc[neither_index_nor_etf, 'mutual_fund'] = 'Y'

mutual_fund_unset = info_df['mutual_fund'].isna()
info_df.loc[mutual_fund_unset, 'mutual_fund'] = 'N'

In [556]:
# Create subsample of crsp_obj_cd: keep the 20 codes with the highest row
# counts and pool every other code (and missing codes) as 'Other'
rows_per_obj_cd = info_df["port_no"].groupby(info_df["crsp_obj_cd"]).count()
most_common = rows_per_obj_cd.sort_values().iloc[-20:].index

is_common_cd = info_df["crsp_obj_cd"].isin(most_common)
info_df.loc[is_common_cd, 's_crsp_obj_cd'] = info_df["crsp_obj_cd"]

not_assigned = info_df["s_crsp_obj_cd"].isna()
info_df.loc[not_assigned, 's_crsp_obj_cd'] = 'Other'

info_df.sample(5)

Unnamed: 0,port_ID,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd,mutual_fund,s_crsp_obj_cd
165136,1032558-2016-11-30,1032558.0,2016-11-30,B,F,O,N,O
112546,1030897-2014-05-31,1030897.0,2014-05-31,N,N,I,Y,I
77294,1029859-2013-08-31,1029859.0,2013-08-31,N,N,EDYB,Y,EDYB
75948,1029841-2015-07-31,1029841.0,2015-07-31,D,F,EFSF,N,Other
182137,1033328-2015-12-31,1033328.0,2015-12-31,N,N,I,Y,I


In [553]:
# Same fill as the last line of the cell that creates 'mutual_fund';
# it changes nothing once that cell has run.
info_df.loc[info_df['mutual_fund'].isna(),'mutual_fund'] = 'N'

In [554]:
# Preview the first rows, now including mutual_fund and s_crsp_obj_cd
info_df.head()

Unnamed: 0,port_ID,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd,mutual_fund,s_crsp_obj_cd
0,1028236-2010-09-30,1028236.0,2010-09-30,D,F,EDCM,N,EDCM
1,1028236-2010-10-31,1028236.0,2010-10-31,D,F,EDCM,N,EDCM
2,1028236-2010-11-30,1028236.0,2010-11-30,D,F,EDCM,N,EDCM
3,1028236-2010-12-31,1028236.0,2010-12-31,D,F,EDCM,N,EDCM
4,1028236-2011-01-31,1028236.0,2011-01-31,D,F,EDCM,N,EDCM


# Save info_df

In [555]:
# Write info_df to a feather file (note: rebinds `path`, which the loading cell also used)
path = 'data/info_df_total.feather'
feather.write_dataframe(info_df,path)