## Description

Descirption:

Filter the three main data sources (Fund Info, Returns and Holdings)
based on some parameters and save the result.

Approach:
1. Match fund summary to each portfolio/date pair
2. Filter fund/date pairs based on those infos
3. Filter returns based on final sample of holdings

Parameters: 
- Obj_codes
- Percentage
- Flags

In [1]:
import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
from data.basic_functions import * 

In [2]:
import numpy as np
import pandas as pd

from scipy import sparse

# For multiprocessing
import multiprocessing
from itertools import product

### Load the data files

In [12]:
data_path = '../data/raw/'

returns = load_data(data_path,'monthly_returns')
summary = load_data(data_path,'total_summary_new') # Wo kommt das her ???

In [13]:
npz_path = '../data/interim/sparse_matrix_t.npz'
holdings = sparse.load_npz(npz_path)
holdings.shape

(733743, 2382968)

In [14]:
data_path = '../data/interim/'
holdings_summary = load_data(data_path,'sparse_info_t')
holdings_summary.shape

(733743, 2)

## Drop duplicates in summary data

To increase speed of matching obj_codes and other fund info to portfolios

In [15]:
print('Shape of summary before cleaning is: {:,} / {:,}'.format(summary.shape[0],summary.shape[1]))

Shape of summary before cleaning is: 167,176 / 8


In [16]:
# Delet not nas
summary_clean = summary[summary['crsp_portno'].notna()]

# Drop duplicates based on all but two columns
summary_clean = summary_clean.drop_duplicates(summary_clean.columns.difference(['crsp_fundno','fund_name']))

summary_clean.loc[:,'crsp_fundno'] = pd.to_numeric(summary_clean.loc[:,'crsp_fundno'], downcast='integer')
summary_clean.loc[:,'crsp_portno'] = pd.to_numeric(summary_clean.loc[:,'crsp_portno'], downcast='integer')

In [17]:
print('Shape of summary data after cleaning: {:,} / {:,}'.format(summary_clean.shape[0],summary_clean.shape[1]))

Shape of summary data after cleaning: 96,077 / 8


# Match obj code to portfolios

TODO: 

Fund info flags like the index_fund_flag could in theory also be different in the fund_history

Therefore a similar approach should also be used for those items

In general portno fundno map also beginning and end times

In [18]:
def port_ID_to_port_info(fund_info):  
            """
            Used to merge the right obj_code and other fund info 
            to each holdings_info row and therefore to each row of the sparse holdings matrix
            
            Input:
            - fund_info: Tuple consisting of the port_no (1st element) and the report_dt (2nd)
            
            Output:
            - Tuple of port_no, report_dt, index_fund_flag, et_flag and crsp_obj_cd. 
                NaN if value is not available
            
            Attention:
            Depends on global variable summary to look up the values
            Must be renamed or changed in the function
            """
            port_no = fund_info[0]
            report_dt = fund_info[1]
            mask = summary['crsp_portno'].values == port_no
            my_class = summary.loc[mask]

            my_class_n = my_class.loc[
                (my_class.begdt <= report_dt) & 
                (my_class.enddt >= report_dt)]
            
            try:
                crsp_obj_cd = my_class_n['crsp_obj_cd'].values[0]
                index_fund_flag = my_class_n['index_fund_flag'].values[0]
                et_flag = my_class_n['et_flag'].values[0]

            except:
                crsp_obj_cd = np.nan
                index_fund_flag = np.nan
                et_flag = np.nan
                
                
            return(port_no, report_dt, index_fund_flag, et_flag, crsp_obj_cd)        

### Multiprocessing

In [19]:
a = holdings_summary['port_no']
b = holdings_summary['date']

fund_info = list(zip(a,b))

In [None]:
%%time
with multiprocessing.Pool(processes=8) as pool:
    results = pool.map(port_ID_to_port_info, fund_info)

In [21]:
labels = ['port_no','report_dt','index_fund_flag','et_flag','crsp_obj_cd']
holdings_summary = pd.DataFrame.from_records(results, columns=labels)

holdings_summary.shape

(733743, 5)

In [26]:
holdings_summary[holdings_summary['crsp_obj_cd'].isna()].shape # -> For some portfolios there is simly no row in fund_header

(129400, 5)

### Out of the roughly 730k portfolios, for 129k there is no fund header info available

In [27]:
holdings_summary.sample(10)

Unnamed: 0,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd
320803,1024065,2013-01-31,,,EF
175797,1021290,2011-08-31,,,EF
298427,1023604,2017-12-31,,,EDCM
404401,1025672,2015-05-31,,,EDYG
642137,1030742,2013-09-30,D,F,IF
454063,1026605,2016-09-30,,,EDCM
681492,1031930,2017-02-28,B,,M
368302,1024971,2012-01-31,,,EDYG
718518,1033410,2016-07-31,,,EDYB
26974,1000908,2004-03-31,,,


# Clean holdings_summary data

Do not delet rows since they match to the sparse matrix

TODO: Replace all falgs with proper categories

In [34]:
holdings_summary[['et_flag','index_fund_flag']] = holdings_summary[['et_flag','index_fund_flag']].astype('category')

In [53]:
et_mapper = {'F':'ETF', 'N':'ETN', np.nan:'MF'}
holdings_summary['et_flag'] = holdings_summary['et_flag'].map(et_mapper)

In [58]:
index_flag_mapper = {'B':'Index-based', 'D':'Pure Index', 'E':'Index edhanced', np.nan:'MF'}
holdings_summary['index_fund_flag'] = holdings_summary['index_fund_flag'].map(index_flag_mapper)

In [67]:
holdings_summary.describe(include='all')

Unnamed: 0,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd
count,733743.0,733743,733743,733743,604343
unique,,216,1,1,55
top,,2014-03-31 00:00:00,MF,MF,EDYG
freq,,8829,733743,733743,92261
first,,1960-01-01 00:00:00,,,
last,,2018-06-30 00:00:00,,,
mean,1022105.0,,,,
std,9742.804,,,,
min,0.0,,,,
25%,1021414.0,,,,


In [None]:
# Replace NaN with N for not an ETF
holdings_summary.loc[holdings_summary['et_flag'].isna(),'et_flag'] = 'mf'

In [None]:
# Replace NaN with N for not an Index fund
holdings_summary.loc[holdings_summary['index_fund_flag'].isna(),'index_fund_flag'] = 'N'

In [None]:
# Creat new flag var 'mutual_fund'
holdings_summary.loc[(holdings_summary['index_fund_flag'] == 'N') & (holdings_summary['et_flag'] == 'N'),'mutual_fund'] = 'Y'
holdings_summary.loc[holdings_summary['mutual_fund'].isna(),'mutual_fund'] = 'N'

In [None]:
# Creat subsample of crsp_obj_cd
most_common = (holdings_summary["port_no"]
        .groupby([
        holdings_summary["crsp_obj_cd"],
        ])
        .count()
        .sort_values()[-20:]
        .index
)

holdings_summary.loc[holdings_summary["crsp_obj_cd"].isin(most_common),'s_crsp_obj_cd'] = holdings_summary["crsp_obj_cd"]
holdings_summary.loc[holdings_summary["s_crsp_obj_cd"].isna(),'s_crsp_obj_cd'] = 'Other'

holdings_summary.sample(5)

In [None]:
holdings_summary.loc[holdings_summary['mutual_fund'].isna(),'mutual_fund'] = 'N'

In [None]:
holdings_summary.head()

# Save holdings_summary

In [None]:
path = 'data/info_df_total.feather'
feather.write_dataframe(info_df,path)

### Restrict to last 200k lines

In [None]:
holdings_s = holdings[-200_000:] # TODO -> anderes skript implementieren

### Shape

In [None]:
holdings_s.shape

In [None]:
info_matched.shape

### Selection criteria

In [None]:
selected_obj_codes = ('EDYG','EDYB' ,'EDYH' ,'EDYS' ,'EDYI')
selected_portion = 0.1

### Select funds from summary

First select the funds based on their characteristics in the fund_summary table

TODO: Flags

Then select the matching fund returns and fund holdings

In [None]:
summary.head()

In [None]:
# Implement multiple masks
mask_1 = summary['crsp_obj_cd'].isin(selected_obj_codes) # only certain obj_codes
# mask_2 = summary['mutual_fund'] == 'Y' # only mutual funds TODO needs to be implemented
# (info_df['index_fund_flag'] == 'N') & (info_df['et_flag'] == 'N')

# info_df.loc[(info_df['index_fund_flag'] == 'N') & (info_df['et_flag'] == 'N'),'mutual_fund'] = 'Y'
# info_df.loc[info_df['mutual_fund'].isna(),'mutual_fund'] = 'N'

In [None]:
summary_s = summary.loc[mask_1]

In [None]:
summary.drop_duplicates?

### Filter returns

In [None]:
unique_fundnos = summary_s['crsp_fundno'].drop_duplicates()

In [None]:
mask = returns['crsp_fundno'].isin(unique_fundnos)
returns_s = returns[mask]

### Filter holdings

In [None]:
train_mask = (info_df['port_no'].isin(list(X_train_s))))

info_train = info_df[train_mask]
info_test = info_df[test_mask]

sparse_matrix_train = sparse_matrix_ss[train_mask.values,:]
sparse_matrix_test = sparse_matrix_ss[test_mask.values,:]

### Save final cleaned and filtered data