## Description

Description:

Filter the three main data sources (Fund Info, Returns and Holdings)
based on some parameters and save the result.

Approach:
1. Match fund summary to each portfolio/date pair
2. Filter fund/date pairs based on that information
3. Filter returns based on final sample of holdings

Parameters: 
- Obj_codes
- Percentage
- Flags

In [2]:
%matplotlib inline

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
from data.basic_functions import * 

import numpy as np
import pandas as pd

from scipy import sparse

import matplotlib.pyplot as plt

# For multiprocessing
import multiprocessing
from itertools import product

### Load the data files

In [3]:
# Load the raw monthly returns and the fund summary through the project helper load_data
data_path = '../data/raw/'

returns = load_data(data_path,'monthly_returns')
summary = load_data(data_path,'total_summary_new') # Where does this come from ???

In [4]:
# Load the sparse holdings matrix and display its shape
npz_path = '../data/interim/sparse_matrix_t.npz'
holdings = sparse.load_npz(npz_path)
holdings.shape

(733743, 2382968)

In [5]:
# Load the per-row info table that belongs to the sparse holdings matrix
# (note: data_path is re-pointed from the raw to the interim directory here)
data_path = '../data/interim/'
holdings_summary = load_data(data_path,'sparse_info_t')
holdings_summary.shape

(733743, 2)

In [6]:
# Load the portno map used later to attach a fund number to each portfolio.
# NOTE(review): `feather` is not imported explicitly in this notebook; presumably it is
# provided by `from data.basic_functions import *` — confirm.
path = '../data/raw/portno_map.feather'
portno_map = feather.read_dataframe(path)

## Drop duplicates in summary data

To increase speed of matching obj_codes and other fund info to portfolios

In [7]:
n_rows, n_cols = summary.shape
print('Shape of summary before cleaning is: {:,} / {:,}'.format(n_rows, n_cols))

Shape of summary before cleaning is: 167,176 / 8


In [8]:
# Keep only rows that have a portfolio number (rows with a missing crsp_portno are dropped)
summary_clean = summary[summary['crsp_portno'].notna()]

# Drop duplicates based on all but two columns
dedup_columns = summary_clean.columns.difference(['crsp_fundno','fund_name'])
summary_clean = summary_clean.drop_duplicates(subset=dedup_columns)

# Replace whole columns via assign instead of writing through .loc on a filtered frame:
# this avoids SettingWithCopyWarning and makes sure the converted dtype is actually adopted.
summary_clean = summary_clean.assign(
    crsp_fundno=pd.to_numeric(summary_clean['crsp_fundno'], downcast='integer'),
    crsp_portno=pd.to_numeric(summary_clean['crsp_portno'], downcast='integer'),
)

In [9]:
n_rows_clean, n_cols_clean = summary_clean.shape
print('Shape of summary data after cleaning: {:,} / {:,}'.format(n_rows_clean, n_cols_clean))

Shape of summary data after cleaning: 96,077 / 8


# Match obj code to portfolios

TODO: 

Fund info flags like the index_fund_flag could in theory also be different in the fund_history

Therefore a similar approach should also be used for those items

In general, the portno-to-fundno map also comes with begin and end dates (not used here yet)

In [10]:
def port_ID_to_port_info(fund_info, summary_df=None):
    """
    Look up the fund information that is valid for one portfolio/date pair.

    Used to merge the right obj_code and other fund info
    to each holdings_info row and therefore to each row of the sparse holdings matrix.

    Input:
    - fund_info: Tuple consisting of the port_no (1st element) and the report_dt (2nd)
    - summary_df: Optional DataFrame with the columns crsp_portno, begdt, enddt,
        crsp_obj_cd, index_fund_flag and et_flag. When omitted, the global
        variable summary is used, so the single-argument call made by
        multiprocessing.Pool.map keeps working.

    Output:
    - Tuple of port_no, report_dt, index_fund_flag, et_flag and crsp_obj_cd.
        The three looked-up values are NaN if no row covers report_dt.
        If several rows cover report_dt, the first one is used.

    Raises:
    - KeyError if the lookup table lacks one of the expected columns
        (previously hidden by a bare except and returned as NaN).
    """
    if summary_df is None:
        # Fall back to the notebook-level lookup table
        summary_df = summary

    port_no = fund_info[0]
    report_dt = fund_info[1]

    # All summary rows of this portfolio ...
    same_port = summary_df.loc[summary_df['crsp_portno'].values == port_no]

    # ... restricted to those whose validity period contains the report date
    valid = same_port.loc[
        (same_port.begdt <= report_dt) &
        (same_port.enddt >= report_dt)]

    if valid.empty:
        # No fund header information for this portfolio/date pair
        crsp_obj_cd = np.nan
        index_fund_flag = np.nan
        et_flag = np.nan
    else:
        crsp_obj_cd = valid['crsp_obj_cd'].values[0]
        index_fund_flag = valid['index_fund_flag'].values[0]
        et_flag = valid['et_flag'].values[0]

    return (port_no, report_dt, index_fund_flag, et_flag, crsp_obj_cd)

### Multiprocessing

In [11]:
# One (port_no, date) tuple per row of holdings_summary, in row order
pair_columns = holdings_summary[['port_no','date']]
fund_info = list(pair_columns.itertuples(index=False, name=None))

In [12]:
%%time
# Run port_ID_to_port_info for every (port_no, date) pair on 8 worker processes.
# pool.map returns the results in the same order as fund_info.
# NOTE(review): the workers read the notebook-level `summary` inside the function;
# presumably this relies on a fork-style process start — confirm on other platforms.
with multiprocessing.Pool(processes=8) as pool:
    results = pool.map(port_ID_to_port_info, fund_info)

CPU times: user 5.07 s, sys: 502 ms, total: 5.57 s
Wall time: 5min 24s


In [13]:
# Turn the list of result tuples into a DataFrame.
# Note: this overwrites holdings_summary (the 'date' column is now called 'report_dt'),
# so the cell that builds fund_info cannot be re-run afterwards without reloading.
labels = ['port_no','report_dt','index_fund_flag','et_flag','crsp_obj_cd']
holdings_summary = pd.DataFrame.from_records(results, columns=labels)

holdings_summary.shape

(733743, 5)

In [14]:
# -> For some portfolios there is simply no row in fund_header
missing_obj_cd = holdings_summary['crsp_obj_cd'].isna()
holdings_summary.loc[missing_obj_cd].shape

(129400, 5)

### Out of the roughly 730k portfolios, for 129k there is no fund header info available

In [15]:
# Fixed seed so that the displayed preview is reproducible on re-run
holdings_summary.sample(10, random_state=0)

Unnamed: 0,port_no,report_dt,index_fund_flag,et_flag,crsp_obj_cd
636253,1030515,2012-10-31,,,EF
13074,1000426,2003-06-30,,,
203538,1021838,2013-05-31,,,EF
359350,1024785,2017-01-31,,,EF
583388,1029226,2012-09-30,,,EDYB
659426,1031231,2014-02-28,B,,M
201872,1021799,2015-05-31,,,I
543628,1028368,2014-06-30,,,EFSN
327985,1024203,2014-07-31,,,EDYG
33144,1001117,2010-04-29,,,


# Clean holdings_summary data

Do not delete rows, since they match the rows of the sparse matrix

TODO: Replace all flags with proper categories

In [16]:
# Make the two flags categories and rename those categories accordingly.
# Missing flags (NaN) are mapped to 'MF' through the np.nan key of each mapper.
# NOTE(review): mapping NaN through a dict on a categorical column is sensitive to the
# pandas version — confirm that the crosstab below still shows an 'MF' category.
holdings_summary[['et_flag','index_fund_flag']] = holdings_summary[['et_flag','index_fund_flag']].astype('category')

et_mapper = {'F':'ETF', 'N':'ETN', np.nan:'MF'}
holdings_summary['et_flag'] = holdings_summary['et_flag'].map(et_mapper)

index_flag_mapper = {'B':'Index-based', 'D':'Pure Index', 'E':'Index enhanced', np.nan:'MF'}
holdings_summary['index_fund_flag'] = holdings_summary['index_fund_flag'].map(index_flag_mapper)

In [17]:
# New flag 'mutual_fund': 'Y' when both index_fund_flag and et_flag are 'MF', 'N' otherwise
is_plain_fund = (holdings_summary['index_fund_flag'] == 'MF') & (holdings_summary['et_flag'] == 'MF')
holdings_summary['mutual_fund'] = np.where(is_plain_fund, 'Y', 'N')

In [18]:
# New flag 'sample': 'Y' for rows included in the sample, 'N' for all others
# Not selected at the moment: 'EDCL', 'EDCM', 'EDCS', 'EDCI'
selected_obj_codes = ('EDYG','EDYB' ,'EDYH' ,'EDYS' ,'EDYI')

is_mutual_fund = holdings_summary['mutual_fund'] == 'Y'
has_selected_code = holdings_summary['crsp_obj_cd'].isin(selected_obj_codes)
holdings_summary['sample'] = np.where(is_mutual_fund & has_selected_code, 'Y', 'N')

In [19]:
# Store the two derived Y/N flags as categoricals
derived_flags = ['mutual_fund','sample']
holdings_summary[derived_flags] = holdings_summary[derived_flags].astype('category')

In [20]:
# Number of rows inside ('Y') and outside ('N') the sample
holdings_summary['sample'].value_counts()

N    585680
Y    148063
Name: sample, dtype: int64

In [21]:
# Cross-tabulate the two renamed flags
pd.crosstab(holdings_summary['et_flag'],holdings_summary['index_fund_flag'])

index_fund_flag,Index enhanced,Index-based,MF,Pure Index
et_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ETF,5772,10996,4065,76754
MF,5100,11586,599052,20418


In [22]:
# Cross-tabulate the derived flags mutual_fund and sample
pd.crosstab(holdings_summary['mutual_fund'],holdings_summary['sample'])

sample,N,Y
mutual_fund,Unnamed: 1_level_1,Unnamed: 2_level_1
N,134691,0
Y,450989,148063


#### 148k Funds in EDY
#### 202k Funds in EDC

# Save holdings_summary

In [23]:
path = '../data/interim/holdings_summary_total.feather'
# Persist the result of the expensive matching step. A later cell reloads this file,
# so with the write disabled a fresh run would load stale data (or fail if the file is missing).
feather.write_dataframe(holdings_summary,path)

In [8]:
# Reload the saved holdings summary from disk; this replaces the in-memory holdings_summary.
# NOTE(review): the execution counter restarts here, so the cells below were presumably run
# in a later kernel session — confirm the notebook still works with Restart & Run All.
path = '../data/interim/holdings_summary_total.feather'
holdings_summary = feather.read_dataframe(path)

In [9]:
# Shape of the sparse holdings matrix
holdings.shape

(733743, 2382968)

In [10]:
# Shape of the reloaded holdings summary
holdings_summary.shape

(733743, 7)

### Add fund_no to holdings_summary

#### fund_no is not an integer for now, but that is not that important -> TODO

In [11]:
# Keep the first mapping row per portfolio number
portno_map_unique = portno_map.drop_duplicates(subset=['crsp_portno'], keep='first')

#### Maybe must be modified since all but one associated fund_nos per portfolio are deleted 

In [12]:
# Left join: attach crsp_fundno (and the crsp_portno join key) to every row
portno_to_fundno = portno_map_unique[['crsp_portno','crsp_fundno']]
holdings_summary = pd.merge(
    holdings_summary,
    portno_to_fundno,
    how='left',
    left_on='port_no',
    right_on='crsp_portno',
)

In [13]:
# The merge must neither add nor drop rows: every row belongs to one row of the sparse matrix
assert holdings_summary.shape[0] == holdings.shape[0], 'holdings_summary no longer matches holdings'
holdings_summary.shape

(733743, 9)

In [14]:
# Use the nullable integer dtype. Casting only the non-missing subset with .astype(int) and
# assigning it back is undone by index alignment: rows without a fund number become NaN
# again and force the whole column back to float.
holdings_summary['crsp_fundno'] = pd.to_numeric(holdings_summary['crsp_fundno']).astype('Int64')

In [15]:
# Positional column selection: moves column 8 to second place and leaves out column 7
new_order = [0,8,1,2,3,4,5,6]
holdings_summary = holdings_summary.iloc[:, new_order]

In [16]:
# Rename the merged fund number column to fund_no.
# NOTE(review): index=str additionally converts the row labels to strings — confirm this is intended.
holdings_summary = holdings_summary.rename(columns={'crsp_fundno':'fund_no'}, index=str)

# Take sample according to parameter

### Filter returns

In [17]:
# Distinct fund numbers found in holdings_summary (despite the name, these are fund_no values)
unique_portno = holdings_summary[['fund_no']].drop_duplicates()

In [18]:
# True for return rows whose fund number occurs in holdings_summary
mask = returns['crsp_fundno'].isin(unique_portno['fund_no'])

In [19]:
# Returns restricted to funds that also appear in the holdings data
returns_s = returns[mask]

In [20]:
# Shape of the unfiltered returns
returns.shape

(7273320, 3)

In [21]:
# Shape of the filtered returns
returns_s.shape

(2173160, 3)

#### From 7.3m to 2.2m return datapoints

# Filter holdings

### Mask to filter out only those in the sample according to holdings_summary

In [101]:
# Boolean row mask for the sample, followed by the number of selected rows
mask = holdings_summary['sample'].eq('Y')
mask.sum()

148063

In [102]:
# Same mask, additionally requiring a non-missing port_no
in_sample = holdings_summary['sample'] == 'Y'
has_port_no = holdings_summary['port_no'].notna()
mask = in_sample & has_port_no
mask.sum()

148063

In [103]:
# Select the sample rows of the sparse matrix with the boolean mask (as a plain array)
holdings_s = holdings[mask.values]
holdings_s

<148063x2382968 sparse matrix of type '<class 'numpy.float64'>'
	with 16140657 stored elements in Compressed Sparse Row format>

### Filter holdings summary

In [104]:
# Apply the same mask to the summary so that its rows stay aligned with holdings_s
holdings_summary_s = holdings_summary[mask]

In [105]:
# Shape of the filtered sparse matrix
holdings_s.shape

(148063, 2382968)

In [106]:
# Both objects were filtered with the same mask, so their row counts must agree
assert holdings_summary_s.shape[0] == holdings_s.shape[0], 'holdings_summary_s and holdings_s are misaligned'
holdings_summary_s.shape

(148063, 8)

### Take only last n number of obs per fund

To avoid overweight of funds with many observations

In [107]:
# Row positions of the last 10 rows of every port_no, in their original row order
holdings_summary_s = holdings_summary_s.reset_index()
row_positions = pd.DataFrame(np.arange(len(holdings_summary_s)))
last_rows = row_positions.groupby(holdings_summary_s['port_no']).tail(10)
index = last_rows.values.T.flatten()

In [108]:
# Keep only the selected row positions in both the summary and the sparse matrix.
# NOTE(review): holdings_summary_s was already reset in the previous cell, so this second
# reset_index() adds one more index column; re-running this cell would also apply `index`
# again to the already reduced holdings_s.
holdings_summary_s = holdings_summary_s.reset_index().loc[index,:]
holdings_s = holdings_s[index]
holdings_s.shape

(35791, 2382968)

### Delete columns with little to no information

In [109]:
# Binary copy of the holdings matrix: same stored entries, each value replaced by 1
holdings_b = sparse.csr_matrix(holdings_s, copy=True)
n_stored = len(holdings_s.data)
holdings_b.data = np.ones(n_stored)

In [110]:
# Per-column sum of the binary matrix as a flat 1-D array
col_sums = np.asarray(holdings_b.sum(0)).flatten()

1.5m of the 2m securities have either zero or one entry 

In [111]:
# generate mask: keep only columns whose column sum in the binary matrix exceeds 1000
mask = col_sums > 1000

In [112]:
# Slice the columns in CSC format, then convert the result back to CSR
holdings_s = holdings_s.tocsc()[:,mask].tocsr()
holdings_s

<35791x722 sparse matrix of type '<class 'numpy.float64'>'
	with 1901676 stored elements in Compressed Sparse Row format>

In [113]:
# Same column selection for the binary matrix
holdings_b = holdings_b.tocsc()[:,mask].tocsr()
holdings_b

<35791x722 sparse matrix of type '<class 'numpy.float64'>'
	with 1901676 stored elements in Compressed Sparse Row format>

### Save final cleaned and filtered data

#### Sparse matrix

In [114]:
# Save the filtered holdings matrix in npz format
path = '../data/processed/EDY/holdings_s'
sparse.save_npz(path, holdings_s)

In [115]:
# Save the binary version of the filtered holdings matrix
path = '../data/processed/EDY/holdings_b'
sparse.save_npz(path, holdings_b)

#### Sparse info

In [116]:
# Save the row info that belongs to the saved holdings matrices
path = '../data/processed/EDY/holdings_summary_s.feather'
feather.write_dataframe(holdings_summary_s,path)

#### Returns

In [117]:
# Save the filtered returns
path = '../data/processed/EDY/returns_s.feather'
feather.write_dataframe(returns_s,path)

## Take smaller sub_sub sample (everything between start_date and end_date)
Makes processing faster

In [118]:
# Date window of the sub-sample; both bounds are excluded by the strict comparisons used below
start_date = '2015-01-01'
end_date = '2018-01-01'

#### Holdings & holdings_summary

In [119]:
# Rows whose report date lies strictly between start_date and end_date
report_dates = holdings_summary_s['report_dt']
after_start = report_dates > start_date
before_end = report_dates < end_date
mask = after_start & before_end

In [120]:
# Apply the date mask to the rows of the sparse matrix
holdings_s_s = holdings_s[mask.values]

In [121]:
# Apply the same date mask to the summary rows
holdings_summary_s_s = holdings_summary_s[mask]

#### Test

In [122]:
# Shape of the date-filtered sparse matrix
holdings_s_s.shape

(18383, 722)

In [123]:
# Both objects were filtered with the same date mask, so their row counts must agree
assert holdings_summary_s_s.shape[0] == holdings_s_s.shape[0], 'holdings_summary_s_s and holdings_s_s are misaligned'
holdings_summary_s_s.shape

(18383, 10)

#### Returns

In [124]:
# Returns whose calendar date lies strictly between start_date and end_date
return_dates = returns_s['caldt']
mask = (return_dates > start_date) & (return_dates < end_date)
returns_s_s = returns_s.loc[mask]

### Save final cleaned and filtered data

#### Sparse matrix

In [125]:
# Save the date-filtered holdings matrix
path = '../data/processed/EDY/holdings_s_s'
sparse.save_npz(path, holdings_s_s)

#### Sparse info

In [126]:
# Save the row info that belongs to the date-filtered holdings matrix
path = '../data/processed/EDY/holdings_summary_s_s.feather'
feather.write_dataframe(holdings_summary_s_s,path)

#### Returns

In [127]:
# Save the date-filtered returns
path = '../data/processed/EDY/returns_s_s.feather'
feather.write_dataframe(returns_s_s,path)