## Description

Description:

Filter the three main data sources (Fund Info, Returns and Holdings)
based on some parameters and save the result.

Approach:
1. Match fund summary to each portfolio/date pair
2. Filter fund/date pairs based on those infos
3. Filter returns based on final sample of holdings

Parameters: 
- Obj_codes
- Percentage
- Flags

In [1]:
%matplotlib inline

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
from data.basic_functions import * 

import numpy as np
import pandas as pd

from scipy import sparse

import matplotlib.pyplot as plt

import datetime

# For multiprocessing
import multiprocessing
from itertools import product

### Load the data files

In [237]:
data_path = '../data/raw/'

returns = load_data(data_path,'monthly_returns') 
summary = load_data(data_path,'total_summary_new') # Wo kommt das her ???
summary.shape

(167352, 6)

In [238]:
summary.sample()

Unnamed: 0,crsp_fundno,si_obj_cd,wbrger_obj_cd,policy,lipper_class,avrcs
43375,12900.0,,,,LCCE,93.429846


In [None]:
summary[]

In [3]:
npz_path = '../data/interim/sparse_matrix_t.npz'
holdings = sparse.load_npz(npz_path)
holdings.shape

(733743, 2382968)

In [4]:
data_path = '../data/interim/'
holdings_summary = load_data(data_path,'sparse_info_t')
holdings_summary.shape

(733743, 2)

In [5]:
path = '../data/raw/portno_map.feather'
portno_map = feather.read_dataframe(path)

## Filter Summary Table based on WRDS methodology

## Drop duplicates in summary data

To increase speed of matching obj_codes and other fund info to portfolios

In [14]:
print('Shape of summary before cleaning is: {:,} / {:,}'.format(summary.shape[0],summary.shape[1]))

Shape of summary before cleaning is: 132,023 / 8


In [15]:
# Delet not nas
summary_clean = summary[summary['crsp_portno'].notna()]

# Drop duplicates based on all but two columns
summary_clean = summary_clean.drop_duplicates(summary_clean.columns.difference(['crsp_fundno','fund_name']))

summary_clean.loc[:,'crsp_fundno'] = pd.to_numeric(summary_clean.loc[:,'crsp_fundno'], downcast='integer')
summary_clean.loc[:,'crsp_portno'] = pd.to_numeric(summary_clean.loc[:,'crsp_portno'], downcast='integer')

In [16]:
print('Shape of summary data after cleaning: {:,} / {:,}'.format(summary_clean.shape[0],summary_clean.shape[1]))

Shape of summary data after cleaning: 81,176 / 8


# Match obj code to portfolios

TODO: 

Fund info flags like the index_fund_flag could in theory also be different in the fund_history

Therefore a similar approach should also be used for those items

In general portno fundno map also beginning and end times

In [22]:
def port_ID_to_port_info(fund_info):  
            """
            Used to merge the right obj_code and other fund info 
            to each holdings_info row and therefore to each row of the sparse holdings matrix
            
            Input:
            - fund_info: Tuple consisting of the port_no (1st element) and the report_dt (2nd)
            
            Output:
            - Tuple of port_no, report_dt, index_fund_flag, et_flag and crsp_obj_cd. 
                NaN if value is not available
            
            Attention:
            Depends on global variable 'summary' to look up the values
            Must be renamed or changed in the function
            """
            port_no = fund_info[0]
            report_dt = fund_info[1].date() # Implement in large csv file
            mask = summary['crsp_portno'].values == port_no
            my_class = summary.loc[mask]

            my_class_n = my_class.loc[
                (my_class.begdt <= report_dt) & 
                (my_class.enddt >= report_dt)]
            
            try:
                lipper_class = my_class_n['lipper_class'].values[0]
                index_fund_flag = my_class_n['index_fund_flag'].values[0]
                et_flag = my_class_n['et_flag'].values[0]

            except:
                lipper_class = np.nan
                index_fund_flag = np.nan
                et_flag = np.nan
                
            return(port_no, report_dt, index_fund_flag, et_flag, lipper_class)        

### Multiprocessing

In [23]:
a = holdings_summary['port_no']
b = holdings_summary['date']

fund_info = list(zip(a,b))

In [24]:
len(fund_info)

733743

In [25]:
%%time
with multiprocessing.Pool(processes=6) as pool:
    results = pool.map(port_ID_to_port_info, fund_info)

CPU times: user 2.56 s, sys: 473 ms, total: 3.03 s
Wall time: 5min 19s


In [111]:
labels = ['port_no','report_dt','index_fund_flag','et_flag','lipper_class']
holdings_summary = pd.DataFrame.from_records(results, columns=labels)

In [112]:
print('Out of the roughly 730k portfolios, for 129k there is no fund header info available')
print(holdings_summary.shape)
holdings_summary[holdings_summary['lipper_class'].isna()].shape # -> For some portfolios there is simly no row in fund_header

Out of the roughly 730k portfolios, for 129k there is no fund header info available
(733743, 5)


(129400, 5)

#### Save result to save time

In [49]:
path = '../data/interim/holdings_summary_raw.feather'
feather.write_dataframe(holdings_summary,path)

In [183]:
path = '../data/interim/holdings_summary_raw.feather'
holdings_summary = feather.read_dataframe(path)

# Clean holdings_summary data

Do not delet rows since they match to the sparse matrix

TODO: Replace all falgs with proper categories

In [184]:
# Creat new flag var 'mutual_fund' that is Y for Mutual Funds and N for other funds
# Based on variable et_flag and index_fund_flag

# Make the two flags categories and rename those categories accordingly
holdings_summary[['et_flag','index_fund_flag']] = holdings_summary[['et_flag','index_fund_flag']].astype('category')

et_mapper = {'F':'ETF', 'N':'ETN', np.nan:'MF'}
holdings_summary['et_flag'] = holdings_summary['et_flag'].map(et_mapper)

index_flag_mapper = {'B':'Index-based', 'D':'Pure Index', 'E':'Index enhanced', np.nan:'MF'}
holdings_summary['index_fund_flag'] = holdings_summary['index_fund_flag'].map(index_flag_mapper)

holdings_summary.loc[(holdings_summary['index_fund_flag'] == 'MF') & 
                     (holdings_summary['et_flag'] == 'MF'),'mutual_fund'] = 'Y'
holdings_summary.loc[holdings_summary['mutual_fund'].isna(),'mutual_fund'] = 'N'

holdings_summary.shape

(733743, 6)

# Choose which CRSP style codes to use

In [185]:
# Creat new flag var 'sample' that is Y for those included and N for those not included
# , 'EDCL', 'EDCM', 'EDCS', 'EDCI'
# selected_obj_codes = ('EDYG','EDYB' ,'EDYH' ,'EDYS' ,'EDYI')

selected_obj_codes = (#'EIEI', 'G', 
                      'LCCE', 'LCGE', 'LCVE',   # Large-cap
                      'MCCE', 'MCGE', 'MCVE',   # Mid-cap
#                      'MLCE', 'MLGE', 'MLVE',   # Multi-cap
                      'SCCE', 'SCGE', 'SCVE')   # Small-cap

In [186]:
# If fund is Mutual Fund and has the right style code -> Sample == 'Y'
holdings_summary.loc[(holdings_summary['mutual_fund'] == 'Y') & 
                     (holdings_summary['lipper_class'].isin(selected_obj_codes)),'sample'] = 'Y'

# Other wise sample == 'N'
holdings_summary.loc[holdings_summary['sample'].isna(),'sample'] = 'N'

# Make the two new variables categorical
holdings_summary[['mutual_fund','sample']] = holdings_summary[['mutual_fund','sample']].astype('category')

In [187]:
holdings_summary['cap_class'] = holdings_summary['lipper_class'].astype(str).str[0]
holdings_summary['style_class'] = holdings_summary['lipper_class'].astype(str).str[2]

In [188]:
holdings_summary.sample(10)

Unnamed: 0,port_no,report_dt,index_fund_flag,et_flag,lipper_class,mutual_fund,sample,cap_class,style_class
619078,1030013,2010-07-31,MF,MF,IMD,Y,N,I,D
502966,1027564,2011-12-31,MF,MF,IMLC,Y,N,I,L
674115,1031739,2012-07-31,MF,MF,HY,Y,N,H,
81647,1003306,2004-04-30,MF,MF,IF,Y,N,I,
730666,1033931,2016-12-31,Pure Index,ETF,EU,N,N,E,
450838,1026536,2013-01-31,MF,MF,BBB,Y,N,B,B
187035,1021516,2014-12-31,Index enhanced,ETF,DL,N,N,D,
684754,1032041,2012-09-30,MF,MF,SCGE,Y,Y,S,G
148445,1019557,2010-05-31,MF,MF,,Y,N,N,n
401764,1025616,2011-12-31,MF,MF,MCCE,Y,Y,M,C


### Results

In [189]:
print('How are the two variable on which MF assignemnt is based on distributed:')
pd.crosstab(holdings_summary['et_flag'],holdings_summary['index_fund_flag'])

How are the two variable on which MF assignemnt is based on distributed:


index_fund_flag,Index enhanced,Index-based,MF,Pure Index
et_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ETF,5772,10516,4065,77234
MF,5143,10401,599415,21197


In [190]:
print('How many funds are considered in the sample?')
print(holdings_summary['sample'].value_counts())

print('How are the selected styles distributed?')
pd.crosstab(holdings_summary['lipper_class'],holdings_summary['sample']).sort_values('Y')[-len(selected_obj_codes):]

How many funds are considered in the sample?
N    620812
Y    112931
Name: sample, dtype: int64
How are the selected styles distributed?


sample,N,Y
lipper_class,Unnamed: 1_level_1,Unnamed: 2_level_1
MCVE,821,4468
SCVE,1912,6774
MCCE,3763,7870
MCGE,1177,10628
LCVE,1991,11489
SCGE,1385,13316
SCCE,5112,15875
LCGE,2877,19663
LCCE,4373,22848


In [240]:
holdings_summary.head(1)

Unnamed: 0,port_no,report_dt,index_fund_flag,et_flag,lipper_class,mutual_fund,sample,cap_class,style_class,crsp_portno,fund_no
0,1000001,2003-03-31,MF,MF,,Y,N,N,n,1000001.0,4273.0


In [250]:
test_summary = holdings_summary[holdings_summary['lipper_class'].isin(selected_obj_codes)]
test_summary['et_flag'].value_counts()

MF     123609
ETF     12733
Name: et_flag, dtype: int64

# Save holdings_summary

In [191]:
path = '../data/interim/holdings_summary_total.feather'
feather.write_dataframe(holdings_summary,path)

In [192]:
path = '../data/interim/holdings_summary_total.feather'
holdings_summary = feather.read_dataframe(path)

### Add fund_no to holdings_summary

#### Fundo is not an integer for now but not that important -> TODO

In [193]:
portno_map_unique = portno_map.drop_duplicates(subset='crsp_portno')

#### Maybe must be modified since all but one associated fund_nos per portfolio are deleted 

In [194]:
holdings_summary = holdings_summary.merge(portno_map_unique[['crsp_portno','crsp_fundno']],how='left', left_on = 'port_no', right_on='crsp_portno')

In [195]:
holdings_summary.shape

(733743, 11)

In [196]:
mask = holdings_summary['crsp_fundno'].notna()
holdings_summary['crsp_fundno'] = holdings_summary.loc[mask,'crsp_fundno'].astype(int)

In [197]:
holdings_summary.head()

Unnamed: 0,port_no,report_dt,index_fund_flag,et_flag,lipper_class,mutual_fund,sample,cap_class,style_class,crsp_portno,crsp_fundno
0,1000001,2003-03-31,MF,MF,,Y,N,N,n,1000001.0,4273.0
1,1000001,2003-06-30,MF,MF,,Y,N,N,n,1000001.0,4273.0
2,1000001,2003-09-30,MF,MF,,Y,N,N,n,1000001.0,4273.0
3,1000001,2004-02-29,MF,MF,,Y,N,N,n,1000001.0,4273.0
4,1000001,2004-06-30,MF,MF,,Y,N,N,n,1000001.0,4273.0


new_order = [0,8,1,2,3,4,5,6]
holdings_summary = holdings_summary[holdings_summary.columns[new_order]]

In [198]:
holdings_summary = holdings_summary.rename(columns={'crsp_fundno':'fund_no'}, index=str)

# Take sample according to parameter

### Filter returns

In [199]:
unique_portno = holdings_summary[['fund_no']].drop_duplicates()

In [200]:
mask = returns['crsp_fundno'].isin(unique_portno['fund_no'])

In [201]:
returns_s = returns[mask]

In [202]:
print('Shape of returns before filtering',
     returns.shape)

print('Shape of returns after filtering ',
     returns_s.shape)

Shape of returns before filtering (7273320, 3)
Shape of returns after filtering  (2173160, 3)


# Filter holdings

### Mask to filter out only those in the sample according to holdings_summary

In [203]:
mask = (holdings_summary['sample'] == 'Y') 
np.sum(mask)

112931

In [204]:
mask = (holdings_summary['sample'] == 'Y') & (holdings_summary['port_no'].notna())
np.sum(mask)

112931

In [205]:
holdings_s = holdings[mask.values]
holdings_s

<112931x2382968 sparse matrix of type '<class 'numpy.float64'>'
	with 15217553 stored elements in Compressed Sparse Row format>

### Filter holdings summary

In [206]:
holdings_summary_s = holdings_summary[mask]

In [207]:
holdings_s.shape

(112931, 2382968)

In [208]:
holdings_summary_s.shape

(112931, 11)

## Take only last n number of obs per fund

To avoid overweight of funds with many observations

In [209]:
last_n = 5

In [210]:
holdings_summary_s = holdings_summary_s.reset_index()
index = pd.DataFrame(np.arange(holdings_summary_s.shape[0]))
index = index.groupby(holdings_summary_s['port_no']).tail(last_n)
index = index.values.T.flatten()

In [211]:
holdings_summary_s = holdings_summary_s.reset_index().loc[index,:]
holdings_s = holdings_s[index]

In [212]:
print('Observations left after taking only the last {} observations per fund:'.format(last_n))
holdings_s.shape

Observations left after taking only the last 5 observations per fund:


(15391, 2382968)

### Delet columns (stocks) with little to no information
Delet all colums / stocks which occur no more than 'min_observations_per_stock' times

In [213]:
min_observations_per_stock = 10

In [214]:
holdings_b = sparse.csr_matrix(holdings_s, copy=True)
holdings_b.data = np.ones(len(holdings_s.data))

col_sums = pd.DataFrame(holdings_b.sum(0)).values.flatten()

In [215]:
print('Total number of securities:           {:>10,d}'.format(len(col_sums)))
print('Total number of securities with >1:   {:>10,d}'.format(sum(col_sums > 1)))
print('Total number of securities with >10:  {:>10,d}'.format(sum(col_sums > 10)))

Total number of securities:            2,382,968
Total number of securities with >1:       22,737
Total number of securities with >10:       8,995


1.5m of the 2m securities have either zero or one entry 

In [216]:
# generate mask
mask = col_sums > min_observations_per_stock

In [217]:
holdings_s = holdings_s.tocsc()
holdings_s = holdings_s[:,mask]
holdings_s = holdings_s.tocsr()
holdings_s

<15391x8995 sparse matrix of type '<class 'numpy.float64'>'
	with 1850674 stored elements in Compressed Sparse Row format>

In [218]:
holdings_b = holdings_b.tocsc()
holdings_b = holdings_b[:,mask]
holdings_b = holdings_b.tocsr()
holdings_b

<15391x8995 sparse matrix of type '<class 'numpy.float64'>'
	with 1850674 stored elements in Compressed Sparse Row format>

### Save final cleaned and filtered data

#### Sparse matrix

In [219]:
path = '../data/processed/EDY/holdings_s'
sparse.save_npz(path, holdings_s)

In [220]:
path = '../data/processed/EDY/holdings_b'
sparse.save_npz(path, holdings_b)

#### Sparse info

In [221]:
path = '../data/processed/EDY/holdings_summary_s.feather'
feather.write_dataframe(holdings_summary_s,path)

#### Returns

In [222]:
path = '../data/processed/EDY/returns_s.feather'
feather.write_dataframe(returns_s,path)

## Take smaller sub_sub sample (Everything before specified year)
Makes processing faster

In [223]:
start_date = '2015-01-01'
start_date = datetime.datetime.strptime(start_date, '%Y-%M-%d').date()
end_date = '2018-01-01'
end_date = datetime.datetime.strptime(end_date, '%Y-%M-%d').date()

#### Holdings & holdings_summary

In [224]:
mask = (holdings_summary_s['report_dt'] > start_date) & (holdings_summary_s['report_dt'] < end_date)

In [225]:
holdings_s_s = holdings_s[mask.values]
holdings_b_b = holdings_b[mask.values]

In [226]:
holdings_summary_s_s = holdings_summary_s[mask]

#### Test

In [227]:
holdings_s_s.shape

(5057, 8995)

In [228]:
holdings_summary_s_s.shape

(5057, 13)

#### Returns

In [229]:
mask = (returns_s['caldt'] > start_date) & (returns_s['caldt'] < end_date)
returns_s_s = returns_s[mask]

### Save final cleaned and filtered data

#### Sparse matrix

In [230]:
path = '../data/processed/EDY/holdings_s_s'
sparse.save_npz(path, holdings_s_s)

path = '../data/processed/EDY/holdings_b_b'
sparse.save_npz(path, holdings_b_b)

#### Sparse info

In [231]:
path = '../data/processed/EDY/holdings_summary_s_s.feather'
feather.write_dataframe(holdings_summary_s_s,path)

#### Returns

In [232]:
path = '../data/processed/EDY/returns_s_s.feather'
feather.write_dataframe(returns_s_s,path)