## Description

Description:

Filter the three main data sources (Fund Info, Returns and Holdings)
based on some parameters and save the result.

Approach:
1. Match fund summary to each portfolio/date pair
2. Filter fund/date pairs based on those infos
3. Filter returns based on final sample of holdings

Parameters: 
- Obj_codes
- Percentage
- Flags

In [2]:
%matplotlib inline

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
from data.basic_functions import * 

import numpy as np
import pandas as pd

from scipy import sparse

import matplotlib.pyplot as plt

import datetime

# For multiprocessing
import multiprocessing
from itertools import product

### Load the data files

In [3]:
data_path = '../data/raw/'

returns = load_data(data_path,'monthly_returns') 
summary = load_data(data_path,'total_summary_new') # Wo kommt das her ???
summary.shape

(167842, 9)

In [4]:
npz_path = '../data/interim/sparse_matrix_t.npz'
holdings = sparse.load_npz(npz_path)
holdings.shape

(733743, 2382968)

In [5]:
data_path = '../data/interim/'
holdings_summary = load_data(data_path,'sparse_info_t')
holdings_summary.shape

(733743, 2)

In [6]:
path = '../data/raw/portno_map.feather'
portno_map = feather.read_dataframe(path)

## Drop duplicates in summary data

To increase speed of matching obj_codes and other fund info to portfolios

In [7]:
print('Shape of summary before cleaning is: {:,} / {:,}'.format(summary.shape[0],summary.shape[1]))

Shape of summary before cleaning is: 167,842 / 9


In [9]:
# Delet observations witrh missing portnos
summary_clean = summary[summary['crsp_portno'].notna()]

# Drop duplicates based on all but two columns
# summary_clean = summary_clean.drop_duplicates(summary_clean.columns.difference(['crsp_fundno','fund_name']))

summary_clean.loc[:,'crsp_fundno'] = pd.to_numeric(summary_clean.loc[:,'crsp_fundno'], downcast='integer')
summary_clean.loc[:,'crsp_portno'] = pd.to_numeric(summary_clean.loc[:,'crsp_portno'], downcast='integer')

In [10]:
print('Shape of summary data after cleaning: {:,} / {:,}'.format(summary_clean.shape[0],summary_clean.shape[1]))

Shape of summary data after cleaning: 132,646 / 9


# Match obj code to portfolios

TODO: 

Fund info flags like the index_fund_flag could in theory also be different in the fund_history

Therefore a similar approach should also be used for those items

In general portno fundno map also beginning and end times

In [12]:
def port_ID_to_port_info(fund_info):  
            """
            Used to merge the right obj_code and other fund info 
            to each holdings_info row and therefore to each row of the sparse holdings matrix
            
            Input:
            - fund_info: Tuple consisting of the port_no (1st element) and the report_dt (2nd)
            
            Output:
            - Tuple of port_no, report_dt, index_fund_flag, et_flag and crsp_obj_cd. 
                NaN if value is not available
            
            Attention:
            Depends on global variable 'summary' to look up the values
            Must be renamed or changed in the function
            """
            port_no = fund_info[0]
            report_dt = fund_info[1].date() # Implement in large csv file
            mask = summary['crsp_portno'].values == port_no
            my_class = summary.loc[mask]

            my_class_n = my_class.loc[
                (my_class.begdt <= report_dt) & 
                (my_class.enddt >= report_dt)]
            
            try:
                lipper_class = my_class_n['lipper_class'].values[0]
                index_fund_flag = my_class_n['index_fund_flag'].values[0]
                et_flag = my_class_n['et_flag'].values[0]

            except:
                lipper_class = np.nan
                index_fund_flag = np.nan
                et_flag = np.nan
                
            return(port_no, report_dt, index_fund_flag, et_flag, lipper_class)        

### Multiprocessing

In [13]:
a = holdings_summary['port_no']
b = holdings_summary['date']

fund_info = list(zip(a,b))

In [14]:
len(fund_info)

733743

In [15]:
with multiprocessing.Pool(processes=6) as pool:
    results = pool.map(port_ID_to_port_info, fund_info)

In [16]:
labels = ['port_no','report_dt','index_fund_flag','et_flag','lipper_class']
holdings_summary = pd.DataFrame.from_records(results, columns=labels)

In [17]:
print('Out of the roughly 730k portfolios, for 129k there is no fund header info available')
print(holdings_summary.shape)
holdings_summary[holdings_summary['lipper_class'].isna()].shape # -> For some portfolios there is simly no row in fund_header

Out of the roughly 730k portfolios, for 129k there is no fund header info available
(733743, 5)


(129400, 5)

#### Save result to save time

In [18]:
path = '../data/interim/holdings_summary_raw.feather'
feather.write_dataframe(holdings_summary,path)

In [19]:
path = '../data/interim/holdings_summary_raw.feather'
holdings_summary = feather.read_dataframe(path)

# Clean holdings_summary data

Do not delet rows since they match to the sparse matrix

In [20]:
# Creat new flag var 'mutual_fund' that is Y for Mutual Funds and N for other funds
# Based on variable et_flag and index_fund_flag

# Make the two flags categories and rename those categories accordingly
holdings_summary[['et_flag','index_fund_flag']] = holdings_summary[['et_flag','index_fund_flag']].astype('category')

et_mapper = {'F':'ETF', 'N':'ETN', np.nan:'MF'}
holdings_summary['et_flag'] = holdings_summary['et_flag'].map(et_mapper)

index_flag_mapper = {'B':'Index-based', 'D':'Pure Index', 'E':'Index enhanced', np.nan:'MF'}
holdings_summary['index_fund_flag'] = holdings_summary['index_fund_flag'].map(index_flag_mapper)

holdings_summary.loc[(holdings_summary['index_fund_flag'] == 'MF') & 
                     (holdings_summary['et_flag'] == 'MF'),'mutual_fund'] = 'Y'
holdings_summary.loc[holdings_summary['mutual_fund'].isna(),'mutual_fund'] = 'N'

holdings_summary.shape

(733743, 6)

# Choose which CRSP style codes to use

In [21]:
# Creat new flag var 'sample' that is Y for those included and N for those not included
# , 'EDCL', 'EDCM', 'EDCS', 'EDCI'
# selected_obj_codes = ('EDYG','EDYB' ,'EDYH' ,'EDYS' ,'EDYI')

selected_obj_codes = ('EIEI', 'G', 
                      'LCCE', 'LCGE', 'LCVE',   # Large-cap
                      'MCCE', 'MCGE', 'MCVE',   # Mid-cap
                      'MLCE', 'MLGE', 'MLVE',   # Multi-cap
                      'SCCE', 'SCGE', 'SCVE')   # Small-cap

In [22]:
# If fund is Mutual Fund and has the right style code -> Sample == 'Y'
holdings_summary.loc[(holdings_summary['mutual_fund'] == 'Y') & 
                     (holdings_summary['lipper_class'].isin(selected_obj_codes)),'sample'] = 'Y'

# Otherwise sample == 'N'
holdings_summary.loc[holdings_summary['sample'].isna(),'sample'] = 'N'

# Make the two new variables categorical
holdings_summary[['mutual_fund','sample']] = holdings_summary[['mutual_fund','sample']].astype('category')

In [23]:
holdings_summary['cap_class'] = holdings_summary['lipper_class'].astype(str).str[0]
holdings_summary['style_class'] = holdings_summary['lipper_class'].astype(str).str[2]

### Results

In [25]:
print('How are the two variables on which MF assignemnt is based on distributed:')
pd.crosstab(holdings_summary['et_flag'],holdings_summary['index_fund_flag'])

How are the two variables on which MF assignemnt is based on distributed:


index_fund_flag,Index enhanced,Index-based,MF,Pure Index
et_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ETF,5772,10516,4065,77234
MF,5143,10401,599415,21197


In [26]:
print('How many funds are considered in the sample?')
print(holdings_summary['sample'].value_counts())

print('How are the selected styles distributed?')
pd.crosstab(holdings_summary['lipper_class'],holdings_summary['sample']).sort_values('Y')[-len(selected_obj_codes):]

How many funds are considered in the sample?
N    573773
Y    159970
Name: sample, dtype: int64
How are the selected styles distributed?


sample,N,Y
lipper_class,Unnamed: 1_level_1,Unnamed: 2_level_1
FX,8577,0
MCVE,821,4468
SCVE,1912,6774
MLVE,1524,7312
MCCE,3763,7870
EIEI,2044,8609
MCGE,1177,10628
LCVE,1991,11489
MLGE,2245,12603
SCGE,1385,13316


# Save holdings_summary

In [29]:
path = '../data/interim/holdings_summary_total.feather'
feather.write_dataframe(holdings_summary,path)

In [30]:
path = '../data/interim/holdings_summary_total.feather'
holdings_summary = feather.read_dataframe(path)

### Add fund_no to holdings_summary

#### Fundo is not an integer for now but not that important -> TODO

In [31]:
portno_map_unique = portno_map.drop_duplicates(subset='crsp_portno')

#### Maybe must be modified since all but one associated fund_nos per portfolio are deleted 

In [32]:
holdings_summary = holdings_summary.merge(portno_map_unique[['crsp_portno','crsp_fundno']],how='left', left_on = 'port_no', right_on='crsp_portno')

In [33]:
holdings_summary.shape

(733743, 11)

In [34]:
mask = holdings_summary['crsp_fundno'].notna()
holdings_summary['crsp_fundno'] = holdings_summary.loc[mask,'crsp_fundno'].astype(int)

In [35]:
holdings_summary.head()

Unnamed: 0,port_no,report_dt,index_fund_flag,et_flag,lipper_class,mutual_fund,sample,cap_class,style_class,crsp_portno,crsp_fundno
0,1000001,2003-03-31,MF,MF,,Y,N,N,n,1000001.0,4273.0
1,1000001,2003-06-30,MF,MF,,Y,N,N,n,1000001.0,4273.0
2,1000001,2003-09-30,MF,MF,,Y,N,N,n,1000001.0,4273.0
3,1000001,2004-02-29,MF,MF,,Y,N,N,n,1000001.0,4273.0
4,1000001,2004-06-30,MF,MF,,Y,N,N,n,1000001.0,4273.0


In [36]:
holdings_summary = holdings_summary.rename(columns={'crsp_fundno':'fund_no'}, index=str)

# Take sample according to parameter

### Filter returns

In [37]:
unique_portno = holdings_summary[['fund_no']].drop_duplicates()

In [38]:
mask = returns['crsp_fundno'].isin(unique_portno['fund_no'])

In [39]:
returns_s = returns[mask]

In [40]:
print('Shape of returns before filtering',
     returns.shape)

print('Shape of returns after filtering ',
     returns_s.shape)

Shape of returns before filtering (7273320, 3)
Shape of returns after filtering  (2173160, 3)


# Filter holdings

### Mask to filter out only those in the sample according to holdings_summary

In [41]:
mask = (holdings_summary['sample'] == 'Y') 
np.sum(mask)

159970

In [43]:
holdings_s = holdings[mask.values]
holdings_s

<159970x2382968 sparse matrix of type '<class 'numpy.float64'>'
	with 20855806 stored elements in Compressed Sparse Row format>

### Filter holdings summary

In [44]:
holdings_summary_s = holdings_summary[mask]

In [45]:
holdings_s.shape

(159970, 2382968)

In [78]:
print('Number of stocks with more than 1 observation')
np.sum(pd.DataFrame(holdings_s.sum(0).T) > 1)

Number of stocks with more than 1 observation


0    38668
dtype: int64

In [84]:
holdings_s.nonzero()
holdings_s[0,600]

0.7225008010864258

### Take only last n number of obs per fund (Not needed at the moment)

To avoid overweight of funds with many observations

In [None]:
last_n = 5

In [None]:
holdings_summary_s = holdings_summary_s.reset_index()
index = pd.DataFrame(np.arange(holdings_summary_s.shape[0]))
index = index.groupby(holdings_summary_s['port_no']).tail(last_n)
index = index.values.T.flatten()

In [None]:
holdings_summary_s = holdings_summary_s.reset_index().loc[index,:]
holdings_s = holdings_s[index]

In [None]:
print('Observations left after taking only the last {} observations per fund:'.format(last_n))
holdings_s.shape

### Delet columns (stocks) with little to no information
Delet all colums / stocks which occur no more than 'min_observations_per_stock' times

In [107]:
min_observations_per_stock = 10

In [108]:
holdings_b = sparse.csr_matrix(holdings_s, copy=True)
holdings_b.data = np.ones(len(holdings_s.data))

col_sums = pd.DataFrame(holdings_b.sum(0)).values.flatten()

In [109]:
print('Total number of securities:           {:>10,d}'.format(len(col_sums)))
print('Total number of securities with >1:   {:>10,d}'.format(sum(col_sums > 1)))
print('Total number of securities with >10:  {:>10,d}'.format(sum(col_sums > 10)))

Total number of securities:               28,210
Total number of securities with >1:       28,210
Total number of securities with >10:      28,210


1.5m of the 2m securities have either zero or one entry 

In [None]:
# generate mask
mask = col_sums > min_observations_per_stock

In [None]:
holdings_s = holdings_s.tocsc()
holdings_s = holdings_s[:,mask]
holdings_s = holdings_s.tocsr()
holdings_s

In [None]:
holdings_b = holdings_b.tocsc()
holdings_b = holdings_b[:,mask]
holdings_b = holdings_b.tocsr()
holdings_b

### Save final cleaned and filtered data

#### Sparse matrix

In [111]:
path = '../data/processed/holdings_s'
sparse.save_npz(path, holdings_s)

In [112]:
path = '../data/processed/holdings_b'
sparse.save_npz(path, holdings_b)

#### Sparse info

In [None]:
holdings_summary_s = holdings_summary_s.drop(columns=['index_fund_flag','et_flag','mutual_fund','sample','crsp_portno'])

In [136]:
holdings_summary_s['report_dt'] = pd.to_datetime(holdings_summary_s['report_dt'], format='%Y-%m-%d')

In [137]:
path = '../data/processed/holdings_summary_s.feather'
feather.write_dataframe(holdings_summary_s,path)

#### Returns

In [120]:
# Convert to date format and filter based on date
begin_date = '2003-01-01' 


date =  pd.to_datetime(returns['caldt'], format='%Y-%m-%d')
returns_s = returns_s[date > begin_date]

  


Unnamed: 0,crsp_fundno,caldt,mret
8666,105.0,2003-01-31,-0.027933
8667,105.0,2003-02-28,-0.022989
8668,105.0,2003-03-31,0.001471
8669,105.0,2003-04-30,0.107195
8670,105.0,2003-05-30,0.083554
8671,105.0,2003-06-30,0.036720
8672,105.0,2003-07-31,0.038961
8673,105.0,2003-08-29,0.035227
8674,105.0,2003-09-30,-0.005488
8675,105.0,2003-10-31,0.073951


In [121]:
path = '../data/processed/returns_s.feather'
feather.write_dataframe(returns_s,path)

## Take smaller sub_sub sample (Everything before specified year)
Makes processing faster

In [122]:
start_date = '2015-01-01'
start_date = datetime.datetime.strptime(start_date, '%Y-%M-%d').date()
end_date = '2018-01-01'
end_date = datetime.datetime.strptime(end_date, '%Y-%M-%d').date()

#### Holdings & holdings_summary

In [123]:
mask = (holdings_summary_s['report_dt'] > start_date) & (holdings_summary_s['report_dt'] < end_date)

In [124]:
holdings_s_s = holdings_s[mask.values]
holdings_b_b = holdings_b[mask.values]

In [125]:
holdings_summary_s_s = holdings_summary_s[mask]

#### Test

In [126]:
holdings_s_s.shape

(51348, 28210)

In [127]:
holdings_summary_s_s.shape

(51348, 6)

#### Returns

In [128]:
mask = (returns_s['caldt'] > start_date) & (returns_s['caldt'] < end_date)
returns_s_s = returns_s[mask]

### Save final cleaned and filtered data

#### Sparse matrix

In [132]:
path = '../data/processed/holdings_s_s'
sparse.save_npz(path, holdings_s_s)

path = '../data/processed/holdings_b_b'
sparse.save_npz(path, holdings_b_b)

#### Sparse info

In [133]:
path = '../data/processed/holdings_summary_s_s.feather'
feather.write_dataframe(holdings_summary_s_s,path)

#### Returns

In [134]:
path = '../data/processed/returns_s_s.feather'
feather.write_dataframe(returns_s_s,path)