## Description

Description:

Filter the three main data sources (Fund Info, Returns and Holdings)
based on some parameters and save the result.

Approach:
1. Match fund summary to each portfolio/date pair
2. Filter fund/date pairs based on those infos
3. Filter returns based on final sample of holdings

Parameters: 
- Obj_codes
- Percentage
- Flags

In [1]:
%matplotlib inline

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
from data.basic_functions import * 

import numpy as np
import pandas as pd

from scipy import sparse

import matplotlib.pyplot as plt

import datetime

# For multiprocessing
import multiprocessing
from itertools import product

### Load the data files

In [2]:
data_path = '../data/raw/'

returns = load_data(data_path,'monthly_returns')
summary = load_data(data_path,'total_summary_new') # Wo kommt das her ???
summary.shape

(167176, 8)

In [3]:
npz_path = '../data/interim/sparse_matrix_t.npz'
holdings = sparse.load_npz(npz_path)
holdings.shape

(733743, 2382968)

In [4]:
data_path = '../data/interim/'
holdings_summary = load_data(data_path,'sparse_info_t')
holdings_summary.shape

(733743, 2)

In [5]:
path = '../data/raw/portno_map.feather'
portno_map = feather.read_dataframe(path)

## Drop duplicates in summary data

To increase speed of matching obj_codes and other fund info to portfolios

In [6]:
print('Shape of summary before cleaning is: {:,} / {:,}'.format(summary.shape[0],summary.shape[1]))

Shape of summary before cleaning is: 167,176 / 8


In [7]:
# Delet not nas
summary_clean = summary[summary['crsp_portno'].notna()]

# Drop duplicates based on all but two columns
summary_clean = summary_clean.drop_duplicates(summary_clean.columns.difference(['crsp_fundno','fund_name']))

summary_clean.loc[:,'crsp_fundno'] = pd.to_numeric(summary_clean.loc[:,'crsp_fundno'], downcast='integer')
summary_clean.loc[:,'crsp_portno'] = pd.to_numeric(summary_clean.loc[:,'crsp_portno'], downcast='integer')

In [8]:
print('Shape of summary data after cleaning: {:,} / {:,}'.format(summary_clean.shape[0],summary_clean.shape[1]))

Shape of summary data after cleaning: 96,077 / 8


# Match obj code to portfolios

TODO: 

Fund info flags like the index_fund_flag could in theory also be different in the fund_history

Therefore a similar approach should also be used for those items

In general portno fundno map also beginning and end times

In [9]:
def port_ID_to_port_info(fund_info):  
            """
            Used to merge the right obj_code and other fund info 
            to each holdings_info row and therefore to each row of the sparse holdings matrix
            
            Input:
            - fund_info: Tuple consisting of the port_no (1st element) and the report_dt (2nd)
            
            Output:
            - Tuple of port_no, report_dt, index_fund_flag, et_flag and crsp_obj_cd. 
                NaN if value is not available
            
            Attention:
            Depends on global variable 'summary' to look up the values
            Must be renamed or changed in the function
            """
            port_no = fund_info[0]
            report_dt = fund_info[1].date() # Implement in large csv file
            mask = summary['crsp_portno'].values == port_no
            my_class = summary.loc[mask]

            my_class_n = my_class.loc[
                (my_class.begdt <= report_dt) & 
                (my_class.enddt >= report_dt)]
            
            try:
                crsp_obj_cd = my_class_n['crsp_obj_cd'].values[0]
                index_fund_flag = my_class_n['index_fund_flag'].values[0]
                et_flag = my_class_n['et_flag'].values[0]

            except:
                crsp_obj_cd = np.nan
                index_fund_flag = np.nan
                et_flag = np.nan
                
            return(port_no, report_dt, index_fund_flag, et_flag, crsp_obj_cd)        

### Multiprocessing

In [10]:
a = holdings_summary['port_no']
b = holdings_summary['date']

fund_info = list(zip(a,b))

In [11]:
len(fund_info)

733743

%%time
with multiprocessing.Pool(processes=6) as pool:
    results = pool.map(port_ID_to_port_info, fund_info)

In [82]:
labels = ['port_no','report_dt','index_fund_flag','et_flag','crsp_obj_cd']
holdings_summary = pd.DataFrame.from_records(results, columns=labels)

In [83]:
print('Out of the roughly 730k portfolios, for 129k there is no fund header info available')
print(holdings_summary.shape)
holdings_summary[holdings_summary['crsp_obj_cd'].isna()].shape # -> For some portfolios there is simly no row in fund_header

Out of the roughly 730k portfolios, for 129k there is no fund header info available
(733743, 5)


(129400, 5)

#### Save result to save time

path = '../data/interim/holdings_summary_raw.feather'
feather.write_dataframe(holdings_summary,path)

In [256]:
path = '../data/interim/holdings_summary_raw.feather'
holdings_summary = feather.read_dataframe(path)

# Clean holdings_summary data

Do not delet rows since they match to the sparse matrix

TODO: Replace all falgs with proper categories

In [257]:
# Creat new flag var 'mutual_fund' that is Y for Mutual Funds and N for other funds
# Based on variable et_flag and index_fund_flag

# Make the two flags categories and rename those categories accordingly
holdings_summary[['et_flag','index_fund_flag']] = holdings_summary[['et_flag','index_fund_flag']].astype('category')

et_mapper = {'F':'ETF', 'N':'ETN', np.nan:'MF'}
holdings_summary['et_flag'] = holdings_summary['et_flag'].map(et_mapper)

index_flag_mapper = {'B':'Index-based', 'D':'Pure Index', 'E':'Index enhanced', np.nan:'MF'}
holdings_summary['index_fund_flag'] = holdings_summary['index_fund_flag'].map(index_flag_mapper)

holdings_summary.loc[(holdings_summary['index_fund_flag'] == 'MF') & 
                     (holdings_summary['et_flag'] == 'MF'),'mutual_fund'] = 'Y'
holdings_summary.loc[holdings_summary['mutual_fund'].isna(),'mutual_fund'] = 'N'

holdings_summary.shape

(733743, 6)

# Choose which CRSP style codes to use

In [258]:
# Creat new flag var 'sample' that is Y for those included and N for those not included
# , 'EDCL', 'EDCM', 'EDCS', 'EDCI'
# selected_obj_codes = ('EDYG','EDYB' ,'EDYH' ,'EDYS' ,'EDYI')

selected_obj_codes = ('EDYG','EDYH', 'EDYI')

In [259]:
# If fund is Mutual Fund and has the right style code -> Sample == 'Y'
holdings_summary.loc[(holdings_summary['mutual_fund'] == 'Y') & 
                     (holdings_summary['crsp_obj_cd'].isin(selected_obj_codes)),'sample'] = 'Y'

# Other wise sample == 'N'
holdings_summary.loc[holdings_summary['sample'].isna(),'sample'] = 'N'

# Make the two new variables categorical
holdings_summary[['mutual_fund','sample']] = holdings_summary[['mutual_fund','sample']].astype('category')

### Results

In [260]:
print('How are the two variable on which MF assignemnt is based on distributed:')
pd.crosstab(holdings_summary['et_flag'],holdings_summary['index_fund_flag'])

How are the two variable on which MF assignemnt is based on distributed:


index_fund_flag,Index enhanced,Index-based,MF,Pure Index
et_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ETF,5772,10996,4065,76754
MF,5100,11586,599052,20418


In [261]:
print('How many funds are considered in the sample?')
print(holdings_summary['sample'].value_counts())

print('How are the selected styles distributed?')
pd.crosstab(holdings_summary['crsp_obj_cd'],holdings_summary['sample']).sort_values('Y')[-len(selected_obj_codes):]

How many funds are considered in the sample?
N    641465
Y     92278
Name: sample, dtype: int64
How are the selected styles distributed?


sample,N,Y
crsp_obj_cd,Unnamed: 1_level_1,Unnamed: 2_level_1
EDYH,7809,6941
EDYI,2232,9269
EDYG,16193,76068


# Save holdings_summary

In [262]:
path = '../data/interim/holdings_summary_total.feather'
feather.write_dataframe(holdings_summary,path)

In [263]:
path = '../data/interim/holdings_summary_total.feather'
holdings_summary = feather.read_dataframe(path)

### Add fund_no to holdings_summary

#### Fundo is not an integer for now but not that important -> TODO

In [264]:
portno_map_unique = portno_map.drop_duplicates(subset='crsp_portno')

#### Maybe must be modified since all but one associated fund_nos per portfolio are deleted 

In [265]:
holdings_summary = holdings_summary.merge(portno_map_unique[['crsp_portno','crsp_fundno']],how='left', left_on = 'port_no', right_on='crsp_portno')

In [266]:
holdings_summary.shape

(733743, 9)

In [267]:
mask = holdings_summary['crsp_fundno'].notna()
holdings_summary['crsp_fundno'] = holdings_summary.loc[mask,'crsp_fundno'].astype(int)

In [268]:
new_order = [0,8,1,2,3,4,5,6]
holdings_summary = holdings_summary[holdings_summary.columns[new_order]]

In [269]:
holdings_summary = holdings_summary.rename(columns={'crsp_fundno':'fund_no'}, index=str)

# Take sample according to parameter

### Filter returns

In [270]:
unique_portno = holdings_summary[['fund_no']].drop_duplicates()

In [271]:
mask = returns['crsp_fundno'].isin(unique_portno['fund_no'])

In [272]:
returns_s = returns[mask]

In [273]:
print('Shape of returns before filtering',
     returns.shape)

print('Shape of returns after filtering ',
     returns_s.shape)

Shape of returns before filtering (7273320, 3)
Shape of returns after filtering  (2173160, 3)


# Filter holdings

### Mask to filter out only those in the sample according to holdings_summary

In [274]:
mask = (holdings_summary['sample'] == 'Y') 
np.sum(mask)

92278

In [275]:
mask = (holdings_summary['sample'] == 'Y') & (holdings_summary['port_no'].notna())
np.sum(mask)

92278

In [276]:
holdings_s = holdings[mask.values]
holdings_s

<92278x2382968 sparse matrix of type '<class 'numpy.float64'>'
	with 10668909 stored elements in Compressed Sparse Row format>

### Filter holdings summary

In [277]:
holdings_summary_s = holdings_summary[mask]

In [278]:
holdings_s.shape

(92278, 2382968)

In [279]:
holdings_summary_s.shape

(92278, 8)

## Take only last n number of obs per fund

To avoid overweight of funds with many observations

In [280]:
last_n = 5

In [281]:
holdings_summary_s = holdings_summary_s.reset_index()
index = pd.DataFrame(np.arange(holdings_summary_s.shape[0]))
index = index.groupby(holdings_summary_s['port_no']).tail(last_n)
index = index.values.T.flatten()

In [282]:
holdings_summary_s = holdings_summary_s.reset_index().loc[index,:]
holdings_s = holdings_s[index]

In [283]:
print('Observations left after taking only the last {} observations per fund:'.format(last_n))
holdings_s.shape

Observations left after taking only the last 5 observations per fund:


(12724, 2382968)

### Delet columns (stocks) with little to no information
Delet all colums / stocks which occur no more than 'min_observations_per_stock' times

In [284]:
min_observations_per_stock = 10

In [285]:
holdings_b = sparse.csr_matrix(holdings_s, copy=True)
holdings_b.data = np.ones(len(holdings_s.data))

col_sums = pd.DataFrame(holdings_b.sum(0)).values.flatten()

In [286]:
print('Total number of securities:           {:>10,d}'.format(len(col_sums)))
print('Total number of securities with >1:   {:>10,d}'.format(sum(col_sums > 1)))
print('Total number of securities with >10:  {:>10,d}'.format(sum(col_sums > 10)))

Total number of securities:            2,382,968
Total number of securities with >1:       45,980
Total number of securities with >10:      10,241


1.5m of the 2m securities have either zero or one entry 

In [287]:
# generate mask
mask = col_sums > min_observations_per_stock

In [288]:
holdings_s = holdings_s.tocsc()
holdings_s = holdings_s[:,mask]
holdings_s = holdings_s.tocsr()
holdings_s

<12724x10241 sparse matrix of type '<class 'numpy.float64'>'
	with 1280643 stored elements in Compressed Sparse Row format>

In [289]:
holdings_b = holdings_b.tocsc()
holdings_b = holdings_b[:,mask]
holdings_b = holdings_b.tocsr()
holdings_b

<12724x10241 sparse matrix of type '<class 'numpy.float64'>'
	with 1280643 stored elements in Compressed Sparse Row format>

### Save final cleaned and filtered data

#### Sparse matrix

In [290]:
path = '../data/processed/EDY/holdings_s'
sparse.save_npz(path, holdings_s)

In [291]:
path = '../data/processed/EDY/holdings_b'
sparse.save_npz(path, holdings_b)

#### Sparse info

In [292]:
path = '../data/processed/EDY/holdings_summary_s.feather'
feather.write_dataframe(holdings_summary_s,path)

#### Returns

In [293]:
path = '../data/processed/EDY/returns_s.feather'
feather.write_dataframe(returns_s,path)

## Take smaller sub_sub sample (Everything before specified year)
Makes processing faster

In [294]:
start_date = '2015-01-01'
start_date = datetime.datetime.strptime(start_date, '%Y-%M-%d').date()
end_date = '2018-01-01'
end_date = datetime.datetime.strptime(end_date, '%Y-%M-%d').date()

#### Holdings & holdings_summary

In [295]:
mask = (holdings_summary_s['report_dt'] > start_date) & (holdings_summary_s['report_dt'] < end_date)

In [296]:
holdings_s_s = holdings_s[mask.values]
holdings_b_b = holdings_b[mask.values]

In [297]:
holdings_summary_s_s = holdings_summary_s[mask]

#### Test

In [298]:
holdings_s_s.shape

(4346, 10241)

In [299]:
holdings_summary_s_s.shape

(4346, 10)

#### Returns

In [300]:
mask = (returns_s['caldt'] > start_date) & (returns_s['caldt'] < end_date)
returns_s_s = returns_s[mask]

### Save final cleaned and filtered data

#### Sparse matrix

In [301]:
path = '../data/processed/EDY/holdings_s_s'
sparse.save_npz(path, holdings_s_s)

path = '../data/processed/EDY/holdings_b_b'
sparse.save_npz(path, holdings_b_b)

#### Sparse info

In [302]:
path = '../data/processed/EDY/holdings_summary_s_s.feather'
feather.write_dataframe(holdings_summary_s_s,path)

#### Returns

In [303]:
path = '../data/processed/EDY/returns_s_s.feather'
feather.write_dataframe(returns_s_s,path)