# Clean Data

### Description:

Filter the three main data sources (Fund Info, Returns and Holdings)
based on some parameters and save the result.

Approach:
1. Match fund summary to each portfolio/date pair
2. Filter fund/date pairs based on that information
3. Filter returns based on final sample of holdings
4. Save different versions of the files with different timeframes

Parameters: 
- Lipper_class
- Flags

## Setup
### All options for filtering the data

In [37]:
%matplotlib inline

import os
import sys

import feather
import pandasql as ps
import numpy as np
import pandas as pd
from scipy import sparse
import datetime

import matplotlib.pyplot as plt

In [38]:
# Central place for every filtering parameter used in this notebook.
options = dict(
    # Date window of the small sub-sample taken at the end of the notebook.
    sub_sample_start_date=datetime.date(2015, 1, 1),
    sub_sample_end_date=datetime.date(2018, 1, 1),
    # Lower bounds applied to the aggregated fund summary
    # (presumably percent held in common stock / total net assets -- TODO confirm units).
    min_per_com=80,
    min_tna_latest=10,
    # Securities must appear more often than this to be kept as a column.
    min_observations_per_stock=50,
    # Lipper classification codes that define the fund sample.
    selected_obj_codes=(
        'EIEI', 'G',
        'LCCE', 'LCGE', 'LCVE',   # Large-cap
        'MCCE', 'MCGE', 'MCVE',   # Mid-cap
        'MLCE', 'MLGE', 'MLVE',   # Multi-cap
        'SCCE', 'SCGE', 'SCVE',   # Small-cap
    ),
)




### Load the data files

In [39]:
# --- Holdings ---------------------------------------------------------------
# Sparse holdings matrix written by the preprocessing step (see notebook 011).
npz_path = '../data/interim/sparse_matrix.npz'
holdings = sparse.load_npz(npz_path)

# Companion tables stored next to the matrix
# (presumably one describes its rows and the other its columns -- TODO confirm).
row_info, col_info = (
    feather.read_dataframe(interim_file)
    for interim_file in ('../data/interim/row_info.feather',
                         '../data/interim/col_info.feather')
)

# --- Fund-level tables --------------------------------------------------------
# Downloaded from WRDS.
fund_info = feather.read_dataframe('../data/raw/total_summary.feather')

# Downloaded from WRDS.
# NOTE(review): the file name is spelled 'fund_summay' -- confirm it matches the file on disk.
fund_summary = feather.read_dataframe('../data/raw/fund_summay.feather')

portno_map = feather.read_dataframe('../data/raw/portno_map.feather')
fund_style = feather.read_dataframe('../data/raw/fund_style.feather')

# --- Returns ------------------------------------------------------------------
# Monthly returns, downloaded from WRDS.
path = '../data/raw/monthly_returns.feather'
returns = feather.read_dataframe(path)

## Fix all data types

#### Fund info

In [40]:
# Display the dtype of every fund_info column as loaded, before any conversion.
fund_info.dtypes

crsp_fundno        float64
crsp_portno        float64
fund_name           object
first_offer_dt      object
index_fund_flag     object
et_flag             object
begdt               object
enddt               object
lipper_class        object
avrcs              float64
dtype: object

In [41]:
# Remove the 'lipper_class' and 'avrcs' columns from fund_info in place.
fund_info.drop(['lipper_class', 'avrcs'], axis='columns', inplace=True)

In [42]:
# Both identifiers must be present before they can be stored as integers,
# so rows missing either one are removed first.
fund_info.dropna(subset=['crsp_portno', 'crsp_fundno'], inplace=True)
for id_column in ('crsp_fundno', 'crsp_portno'):
    fund_info[id_column] = fund_info[id_column].astype(np.int64)

# Parse the three date columns into datetime values.
columns = ['first_offer_dt', 'begdt', 'enddt']
for date_column in columns:
    fund_info[date_column] = pd.to_datetime(fund_info[date_column])

In [43]:
# Rows with a missing ETF/ETN flag or a missing index-fund flag receive the
# placeholder code 'MF'.
for flag_column in ('et_flag', 'index_fund_flag'):
    fund_info[flag_column] = fund_info[flag_column].fillna('MF')

In [44]:
# Store both flag columns as pandas categoricals.
columns = ['et_flag', 'index_fund_flag']
for flag_column in columns:
    fund_info[flag_column] = fund_info[flag_column].astype('category')

# Idea kept from an earlier draft (not applied): relabel style 'EIEI' as
# 'LCVE' and style 'G' as 'LCGE'.

In [45]:
# Replace the raw ETF/ETN flag codes by readable labels (category rename).
et_mapper = dict(F='ETF', N='ETN', MF='MF')
fund_info['et_flag'] = fund_info['et_flag'].cat.rename_categories(et_mapper)

# Replace the raw index-fund flag codes by readable labels (value mapping).
index_flag_mapper = dict(B='Index-based', D='Pure Index', E='Index enhanced', MF='MF')
fund_info['index_fund_flag'] = fund_info['index_fund_flag'].map(index_flag_mapper)

#### Fund summary

In [46]:
# Fund identifier as integer.
fund_summary['crsp_fundno'] = fund_summary['crsp_fundno'].astype(np.int64)

# Numeric summary fields as float64.
columns = ['nav_latest', 'tna_latest', 'per_com']
for numeric_column in columns:
    fund_summary[numeric_column] = fund_summary[numeric_column].astype(np.float64)

# Date fields as datetime values.
columns = ['caldt', 'nav_latest_dt', 'tna_latest_dt']
for date_column in columns:
    fund_summary[date_column] = pd.to_datetime(fund_summary[date_column])

#### Fund style

In [47]:
# Fund identifier as integer.
fund_style['crsp_fundno'] = fund_style['crsp_fundno'].astype('int64')

# Validity interval of each style record as datetime values.
columns = ['begdt','enddt']
fund_style[columns] = fund_style[columns].apply(pd.to_datetime)

# Derive two one-letter classes from the Lipper class code:
# first character -> cap_class, third character -> style_class.
# NOTE(review): 'EIEI' and 'G' are among the selected codes; for them the
# first/third character is 'E'/'E' and 'G'/missing -- confirm that is intended.
fund_style['cap_class'] = fund_style['lipper_class'].astype(str).str[0]
fund_style['style_class'] = fund_style['lipper_class'].astype(str).str[2]

# Codes outside the selected set get no derived classes.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
fund_style.loc[
    ~fund_style['lipper_class'].isin(options.get('selected_obj_codes')),
    ['cap_class','style_class']] = np.nan

# Store all classification columns as pandas categoricals.
columns = ['lipper_class','lipper_obj_cd','cap_class','style_class']
fund_style[columns] = fund_style[columns].astype('category')

#### Return

In [48]:
# Parse the calendar date of each return observation into datetime values.
returns['caldt'] =  pd.to_datetime(returns['caldt'])

#### Portnomap

In [49]:
# Fund and portfolio identifiers as integers.
columns = ['crsp_fundno', 'crsp_portno']
for id_column in columns:
    portno_map[id_column] = portno_map[id_column].astype(np.int64)

# Validity interval of each fund -> portfolio link as datetime values.
columns = ['begdt', 'enddt']
for date_column in columns:
    portno_map[date_column] = pd.to_datetime(portno_map[date_column])

### Filter based on fund summary

In [73]:
# Show one random row of fund_summary as a sanity check.
fund_summary.sample()

Unnamed: 0,crsp_fundno,caldt,summary_period,nav_latest,nav_latest_dt,tna_latest,tna_latest_dt,per_com
1325789,41762,2010-09-30,Q,14.44,2010-09-30,0.2,2010-09-30,52.58


In [122]:
# Attach to every fund summary row the portfolio its fund was linked to on
# the summary date (range join on the begdt/enddt of the link).
sqlcode = '''
    SELECT fund_summary.caldt, fund_summary.crsp_fundno, fund_summary.tna_latest, fund_summary.per_com,
    portno_map.begdt, portno_map.enddt, portno_map.crsp_portno, portno_map.crsp_fundno
    FROM fund_summary
    INNER JOIN portno_map 
    ON fund_summary.crsp_fundno = portno_map.crsp_fundno
    AND caldt BETWEEN portno_map.begdt AND portno_map.enddt;
'''

fund_summary_merged = ps.sqldf(sqlcode, locals())

# The query result is parsed back into datetime values.
columns = ['caldt', 'begdt', 'enddt']
for date_column in columns:
    fund_summary_merged[date_column] = pd.to_datetime(fund_summary_merged[date_column])

# Collapse all funds of a portfolio on a given date into one row:
# average per_com, summed tna_latest.
per_portfolio_date = fund_summary_merged.groupby(['caldt', 'crsp_portno'])
fund_summary_merged = per_portfolio_date.agg({'per_com': 'mean', 'tna_latest': 'sum'})

# Keep only portfolio/date pairs strictly above both thresholds.
passes_thresholds = (
    (fund_summary_merged['per_com'] > options.get('min_per_com'))
    & (fund_summary_merged['tna_latest'] > options.get('min_tna_latest'))
)
fund_summary_merged = fund_summary_merged[passes_thresholds]

### Filter based on fund info

In [16]:
# Keep only rows whose index-fund flag and ETF/ETN flag both carry the 'MF'
# label (the placeholder given earlier to rows with a missing flag).
is_plain_fund = (fund_info['index_fund_flag'] == 'MF') & (fund_info['et_flag'] == 'MF')
fund_info = fund_info[is_plain_fund]

### Filter based on fund style

In [127]:
# Keep only the style records whose Lipper class is one of the selected codes.
selected_codes = options['selected_obj_codes']
fund_style = fund_style[fund_style['lipper_class'].isin(selected_codes)]

### Filter based on holdings

In [18]:
# TODO

## Merge

### Merge Fund style and fund hdr

In [131]:
# Show one random row of the filtered fund_info as a sanity check.
fund_info.sample()

Unnamed: 0,crsp_fundno,crsp_portno,fund_name,first_offer_dt,index_fund_flag,et_flag,begdt,enddt
134046,48084,1029231,American Funds Insurance Series: International...,2008-11-18,MF,MF,2016-10-14,2019-03-31


In [None]:
# Range join of fund_summary with portno_map on the fund number and on
# caldt falling inside the link's begdt/enddt interval.
# NOTE(review): this is a verbatim copy of the query in the
# "Filter based on fund summary" section. Running it rebinds
# fund_summary_merged to the raw join result, discarding the aggregation and
# threshold filter applied there. Given the section title, it looks like a
# placeholder for a fund_style / fund_info merge -- confirm and replace.
sqlcode = '''
    SELECT fund_summary.caldt, fund_summary.crsp_fundno, fund_summary.tna_latest, fund_summary.per_com,
    portno_map.begdt, portno_map.enddt, portno_map.crsp_portno, portno_map.crsp_fundno
    FROM fund_summary
    INNER JOIN portno_map 
    ON fund_summary.crsp_fundno = portno_map.crsp_fundno
    AND caldt BETWEEN portno_map.begdt AND portno_map.enddt;
'''

fund_summary_merged = ps.sqldf(sqlcode,locals())

In [19]:
# Show one random fund -> portfolio link as a sanity check.
portno_map.sample()

Unnamed: 0,crsp_fundno,crsp_portno,begdt,enddt
75199,94566,1050877,2018-12-01,2018-12-31


In [20]:
# Show one random row of row_info as a sanity check.
row_info.sample()

Unnamed: 0,crsp_portno,report_dt,port_id
44562,1001537,2006-06-30,1001537016982


In [132]:
# Attach to every row_info entry the fund(s) linked to its portfolio on the
# report date: join on crsp_portno with report_dt inside the link's
# begdt/enddt interval.
# Note that crsp_portno is selected from both tables, so the result carries
# that column name twice.
sqlcode = '''
select row_info.crsp_portno, row_info.report_dt, 
portno_map.begdt, portno_map.enddt, portno_map.crsp_portno, portno_map.crsp_fundno
from row_info 
inner join portno_map 
on row_info.crsp_portno = portno_map.crsp_portno
and row_info.report_dt between portno_map.begdt and portno_map.enddt;
'''

# locals() lets pandasql look up the DataFrames named in the query.
row_info_merged = ps.sqldf(sqlcode,locals())

In [133]:
# The query result is parsed back into datetime values.
columns = ['report_dt', 'begdt', 'enddt']
for date_column in columns:
    row_info_merged[date_column] = pd.to_datetime(row_info_merged[date_column])

In [134]:
# Show 50 random rows of the joined table as a sanity check.
row_info_merged.sample(50)

Unnamed: 0,crsp_portno,report_dt,begdt,enddt,crsp_portno.1,crsp_fundno
1817655,1029847,2011-12-31,2010-06-30,2018-12-31,1029847,47593
1924142,1031156,2013-06-30,2011-06-01,2018-12-31,1031156,52696
1013836,1024358,2013-08-31,2010-06-30,2018-12-31,1024358,17318
1769317,1029400,2011-08-31,2010-06-30,2018-12-31,1029400,46590
866783,1023358,2016-02-29,2010-06-30,2018-12-31,1023358,3990
955156,1023967,2011-08-31,2010-06-30,2018-12-31,1023967,13406
1291158,1026068,2013-03-31,2010-06-30,2018-12-31,1026068,3946
1116370,1024890,2015-10-31,2010-06-30,2018-12-31,1024890,40465
2039983,1032813,2015-06-30,2013-04-01,2018-12-31,1032813,57932
1753495,1029251,2014-11-30,2010-06-30,2018-12-31,1029251,43094


# Save holdings_summary

In [None]:
# Write holdings_summary to the interim folder.
# NOTE(review): holdings_summary is not created by any earlier cell visible
# in this notebook -- presumably left over from a previous version; the next
# cell reloads it from this same path.
path = '../data/interim/holdings_summary_total.feather'
feather.write_dataframe(holdings_summary,path)

In [None]:
# Load holdings_summary from the interim folder.
path = '../data/interim/holdings_summary_total.feather'
holdings_summary = feather.read_dataframe(path)

### Add fund_no to holdings_summary

#### fund_no is not an integer for now, but that is not critical -> TODO

In [None]:
# Reduce portno_map to a single row per portfolio number, so that the merge
# below cannot multiply rows of holdings_summary.
portno_map_unique = portno_map.drop_duplicates(subset='crsp_portno')

#### May need to be modified, since all but one of the fund_nos associated with each portfolio are dropped

In [None]:
# Look up one fund number per portfolio; a left merge keeps every
# holdings_summary row even when no fund number is found.
fund_lookup = portno_map_unique[['crsp_portno', 'crsp_fundno']]
holdings_summary = holdings_summary.merge(
    fund_lookup,
    how='left',
    left_on='port_no',
    right_on='crsp_portno',
)

In [None]:
# Check the shape after the merge (the row count should be unchanged).
holdings_summary.shape

In [None]:
# Store the fund number as an integer while keeping rows without a match.
# Casting only the non-missing rows and assigning them back re-aligns on the
# index, re-introduces NaN for the unmatched rows, and so leaves the column
# as float. The nullable 'Int64' dtype holds integers and missing values
# side by side.
holdings_summary['crsp_fundno'] = holdings_summary['crsp_fundno'].astype('Int64')

In [None]:
# Show the first row to check the newly added fund number column.
holdings_summary.head(1)

In [None]:
# Rename the fund number column to 'fund_no'.
# NOTE(review): index=str additionally converts every index label to a
# string -- confirm that this is intended and not a copy-paste leftover.
holdings_summary = holdings_summary.rename(columns={'crsp_fundno':'fund_no'}, index=str)

# Take sample according to parameter

### Filter returns

In [None]:
# Add a 'date' column holding caldt parsed as datetime values
# (no filtering happens in this cell).
returns['date'] =  pd.to_datetime(returns['caldt'], format='%Y-%m-%d')

In [None]:
# Distinct fund numbers that occur in holdings_summary
# (despite the variable name, these are fund numbers, not portfolio numbers).
unique_portno = holdings_summary[['fund_no']].drop_duplicates()

In [None]:
# True for every return row whose fund number occurs in holdings_summary.
mask = returns['crsp_fundno'].isin(unique_portno['fund_no'])

In [None]:
# Returns restricted to funds that occur in holdings_summary.
returns_s = returns[mask]

In [None]:
# Report how many return rows survive the fund filter.
print('Shape of returns before filtering',
     returns.shape)

print('Shape of returns after filtering ',
     returns_s.shape)

# Filter holdings

### Mask to filter out only those in the sample according to holdings_summary

In [None]:
# True for every holdings_summary row flagged 'Y' in its 'sample' column;
# the sum shows how many rows are selected.
mask = (holdings_summary['sample'] == 'Y') 
np.sum(mask)

In [None]:
# Select the matching rows of the sparse holdings matrix by position
# (assumes holdings rows are in the same order as holdings_summary -- TODO confirm).
holdings_s = holdings[mask.values]
holdings_s

In [None]:
# Apply the same row selection to the summary table.
holdings_summary_s = holdings_summary[mask]

### Filter holdings summary

### Take only last n number of obs per fund (Not needed at the moment)

To avoid overweighting funds with many observations

last_n = 5

holdings_summary_s = holdings_summary_s.reset_index()
index = pd.DataFrame(np.arange(holdings_summary_s.shape[0]))
index = index.groupby(holdings_summary_s['port_no']).tail(last_n)
index = index.values.T.flatten()

holdings_summary_s = holdings_summary_s.reset_index().loc[index,:]
holdings_s = holdings_s[index]

print('Observations left after taking only the last {} observations per fund:'.format(last_n))
holdings_s.shape

### Delete columns (stocks) with little to no information
Delete all columns / stocks which occur no more than 'min_observations_per_stock' times

In [None]:
# Minimum number of occurrences a security needs in order to be kept.
# Read from the central options dict so the threshold is defined in one place only.
min_observations_per_stock = options['min_observations_per_stock']

In [None]:
# Binary copy of the sample holdings: same sparsity pattern, but every stored
# entry is replaced by 1.
holdings_b = sparse.csr_matrix(holdings_s, copy=True)
holdings_b.data = np.ones(len(holdings_s.data))

# Number of stored entries per column, as a flat 1-D array.
col_sums = np.asarray(holdings_b.sum(axis=0)).ravel()

In [None]:
# Report how many securities exceed a few occurrence thresholds.
n_securities = len(col_sums)
n_over_1 = int((col_sums > 1).sum())
n_over_10 = int((col_sums > 10).sum())
n_over_50 = int((col_sums > 50).sum())

print('Total number of securities:               {:>10,d}'.format(n_securities))
print('Total number of securities with >1:       {:>10,d}'.format(n_over_1))
print('Total number of securities with >10:      {:>10,d}'.format(n_over_10))
print('Total number of securities with >50:      {:>10,d}'.format(n_over_50))

In [None]:
# Boolean mask over columns: True for securities that occur strictly more
# often than min_observations_per_stock; the others are deleted below.
mask = col_sums > min_observations_per_stock

In [None]:
# Drop the rare securities: convert to CSC for the column selection, then
# back to CSR.
holdings_s = holdings_s.tocsc()[:, mask].tocsr()
holdings_s

In [None]:
# Apply the same column selection to the binary matrix.
holdings_b = holdings_b.tocsc()[:, mask].tocsr()
holdings_b

In [None]:
# Keep only the stock_map rows of the securities retained by `mask`
# (mask is one-dimensional, so .T leaves it unchanged).
# NOTE(review): stock_map is not defined in any cell visible in this
# notebook (the column table loaded above is named col_info) -- confirm
# which object is meant.
stock_map = stock_map[mask.T]

### Save final cleaned and filtered data

#### Sparse matrix

In [None]:
# Save the filtered holdings matrix (scipy appends the .npz extension).
path = '../data/processed/holdings_s'
sparse.save_npz(path, holdings_s)

In [None]:
# Save the binary holdings matrix (scipy appends the .npz extension).
path = '../data/processed/holdings_b'
sparse.save_npz(path, holdings_b)

#### Sparse info

In [None]:
# Remove the helper columns from the table that is written to disk below.
# The drop must target holdings_summary_s: that is the frame being saved,
# whereas holdings_summary is not used again after this point.
holdings_summary_s = holdings_summary_s.drop(
    columns=['index_fund_flag', 'et_flag', 'mutual_fund', 'sample', 'crsp_portno'])

In [None]:
# Store report_dt as plain datetime.date objects, so it can be compared with
# the datetime.date bounds used for the sub-sample further down.
holdings_summary_s['report_dt'] = pd.to_datetime(holdings_summary_s['report_dt'], format='%Y-%m-%d').dt.date

In [None]:
# Save the summary table that belongs to the filtered holdings matrix.
path = '../data/processed/holdings_summary_s.feather'
feather.write_dataframe(holdings_summary_s,path)

#### Returns

In [None]:
# Convert caldt to plain dates, then keep only the returns that
#   (a) belong to a fund occurring in holdings_summary, and
#   (b) are dated strictly after begin_date.
# The fund filter is applied here together with the date filter: building
# returns_s from the full `returns` table with the date condition alone
# would silently discard the fund selection made earlier.
begin_date = datetime.date.fromisoformat('2003-01-01')
returns['caldt'] = pd.to_datetime(returns['caldt'], format='%Y-%m-%d').dt.date

in_fund_sample = returns['crsp_fundno'].isin(unique_portno['fund_no'])
after_begin_date = returns['caldt'] > begin_date
returns_s = returns[in_fund_sample & after_begin_date]

In [None]:
# Save the filtered returns.
path = '../data/processed/returns_s.feather'
feather.write_dataframe(returns_s,path)

#### Stock Map

In [None]:
# Save the stock map reduced to the retained securities.
path = '../data/processed/stock_map.feather'
feather.write_dataframe(stock_map,path)

## Take a smaller sub-sample (only observations between the specified start and end dates)
Makes processing faster

In [None]:
# Bounds of the sub-sample window, taken from the central options dict so
# the dates are defined in one place only.
start_date = options['sub_sample_start_date']
end_date = options['sub_sample_end_date']

#### Holdings & holdings_summary

In [None]:
# True for reports dated strictly between start_date and end_date
# (both bounds are excluded).
mask = (holdings_summary_s['report_dt'] > start_date) & (holdings_summary_s['report_dt'] < end_date)

In [None]:
# Select the sub-sample rows of both sparse matrices by position.
holdings_s_s = holdings_s[mask.values]
holdings_b_b = holdings_b[mask.values]

In [None]:
# Apply the same row selection to the summary table.
holdings_summary_s_s = holdings_summary_s[mask]

#### Test

In [None]:
# The row count here should equal the row count of holdings_summary_s_s.
holdings_s_s.shape

In [None]:
# The row count here should equal the row count of holdings_s_s.
holdings_summary_s_s.shape

#### Returns

In [None]:
# Returns dated strictly between start_date and end_date (both bounds excluded).
mask = (returns_s['caldt'] > start_date) & (returns_s['caldt'] < end_date)
returns_s_s = returns_s[mask]

### Save final cleaned and filtered data

#### Sparse matrix

In [None]:
# Save the sub-sample of the holdings matrix (scipy appends the .npz extension).
path = '../data/processed/holdings_s_s'
sparse.save_npz(path, holdings_s_s)

# Save the sub-sample of the binary holdings matrix.
path = '../data/processed/holdings_b_b'
sparse.save_npz(path, holdings_b_b)

#### Sparse info

In [None]:
# Save the summary table that belongs to the sub-sample matrices.
path = '../data/processed/holdings_summary_s_s.feather'
feather.write_dataframe(holdings_summary_s_s,path)

#### Returns

In [None]:
# Save the sub-sample of the returns.
path = '../data/processed/returns_s_s.feather'
feather.write_dataframe(returns_s_s,path)