<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Clean" data-toc-modified-id="Clean-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Clean</a></span></li><li><span><a href="#Save-everything" data-toc-modified-id="Save-everything-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Save everything</a></span></li></ul></div>

# Analysis of holdings

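Description: Loads the processed fund returns, the sparse holdings matrix and its row/column metadata, filters them year by year (2010-2018), and saves the per-year results to a single pickle file.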



In [None]:
import feather
import numpy as np
import pandas as pd
import pandasql as ps

from scipy import sparse

from sklearn.preprocessing import normalize, minmax_scale

import matplotlib.pyplot as plt

In [None]:
# --- Returns: rename the date column to `report_dt` and derive a `year` column ---
path = '../data/processed/returns.feather'
returns = (
    feather.read_dataframe(path)
    .rename(columns={'caldt': 'report_dt'})
    .assign(year=lambda frame: frame['report_dt'].dt.year)
)

# --- row_info: metadata table read alongside the holdings matrix ---
path = '../data/processed/row_info_f.feather'
row_info = feather.read_dataframe(path)

# --- col_info: metadata table read alongside the holdings matrix ---
path = '../data/processed/col_info_f.feather'
col_info = feather.read_dataframe(path)

# --- Holdings: sparse matrix stored in scipy's .npz format ---
path = '../data/processed/holdings_f.npz'
holdings = sparse.load_npz(path)

## Clean

In [None]:
def filter_data(year, verbose = True, min_obs_per_securities = 5,
                min_obs_per_fund = 10, preprocessing = 'l2'):
    """Build the filtered holdings/returns data set for one reporting year.

    Reads the notebook-level objects ``row_info``, ``col_info``, ``returns``
    and ``holdings``; none of them is modified.

    Steps:
      1. Keep the ``row_info`` rows whose ``year`` equals ``year``. The
         ``report_dt`` of the first such row becomes ``begin_date`` and
         ``end_date`` is one calendar year later.
      2. Keep the returns of those funds with
         ``begin_date <= report_dt <= end_date`` and set ``mret`` to 0 on
         ``begin_date``.
      3. Drop every fund whose first remaining return row has ``mret != 0``.
      4. Select the matching rows of ``holdings`` (via ``row_info['row']``),
         drop securities with fewer than ``min_obs_per_securities`` stored
         entries, then drop rows with fewer than ``min_obs_per_fund`` stored
         entries.
      5. Normalise each remaining holdings row according to ``preprocessing``.

    Parameters
    ----------
    year : value compared against ``row_info['year']``
    verbose : bool, default True
        Print the resulting dimensions and two consistency checks.
    min_obs_per_securities : int, default 5
        Minimum number of stored entries a column needs to be kept.
    min_obs_per_fund : int, default 10
        Minimum number of stored entries a row needs to be kept.
    preprocessing : {'l2', 'l1', 'none'}, default 'l2'
        Row normalisation passed to ``normalize``; ``'none'`` skips it.

    Returns
    -------
    tuple
        ``(row_info_f, col_info_f, returns_f, holdings_ft, holdings_b,
        begin_date, end_date)`` where ``holdings_b`` is a CSR matrix with the
        same sparsity pattern as ``holdings_ft`` and every stored value set
        to 1.

    Raises
    ------
    ValueError
        If ``preprocessing`` is not a supported option or if ``row_info``
        contains no rows for ``year``.
    """
    if preprocessing not in ('none', 'l1', 'l2'):
        raise ValueError(
            "preprocessing must be 'none', 'l1' or 'l2', got {!r}".format(preprocessing))

    row_info_f = row_info.query('year == @year')
    if row_info_f.empty:
        raise ValueError('No rows in row_info for year {}'.format(year))

    begin_date = row_info_f['report_dt'].iloc[0]
    end_date = begin_date + pd.DateOffset(years=1) # 1 year offset
    row_info_f = row_info_f.reset_index(drop=True)

    # Filter returns to the funds and the one-year window of this data set
    crsp_fundno_unique = row_info_f['crsp_fundno'].unique()
    query = '''report_dt >= @begin_date and report_dt <= @end_date and crsp_fundno in @crsp_fundno_unique'''
    returns_f = returns.query(query).copy()

    # Change return of month for which holdings apply to 0
    mask = returns_f['report_dt'] == begin_date
    returns_f.loc[mask,'mret'] = 0

    # Drop all funds with first return observation after starting date.
    # NOTE(review): this looks at the first row per fund in stored order, so
    # it presumably relies on `returns` being sorted by date within each
    # fund - confirm against the file that produces returns.feather.
    drop_fundnos = returns_f.drop_duplicates('crsp_fundno').query('mret != 0')['crsp_fundno']
    returns_f = returns_f.query('crsp_fundno not in @drop_fundnos')
    # The index of row_info_f is deliberately not reset again here, so it
    # keeps the labels assigned before the fund drop.
    row_info_f = row_info_f.query('crsp_fundno not in @drop_fundnos')

    # Filter holdings accordingly (row selection already yields a new matrix,
    # so no up-front copy of the full matrix is needed)
    holdings_f = holdings[row_info_f['row'].values].tocsr()

    # Delete all securities with less than X observations.
    # getnnz counts the stored entries per column directly on the sparse
    # structure instead of building a dense 0/1 matrix first.
    col_mask = holdings_f.getnnz(axis=0) >= min_obs_per_securities
    holdings_f = holdings_f.tocsc()[:,col_mask].tocsr()
    col_info_f = col_info.loc[col_mask,:]

    # Delete all funds with less than X securities
    row_mask = holdings_f.getnnz(axis=1) >= min_obs_per_fund
    holdings_f = holdings_f[row_mask]
    row_info_f = row_info_f[row_mask]

    # Boolean version of the final matrix: same pattern, all stored values 1
    holdings_b = sparse.csr_matrix(holdings_f, copy=True)
    holdings_b.data = np.ones(len(holdings_f.data))

    ## Preprocessing
    if preprocessing == 'none':
        holdings_ft = holdings_f
    else:
        holdings_ft = normalize(holdings_f, norm = preprocessing)

    if verbose:
        # Check if all dimensions match
        print('Number of fund/date combinations:        {:12,d}'.format(holdings_ft.shape[0]))
        print('Number of unique securities:             {:12,d}'.format(holdings_ft.shape[1]))
        print('Number of values in sparse matrix:       {:12,d}'.format(holdings_ft.getnnz()))
        match_test = (holdings_ft.shape[0] == holdings_b.shape[0]
                      and holdings_ft.shape[1] == holdings_b.shape[1]
                      and holdings_ft.getnnz() == holdings_b.getnnz())
        print('Same values for boolean holdings matrix:         {}'.format(match_test))
        print()
        print('Number of rows in row_info df:           {:12,d}'.format(row_info_f.shape[0]))
        print('Number of rows in col_inf df:            {:12,d}'.format(col_info_f.shape[0]))
        print()
        match_test = (holdings_ft.shape[0] == row_info_f.shape[0]
                      and holdings_ft.shape[1] == col_info_f.shape[0])
        print('Everything matches:                              {}'.format(match_test))

    return (row_info_f, col_info_f, returns_f, holdings_ft, holdings_b, begin_date, end_date)

In [None]:
# Run the filter once for 2018 and keep the seven results at notebook level.
(
    row_info_f,
    col_info_f,
    returns_f,
    holdings_ft,
    holdings_b,
    begin_date,
    end_date,
) = filter_data(2018)

In [None]:
# Years to process: 2010 through 2018 inclusive.
full = list(range(2010, 2019))

# Maps each year to a dict holding the seven outputs of filter_data.
dict_all_years = {}

for year in full:
    (row_info_f, col_info_f, returns_f,
     holdings_ft, holdings_b,
     begin_date, end_date) = filter_data(year)

    dict_year_temp = dict(
        row_info_f=row_info_f,
        col_info_f=col_info_f,
        returns_f=returns_f,
        holdings_ft=holdings_ft,
        holdings_b=holdings_b,
        begin_date=begin_date,
        end_date=end_date,
    )

    dict_all_years[year] = dict_year_temp

In [None]:
# Show the preprocessed holdings matrix stored for 2014.
# Direct indexing raises a KeyError naming the missing key, instead of the
# AttributeError on None that a chained .get() would produce.
dict_all_years[2014]['holdings_ft']

In [None]:
# One figure per year: column sums of the 0/1 holdings matrix, i.e. how many
# rows have a stored entry for each security.
for year in full:
    holdings_b_year = dict_all_years[year]['holdings_b']
    # sum(0) on a sparse matrix returns a 1 x n np.matrix; flatten it to a
    # plain 1-D array before plotting.
    rows_per_security = np.asarray(holdings_b_year.sum(0)).ravel()

    fig, ax = plt.subplots()
    ax.plot(rows_per_security)
    ax.set_title('Holdings rows per security ({})'.format(year))
    ax.set_xlabel('Security (column index)')
    ax.set_ylabel('Number of fund/date rows holding the security')
    plt.show()
    plt.close(fig)  # release the figure so the loop does not accumulate them

## Save everything

In [None]:
import pickle

In [None]:
path = '../data/processed/full.pickle'

# The context manager closes the file even if pickling fails part-way,
# which the manual open()/close() pair did not guarantee.
with open(path, "wb") as pickling_on:
    pickle.dump(dict_all_years, pickling_on)