<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Clean" data-toc-modified-id="Clean-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Clean</a></span></li><li><span><a href="#Grouping" data-toc-modified-id="Grouping-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Grouping</a></span><ul class="toc-item"><li><span><a href="#Group-based-on-different-columns" data-toc-modified-id="Group-based-on-different-columns-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Group based on different columns</a></span></li><li><span><a href="#Combine-groupings-into-final-grouping" data-toc-modified-id="Combine-groupings-into-final-grouping-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Combine groupings into final grouping</a></span></li></ul></li><li><span><a href="#Apply-grouping" data-toc-modified-id="Apply-grouping-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Apply grouping</a></span><ul class="toc-item"><li><span><a href="#Apply-on-holdings-data" data-toc-modified-id="Apply-on-holdings-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Apply on holdings data</a></span></li><li><span><a href="#Apply-on-col_info" data-toc-modified-id="Apply-on-col_info-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Apply on col_info</a></span></li></ul></li><li><span><a href="#Merging-with-CCM-Table" data-toc-modified-id="Merging-with-CCM-Table-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Merging with CCM Table</a></span></li><li><span><a href="#Filter-based-on-permco" data-toc-modified-id="Filter-based-on-permco-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Filter based on permco</a></span></li><li><span><a href="#Inspecting-effect-of-cleaning" data-toc-modified-id="Inspecting-effect-of-cleaning-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Inspecting effect of cleaning</a></span></li><li><span><a href="#Holdings-Tests" data-toc-modified-id="Holdings-Tests-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Holdings 
Tests</a></span></li><li><span><a href="#Save-everything" data-toc-modified-id="Save-everything-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Save everything</a></span></li></ul></div>

Findings:

Nearly all 'ORD' securities have neither a permno nor a permco

Some stocks appear twice (at least based on name)

# Analysis of holdings

Description:



In [None]:
import feather
import numpy as np
import pandas as pd
import pandasql as ps

from scipy import sparse

from sklearn.preprocessing import normalize, minmax_scale

import matplotlib.pyplot as plt

In [None]:
# --- Fund returns: rename the date column and derive the calendar year ---
returns = (
    feather.read_dataframe('../data/processed/returns.feather')
    .rename(columns = {'caldt' : 'report_dt'})
    .assign(year = lambda frame: frame['report_dt'].dt.year)
)

# --- Row / column descriptions that accompany the holdings matrix ---
row_info = feather.read_dataframe('../data/processed/row_info.feather')
col_info = feather.read_dataframe('../data/processed/col_info.feather')

# --- Holdings matrix (scipy sparse, stored as .npz) ---
holdings = sparse.load_npz('../data/processed/holdings.npz')

# --- Link table ---
ccm_link = feather.read_dataframe('../data/raw/ccm_link.feather')

## Clean

In [None]:
# Size of the raw inputs: matrix dimensions plus the row counts of both info tables
n_rows, n_cols = holdings.shape
print('Shape of holdings:                {:10,d} / {:10,d}'.format(n_rows, n_cols))
print('Numer of unique securities:                    {:10,d}'.format(len(col_info)))
print('Numer of unique funds:                         {:10,d}'.format(len(row_info)))

In [None]:
# Build an indicator matrix with the sparsity pattern of `holdings`:
# every stored (non-zero) holding becomes 1.0.
holdings.eliminate_zeros()
holdings_b = sparse.csr_matrix(holdings, copy=True)
# The conversion to CSR may merge duplicate entries, so drop any entry that
# cancelled to zero and size the ones-array from holdings_b itself rather than
# from holdings.data (whose length can differ for non-canonical inputs).
holdings_b.eliminate_zeros()
holdings_b.data = np.ones(len(holdings_b.data))

In [None]:
# Drop every security (column) that no fund holds, i.e. whose column of the
# indicator matrix sums to zero. Securities held by a single fund are KEPT:
# the threshold is `>= 1`, exactly as before.
sum_sec = np.asarray(holdings_b.sum(axis=0)).ravel()
col_mask = sum_sec >= 1

# Keep the matching rows of col_info and renumber them 0..n-1 so that `col`
# is again the column position in the filtered matrices.
col_info = col_info[col_mask].reset_index(drop=True)
col_info = col_info.assign(col = col_info.index).drop(columns = 'col_old')

# Apply the same column filter to BOTH matrices. Previously only `holdings`
# was filtered, which left `holdings_b` misaligned with col_info when it is
# reordered with col_info's index further down.
holdings = holdings.tocsc()[:, col_mask].tocsr()
holdings_b = holdings_b.tocsc()[:, col_mask].tocsr()

In [None]:
# Same size report as before, now for the filtered matrix and col_info
n_rows, n_cols = holdings.shape
print('Shape of holdings:                {:10,d} / {:10,d}'.format(n_rows, n_cols))
print('Numer of unique securities:                    {:10,d}'.format(len(col_info)))
print('Numer of unique funds:                         {:10,d}'.format(len(row_info)))

In [None]:
# Total amount held per security (column sums of holdings), kept as a
# reference value for later comparison; show the ten largest positions.
col_info['sum'] = np.asarray(holdings.sum(axis=0)).ravel()
col_info.sort_values(by='sum', ascending=False).iloc[:10]

In [None]:
# Sort securities so that rows sharing permco / cusip / name are adjacent
col_info = col_info.sort_values(['permco','cusip','security_name'])
col_info_old = col_info.copy()   # snapshot for the before/after comparison

# Reorder the matrix columns the same way. After the sort, col_info's index
# still carries each row's previous position (this relies on the index having
# been reset before the sort), so it doubles as the column permutation.
sort_index = col_info['col'].index

holdings = holdings.tocsc()[:, sort_index].tocsr()
holdings_b = holdings_b.tocsc()[:, sort_index].tocsr()

# Renumber rows / columns 0..n-1 in the new order
col_info = col_info.reset_index(drop=True)
col_info = col_info.assign(col=col_info.index)

# Short CUSIP: the first 7 characters keep the character that separates
# equity from debt; using 6 instead would not differentiate between them.
col_info['cusip_short'] = col_info['cusip'].astype(str).str[:7]

# Flagged short CUSIP: a trailing digit becomes 'E', a trailing capital letter
# becomes 'D'. '@' is a temporary placeholder so that the freshly written flag
# is not itself matched by the [A-Z] pattern.
# regex=True is passed explicitly: newer pandas versions default to literal
# matching, which would turn these pattern replacements into silent no-ops.
cusip_flagged = (
    col_info['cusip_short']
    .str.replace('[0-9]$', '@', regex=True)
    .str.replace('[A-Z]$', 'D', regex=True)
    .str.replace('@', 'E', regex=False)
)
# Missing CUSIPs must stay missing; otherwise their string form ('None' or
# 'nan') would act as one shared grouping key for all of them.
cusip_flagged = cusip_flagged.where(col_info['cusip'].notna())
col_info['cusip_short_f'] = cusip_flagged.replace('None', np.nan)

# Strip common instrument suffixes from the end of the security name.
# Suffixes are removed one after the other in this order; the long
# ' TOTAL RETURN EQUITY SWAP' has to come before ' EQUITY SWAP', otherwise
# only the shorter tail is removed and ' TOTAL RETURN' is left behind.
name_suffixes = [
    ' ORD',
    ' EQUITY OPTION',
    ' TOTAL RETURN EQUITY SWAP',
    ' EQUITY SWAP',
    ' CFD',
    ' CALL',
    ' PUT',
    ' PFD',
    ' TRS',
    ' DR',
]
security_name_adj = col_info['security_name']
for suffix in name_suffixes:
    security_name_adj = security_name_adj.str.replace(suffix + '$', '', regex=True)
col_info['security_name_adj'] = security_name_adj

# Keep only the columns used downstream (the former helper column `type` was
# dropped by this selection anyway, so it is no longer created).
col_info = col_info[['security_name', 'security_name_adj', 'crsp_company_key',
                     'cusip', 'cusip_short', 'cusip_short_f', 
                     'permno', 'permco', 'ticker', 'sum','col']]

In [None]:
# Spot check: all securities whose name contains 'GENERAL MOTORS', smallest
# total first. na=False treats a missing name as "no match" so the boolean
# mask never contains NaN (which boolean indexing rejects).
col_info[col_info['security_name'].str.contains('GENERAL MOTORS', na=False)].sort_values('sum')

## Grouping

### Group based on different columns

In [None]:
# Show one randomly chosen row; no random_state is passed, so the row differs between runs
col_info.sample()

In [None]:
# For each identifier, tag every row with the column number (`col`) of the
# first security that shares the same identifier value. Rows whose identifier
# is missing receive NaN, because groupby leaves missing keys out.
#   col_n1: flagged short CUSIP     col_n2: permco
#   col_n3: ticker                  col_n4: adjusted security name
grouping_keys = {
    'col_n1': 'cusip_short_f',
    'col_n2': 'permco',
    'col_n3': 'ticker',
    'col_n4': 'security_name_adj',
}

for target, key in grouping_keys.items():
    first_col_by_key = col_info.groupby(key)['col'].first()
    col_info = col_info.assign(**{target: col_info[key].map(first_col_by_key)})

### Combine groupings into final grouping

In [None]:
# Final group id per security, taken from the first identifier that is present:
#   1. flagged short CUSIP (col_n1)
#   2. permco              (col_n2)
#   3. adjusted name       (col_n4)
# The ticker-based grouping (col_n3) is deliberately left out of the cascade.
has_cusip = col_info['cusip_short_f'].notna()
has_permco = col_info['permco'].notna()
has_name = col_info['security_name_adj'].notna()

col_final = (
    col_info['col_n1'].where(has_cusip)
    .fillna(col_info['col_n2'].where(has_permco))
    .fillna(col_info['col_n4'].where(has_name))
    .astype(float)
)
col_info = col_info.assign(col_final = col_final)

In [None]:
# Eyeball a window of 10 rows, starting at position t, of col_info ordered by security name
t = 2000
col_info.sort_values('security_name').iloc[t:t+10,:]

## Apply grouping

In [None]:
# Number of col_info rows before the grouping is applied
col_info.shape[0]

### Apply on holdings data

In [None]:
# Sum the holdings columns that belong to the same final group.
# Instead of densifying the whole matrix into a DataFrame, multiply with a
# sparse indicator matrix (n_securities x n_groups): entry (j, g) is 1 when
# column j belongs to group g. Memory use stays proportional to the stored
# entries.
assert holdings.shape[1] == len(col_info), 'holdings columns and col_info rows are out of sync'

# Groups are ordered by ascending col_final, like a sorted groupby;
# a missing col_final gets code -1 and the column is dropped, as groupby would.
group_codes, group_ids = pd.factorize(col_info['col_final'], sort=True)
in_group = group_codes >= 0

group_indicator = sparse.csr_matrix(
    (
        np.ones(in_group.sum(), dtype=holdings.dtype),
        (np.flatnonzero(in_group), group_codes[in_group]),
    ),
    shape=(holdings.shape[1], len(group_ids)),
)

# NOTE: despite its name, holdings_df is a sparse CSR matrix (as before).
holdings_df = sparse.csr_matrix(holdings.dot(group_indicator))
holdings_df.eliminate_zeros()   # positions that cancel out are not stored

In [None]:
# Display the holdings matrix as it is before the grouped version replaces it
holdings

In [None]:
# Display the grouped result (a sparse CSR matrix, despite the `_df` name)
holdings_df

In [None]:
# From here on `holdings` refers to the grouped matrix
holdings = holdings_df

In [None]:
# Dimensions of col_info, still with one row per ungrouped security
col_info.shape

### Apply on col_info

In [None]:
# Collapse col_info to one row per final group. The sort decides which member
# of a group comes first.
# NOTE(review): GroupBy.first() is documented to take the first non-null
# entry per column, so a group's row may combine values from several members.
group_rows = (
    col_info
    .sort_values(['permco','cusip_short_f','security_name'])
    .groupby('col_final')
    .first()
)

# Totals come from the grouped matrix. Both the grouped matrix and group_rows
# are ordered by col_final, so no further sorting is needed to keep rows and
# columns aligned.
group_rows['sum'] = np.asarray(holdings.sum(axis=0)).ravel()

col_info = group_rows.reset_index(drop=True)
col_info = col_info.assign(col = col_info.index)

In [None]:
# Number of col_info rows after collapsing to one row per col_final group
col_info.shape[0]

In [None]:
# Recompute the per-security totals from the grouped holdings matrix and
# list the ten largest positions.
col_info['sum'] = np.asarray(holdings.sum(axis=0)).ravel()
col_info.sort_values(by='sum', ascending=False).iloc[:10]

## Merging with CCM Table

In [None]:
## Linktable
# Reads the same file as the load cell at the top of the notebook; reloading
# here restores an unfiltered ccm_link before the cells below modify it.
path = '../data/raw/ccm_link.feather'
ccm_link = feather.read_dataframe(path)

In [None]:
# Bring the identifier columns of both tables to float
ccm_link['gvkey'] = ccm_link['gvkey'].astype(float)
col_info = col_info.astype({'permno': float, 'permco': float})

# Parse the start / end dates of each link
for date_col in ('linkdt', 'linkenddt'):
    ccm_link[date_col] = pd.to_datetime(ccm_link[date_col])

# Links without an end date get 2019-12-31 as their end date
ccm_link['linkenddt'] = ccm_link['linkenddt'].fillna(pd.Timestamp('2019-12-31'))

In [None]:
# Drop link rows that carry neither an lpermno nor an lpermco
ccm_link = ccm_link.dropna(subset = ['lpermno','lpermco'], how = 'all')

# Restrict to the accepted link types and link-primacy codes
linktypes = ['LU','LC']
linkprims = ['P','C']
is_kept_link = ccm_link['linktype'].isin(linktypes) & ccm_link['linkprim'].isin(linkprims)
ccm_link = ccm_link[is_kept_link]

# Placeholder report date, for testing only
col_info = col_info.assign(report_dt = pd.Timestamp('2010-01-01'))

In [None]:
# Match gvkey onto col_info via permno (LEFT JOIN keeps every security).
# The date condition is still disabled, so a permno that has several links
# over time produces several joined rows.
sqlcode = '''
    SELECT *
    FROM col_info 
    LEFT JOIN ccm_link 
        ON permno = lpermno;
'''
# AND report_dt between linkdt AND linkenddt
col_info_m = ps.sqldf(sqlcode,locals())

# Take only important columns
columns = [
    'col', 'security_name',
    'cusip', 'cusip_short_f', 'permno', 'permco', 'gvkey', 'ticker',
    'sum', 'linkdt', 'linkenddt',
]
col_info_m = col_info_m[columns]

# Change dates to datetime format
columns = ['linkdt','linkenddt']
col_info_m[columns] = col_info_m[columns].apply(pd.to_datetime)

# Collapse the extra rows created by multiple gvkeys per permno, keeping the
# first link per security (TODO: select the link valid at report_dt instead).
# Deduplicate on `col`, which identifies exactly one holdings column.
# Deduplicating on (security_name, cusip) could also remove distinct
# securities, e.g. two with a missing cusip and the same name, and the
# positional filter applied to `holdings` later would then hit wrong columns.
col_info_m = (
    col_info_m
    .sort_values('col', kind='mergesort')   # stable: keeps link order within a security
    .drop_duplicates('col')
    .reset_index(drop=True)
)
assert len(col_info_m) == len(col_info), 'merge changed the number of securities'

In [None]:
# Display the holdings matrix; the merge above does not modify it
holdings

In [None]:
# Dimensions of col_info, the left-hand table of the join
col_info.shape

In [None]:
# Dimensions of the merged and de-duplicated table, for comparison with col_info
col_info_m.shape

## Filter based on permco

In [None]:
# Display the holdings matrix before the permno filter is applied
holdings

In [None]:
# Keep only securities that have a permno.
has_permno = col_info_m['permno'].notna()

# Positional column selector for the holdings matrix (row i of col_info_m
# describes column i of holdings).
col_mask = np.flatnonzero(has_permno.to_numpy())

# Renumber the FILTERED table so that its index and `col` match the columns of
# the filtered holdings matrix. The reset used to be applied to col_info,
# which left col_info_final with stale column numbers and a non-default index.
col_info_final = col_info_m[has_permno].reset_index(drop=True)
col_info_final = col_info_final.assign(col = col_info_final.index)

In [None]:
# Keep only the matrix columns selected by col_mask; column slicing is done in
# CSC format and the result is converted back to CSR.
holdings = holdings.tocsc()[:, col_mask].tocsr()

In [None]:
# Display the holdings matrix after the permno filter
holdings

## Inspecting effect of cleaning

In [None]:
def perc_missing(data, cols, label=None):
    """Print how often each identifier is missing, by count and by TNA.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per security. Must contain a numeric ``sum`` column (total
        amount held in that security) and every column named in ``cols``.
    cols : list of str
        Identifier columns to report on.
    label : str, optional
        Heading printed above the summary. When omitted, only a blank line is
        printed; the function no longer labels every stage "Before cleaning".

    Returns
    -------
    None
        The report is written to stdout.
    """
    n_securities = data.shape[0]
    total_tna = data['sum'].sum()   # hoisted: identical for every identifier

    print('' if label is None else '\n{}:'.format(label))
    print('Numer of unique securities:                       {:10,d}'.format(n_securities))
    print('Total TNA:                                        {:10,d}'.format(int(total_tna)))
    print('')

    print('{:<40s}{:>20s}{:>5s}{:>20s}'.format('Identifier','Percent missing','','% of TNA'))

    for col in cols:
        is_missing = data[col].isna()

        # Share of securities without this identifier ...
        missing = np.sum(is_missing) / n_securities
        # ... and share of total TNA held in those securities
        perc_tna = data.loc[is_missing,'sum'].sum() / total_tna
        print('{:<40s}{:>20.2%}{:>5s}{:>20.2%}'.format(col,missing,'',perc_tna))

# Report missing identifiers for each stage of the pipeline: the raw table,
# the grouped + merged table, and the table left after filtering.
base_ids = ['cusip', 'permno', 'permco', 'ticker']
stages = [
    ('Raw data', col_info_old, base_ids),
    ('\nAfter cleaning and grouping', col_info_m, base_ids + ['gvkey']),
    ('\nAfter filtering', col_info_final, base_ids + ['gvkey']),
]

for stage_title, stage_frame, cols in stages:
    print(stage_title)
    print('-'*40)
    perc_missing(stage_frame, cols)

## Holdings Tests

In [None]:
# Recompute the per-security totals from the filtered holdings matrix and
# list the ten largest positions.
col_info_final = col_info_final.assign(sum = np.asarray(holdings.sum(axis=0)).ravel())
col_info_final.sort_values(by='sum', ascending=False).iloc[:10]

## Save everything

In [None]:
# Display the holdings matrix that is about to be saved
holdings

In [None]:
# Display the security table that is about to be saved
col_info_final

In [None]:
# Write the cleaned holdings matrix together with its row (fund) and column
# (security) descriptions. row_info is written as loaded; this notebook does
# not modify it.
sparse.save_npz('../data/processed/holdings_f.npz', holdings)

feather.write_dataframe(row_info, '../data/processed/row_info_f.feather')
feather.write_dataframe(col_info_final, '../data/processed/col_info_f.feather')