<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Clean" data-toc-modified-id="Clean-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Clean</a></span></li><li><span><a href="#Grouping" data-toc-modified-id="Grouping-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Grouping</a></span><ul class="toc-item"><li><span><a href="#Group-based-on-different-columns" data-toc-modified-id="Group-based-on-different-columns-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Group based on different columns</a></span></li><li><span><a href="#Combine-groupings-into-final-grouping" data-toc-modified-id="Combine-groupings-into-final-grouping-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Combine groupings into final grouping</a></span></li></ul></li><li><span><a href="#Apply-grouping" data-toc-modified-id="Apply-grouping-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Apply grouping</a></span><ul class="toc-item"><li><span><a href="#Apply-on-holdings-data" data-toc-modified-id="Apply-on-holdings-data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Apply on holdings data</a></span></li><li><span><a href="#Apply-on-col_info" data-toc-modified-id="Apply-on-col_info-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Apply on col_info</a></span></li></ul></li><li><span><a href="#Merging-with-CCM-Table" data-toc-modified-id="Merging-with-CCM-Table-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Merging with CCM Table</a></span></li><li><span><a href="#Inspecting-effect-of-cleaning" data-toc-modified-id="Inspecting-effect-of-cleaning-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Inspecting effect of cleaning</a></span><ul class="toc-item"><li><span><a href="#General-Overview" data-toc-modified-id="General-Overview-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>General Overview</a></span></li><li><span><a href="#Overview-of-cleaning-the-security-data-by-name" data-toc-modified-id="Overview-of-cleaning-the-security-data-by-name-5.2"><span 
class="toc-item-num">5.2&nbsp;&nbsp;</span>Overview of cleaning the security data by name</a></span></li><li><span><a href="#Overview-of-cleaning-by-cusip" data-toc-modified-id="Overview-of-cleaning-by-cusip-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Overview of cleaning by cusip</a></span></li><li><span><a href="#Overview-of-cleaning-by-permno" data-toc-modified-id="Overview-of-cleaning-by-permno-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>Overview of cleaning by permno</a></span></li></ul></li><li><span><a href="#Distribution-of-cusip-and-crsp_company_key" data-toc-modified-id="Distribution-of-cusip-and-crsp_company_key-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Distribution of cusip and crsp_company_key</a></span></li><li><span><a href="#Holdings-Tests" data-toc-modified-id="Holdings-Tests-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Holdings Tests</a></span></li></ul></div>

Findings:

Nearly all 'ORD' stocks have neither a permno nor a permco

Some stocks appear twice (at least based on name)

# Analysis of holdings

Description:



In [None]:
import feather
import numpy as np
import pandas as pd
import pandasql as ps

from scipy import sparse

from sklearn.preprocessing import normalize, minmax_scale

import matplotlib.pyplot as plt

In [None]:
### Returns: rename the calendar date column and derive the reporting year
path = '../data/processed/returns.feather'
returns = (
    feather.read_dataframe(path)
    .rename(columns={'caldt': 'report_dt'})
    .assign(year=lambda frame: frame['report_dt'].dt.year)
)

### row_info: one record per row of the holdings matrix
path = '../data/processed/row_info.feather'
row_info = feather.read_dataframe(path)

### col_info: one record per column of the holdings matrix
path = '../data/processed/col_info.feather'
col_info = feather.read_dataframe(path)

### Holdings: sparse matrix stored in scipy's .npz format
path = '../data/processed/holdings.npz'
holdings = sparse.load_npz(path)

### Link table
path = '../data/raw/ccm_link.feather'
ccm_link = feather.read_dataframe(path)

## Clean

In [None]:
# Size of the raw inputs before any filtering
print('Shape of holdings:                {:10,d} / {:10,d}'.format(*holdings.shape))
print('Numer of unique securities:                    {:10,d}'.format(len(col_info)))
print('Numer of unique funds:                         {:10,d}'.format(len(row_info)))

In [None]:
# Remove explicitly stored zeros so that every stored entry is a real position
holdings.eliminate_zeros()

# Indicator copy of the holdings: every stored entry is set to 1.0
holdings_b = sparse.csr_matrix(holdings, copy=True)
holdings_b.data = np.ones_like(holdings.data, dtype=float)
holdings_b.eliminate_zeros()

In [None]:
# Number of stored positions per security (column), as an (n_securities, 1) array
sum_sec = np.asarray(holdings_b.sum(axis=0)).T
# Keep only securities that appear in at least two rows of the holdings matrix
col_mask = (sum_sec >= 2).ravel()

# Filter the column descriptions and renumber them 0..n-1
col_info = col_info[col_mask].reset_index(drop=True)
col_info = col_info.assign(col=col_info.index)

# Apply the same filter to the matrix (column slicing is done in CSC format)
holdings = holdings.tocsc()[:, col_mask].tocsr()

In [None]:
# NOTE(review): from here on holdings_b is the same object as holdings, i.e. it
# is no longer a 0/1 indicator matrix — confirm this is intended.
holdings_b = holdings

# Size of the inputs after dropping rarely held securities
print('Shape of holdings:                {:10,d} / {:10,d}'.format(*holdings.shape))
print('Numer of unique securities:                    {:10,d}'.format(len(col_info)))
print('Numer of unique funds:                         {:10,d}'.format(len(row_info)))

In [None]:
# Total weight held per security. Computed first, while row i of col_info
# still describes column i of holdings.
assert len(col_info) == holdings.shape[1], 'col_info and holdings are out of sync'
col_info['sum'] = np.asarray(holdings.sum(axis=0)).ravel()

# Sort by identifier and apply the SAME permutation to the holdings columns.
# `col` still holds the pre-sort column positions at this point, so it is the
# permutation that keeps row i of col_info aligned with column i of holdings.
col_info = col_info.sort_values(['cusip', 'permco'])
column_order = col_info['col'].to_numpy()
holdings = holdings.tocsc()[:, column_order].tocsr()
col_info = col_info.reset_index(drop=True)
col_info = col_info.assign(col=col_info.index)

# First seven characters of the cusip (missing cusips become the string 'None')
col_info['cusip_short'] = col_info['cusip'].astype(str).str[:7]

# Flag the last character of the short cusip: digit -> 'E', upper-case letter -> 'D'.
# Done in one pass so no placeholder character is needed, and left missing
# wherever the cusip itself is missing.
col_info['cusip_short_f'] = (
    col_info['cusip_short']
    .str.replace(r'[0-9A-Z]$',
                 lambda match: 'E' if match.group(0).isdigit() else 'D',
                 regex=True)
    .where(col_info['cusip'].notna())
)

# Strip common name suffixes, one after the other. The longer
# ' TOTAL RETURN EQUITY SWAP' must be tried before ' EQUITY SWAP', otherwise
# only its tail is removed and the longer pattern can never match.
name_suffixes = [
    ' ORD',
    ' EQUITY OPTION',
    ' TOTAL RETURN EQUITY SWAP',
    ' EQUITY SWAP',
    ' CFD',
    ' CALL',
    ' PUT',
    ' PFD',
    ' TRS',
    ' DR',
]
security_name_adj = col_info['security_name']
for suffix in name_suffixes:
    # regex=True is required explicitly: pandas >= 2.0 treats patterns as literals by default
    security_name_adj = security_name_adj.str.replace(suffix + '$', '', regex=True)
col_info['security_name_adj'] = security_name_adj

# Keep only the columns used by the rest of the notebook
col_info = col_info[['security_name', 'security_name_adj', 'crsp_company_key',
                     'cusip', 'cusip_short', 'cusip_short_f',
                     'permno', 'permco', 'ticker', 'sum', 'col']]

In [None]:
# Spot check: all securities whose name contains 'GENERAL MOTORS'.
# na=False keeps the mask boolean when security_name has missing values.
col_info[col_info['security_name'].str.contains('GENERAL MOTORS', na=False)].sort_values('sum')

## Grouping

### Group based on different columns

In [None]:
# Show one randomly drawn row of col_info (no seed is set, so the row changes between runs)
col_info.sample()

In [None]:
# For each identifier, label every security with the `col` of the first
# security that shares the same identifier value.
grouping_keys = [
    ('cusip_short_f', 'col_n1'),      # same cusip
    ('permco', 'col_n2'),             # same permco
    ('ticker', 'col_n3'),             # same ticker
    ('security_name_adj', 'col_n4'),  # same name
]

for key, label in grouping_keys:
    # First `col` per identifier value, stored under the new label
    cusip_col = col_info.groupby(key)[['col']].first().rename(columns={'col': label})
    # Left join so that every security keeps its row
    col_info = col_info.merge(cusip_col, how='left', left_on=key, right_on=cusip_col.index)

### Combine groupings into final grouping

In [None]:
# Build col_final as a cascade: a later step only fills rows that are still empty.

# Step 1: the cusip-based label wherever a cusip is available
mask = col_info['cusip_short_f'].notna()
col_info.loc[mask, 'col_final'] = col_info.loc[mask, 'col_n1']

# Step 2: the permco-based label where permco is available and col_final is still empty
mask = col_info['permco'].notna() & col_info['col_final'].isna()
col_info.loc[mask, 'col_final'] = col_info.loc[mask, 'col_n2']

# The ticker-based label (col_n3) is not applied in this cascade.

# Step 3: the name-based label where a name is available and col_final is still empty
mask = col_info['security_name_adj'].notna() & col_info['col_final'].isna()
col_info.loc[mask, 'col_final'] = col_info.loc[mask, 'col_n4']

In [None]:
# Show ten rows starting at position t, ordered by name and then by col.
# A single multi-key sort is used: chaining two sort_values calls lets the
# second (not stable by default) sort discard the ordering of the first.
t = 2000
col_info.sort_values(['security_name', 'col']).iloc[t:t+10, :]

## Apply grouping

In [None]:
# Number of securities before the grouping is applied
len(col_info)

### Apply on holdings data

In [None]:
# Sum the holdings of all securities that share the same col_final.
# This is done as a sparse product holdings @ aggregator instead of
# densifying the matrix, which would allocate every cell of the matrix.
group_key = col_info['col_final'].to_numpy(dtype=float)
assert group_key.shape[0] == holdings.shape[1], 'col_info and holdings are out of sync'

# Securities without a group are dropped, as groupby does with missing keys
grouped = ~np.isnan(group_key)
# group_ids is sorted ascending, so the output columns follow the same order
# as a groupby on col_final; group_pos is the output column of each security
group_ids, group_pos = np.unique(group_key[grouped], return_inverse=True)

# aggregator[i, j] == 1 when security i belongs to group j
aggregator = sparse.csr_matrix(
    (np.ones(int(grouped.sum()), dtype=holdings.dtype),
     (np.flatnonzero(grouped), group_pos.ravel())),
    shape=(holdings.shape[1], group_ids.shape[0]),
)

# The name holdings_df is kept because later cells refer to it; it is a sparse matrix
holdings_df = sparse.csr_matrix(holdings.dot(aggregator))
holdings_df.eliminate_zeros()

In [None]:
# Repr of the holdings matrix before the grouped version replaces it
holdings

In [None]:
# Repr of the grouped holdings matrix, for comparison with the cell above
holdings_df

In [None]:
# From here on `holdings` refers to the grouped matrix
holdings = holdings_df

### Apply on col_info

In [None]:
# Collapse col_info to one row per col_final. groupby().first() takes, per
# column, the first non-missing value in the sorted group, so a row may combine
# fields from several of the original securities.
# as_index=False keeps col_final as a regular column; later cells select it by name.
col_info = (
    col_info
    .sort_values(['permco', 'cusip_short_f', 'security_name'])
    .groupby('col_final', as_index=False)
    .first()
)

# Groups come out sorted by col_final, matching the column order of the grouped holdings
assert len(col_info) == holdings.shape[1], 'col_info and holdings are out of sync'
col_info['sum'] = np.asarray(holdings.sum(axis=0)).ravel()

In [None]:
# Number of securities after the grouping has been applied
len(col_info)

## Merging with CCM Table

In [None]:
## Linktable
# Reads the link table from disk again, replacing the ccm_link loaded at the top of the notebook
path = '../data/raw/ccm_link.feather'
ccm_link = feather.read_dataframe(path)

In [None]:
# Identifiers are cast to float in both tables
ccm_link['gvkey'] = ccm_link['gvkey'].astype(float)
for identifier in ['permno', 'permco']:
    col_info[identifier] = col_info[identifier].astype(float)

# Link validity bounds as datetimes
columns = ['linkdt','linkenddt']
for column in columns:
    ccm_link[column] = pd.to_datetime(ccm_link[column])

# Links without an end date get a fixed end date
mask = ccm_link['linkenddt'].isna()
ccm_link['linkenddt'] = ccm_link['linkenddt'].mask(mask, pd.to_datetime('2019-12-31'))

In [None]:
linktypes = ['LU','LC']  # link types to keep
linkprims = ['P','C']    # link primacy markers to keep

# Drop rows where both lpermno and lpermco are missing, then keep only the
# selected link types and primacy markers.
ccm_link = (
    ccm_link
    .dropna(subset=['lpermno','lpermco'], how='all')
    .query(''' linktype in @linktypes and linkprim in @linkprims ''')
)

In [None]:
# Links whose end date falls after calendar year 2015.
# linkenddt is a datetime column, so its year is compared, not the raw value against an integer.
ccm_link.loc[ccm_link['linkenddt'].dt.year > 2015].shape

In [None]:
# Single reference date at which the link table is evaluated in the join below
link_reference_date = pd.to_datetime('2010-01-01')
col_info = col_info.assign(report_dt=link_reference_date)

In [None]:
# Left join of col_info with ccm_link: a link matches when permno equals
# lpermno and report_dt lies between linkdt and linkenddt. Securities without
# a matching link are kept; a security with several matching links appears
# once per match.
sqlcode = '''
    SELECT *
    FROM col_info 
    LEFT JOIN ccm_link 
        ON permno = lpermno
        AND report_dt between linkdt AND linkenddt;
'''

# pandasql resolves the table names in the query from the mapping passed in (locals() here)
col_info_m = ps.sqldf(sqlcode,locals())

In [None]:
# Columns kept from the merged table: name, identifiers and total holdings
columns = [
    'security_name',
    'cusip', 'cusip_short_f', 'permno', 'permco', 'gvkey', 'ticker',
    'sum',
]

col_info_m = col_info_m.loc[:, columns]

In [None]:
print('Percentage of identifiers missing:')
# Share of rows (in percent) with a missing value, per identifier column.
# DataFrame.sum() is used directly; np.sum on a DataFrame goes through the
# deprecated axis=None path.
col_info_m[['cusip', 'cusip_short_f', 'permno', 'permco', 'gvkey', 'ticker']].isna().sum() / col_info_m.shape[0] * 100

In [None]:
# Heading only; the figures are printed by the following cell
print('Percentage of tna where identifier is not missing:')


In [None]:
cols = ['cusip', 'cusip_short_f', 'permno', 'permco', 'gvkey', 'ticker']

# For each identifier: share of the total 'sum' held in rows where it is present
for col in cols:
    mask = col_info_m[col].notna()
    covered = col_info_m.loc[mask, 'sum'].sum()
    total = col_info_m['sum'].sum()
    temp = covered / total * 100
    print('{:<30s}{:>5.2f}%'.format(col,temp))

In [None]:
# Display the merged table
col_info_m

In [None]:
# Shape before the join, to compare with col_info_m below
col_info.shape

In [None]:
# Shape after the join; more rows than col_info means some securities matched several links
col_info_m.shape

In [None]:
# Shape of the merged table after removing exact duplicate rows.
# The merged frame is named col_info_m; `col_info_fm` is not defined in this notebook.
col_info_m.drop_duplicates().shape

## Inspecting effect of cleaning

### General Overview

In [None]:
# Shape of the grouped col_info
col_info.shape

In [None]:
# Securities whose total holdings are not zero
mask = col_info['sum'].ne(0)

In [None]:
# Distinct values among held securities, first by `col`, then by `col_final`
for label in ('col', 'col_final'):
    col_names = col_info.loc[mask, label]
    print('Number of unique securities                    {:<6,d}'.format(len(col_names.unique())))

### Overview of cleaning the security data by name

In [None]:
# Row count versus distinct security names
col_names = col_info['security_name']
n_names, n_distinct_names = len(col_names), len(col_names.unique())
print('Number of securities                           {:<6,d}'.format(n_names))
print('Number of unique securities                    {:<6,d}'.format(n_distinct_names))

In [None]:
# Strip the trailing ' ADR' / ' DR' suffixes and count distinct names again.
# regex=True is required explicitly: pandas >= 2.0 treats patterns as literals by default.
# NOTE(review): the printed label says 'ORD' but the suffixes removed here are ADR/DR — confirm which is meant.
col_names = col_names.str.replace(' ADR$', '', regex=True)
col_names = col_names.str.replace(' DR$', '', regex=True)
print('Number of unique securities without ORD        {:<6,d}'.format(col_names.unique().shape[0]))

In [None]:
# Label every row with the first `col` of its cusip_short_f group.
# The per-group result is indexed by cusip_short_f, so it is mapped back onto
# the rows through that column; assigning it directly would align on
# col_info's own index and fill the column with NaN.
first_col_by_cusip = col_info.groupby('cusip_short_f')['col'].first()
col_info['col_n'] = col_info['cusip_short_f'].map(first_col_by_cusip)

In [None]:
# Names containing 'fund' (case-insensitive); na=False keeps the mask boolean for missing names
a = col_names[col_names.str.contains('Fund', case = False, na = False)]
print('Number of securities with fund in the name              {:<6,d}'.format(a.shape[0]))

### Overview of cleaning by cusip

In [None]:
# Row count, then distinct values for each cusip variant
print('Number of securities                               {:>10,d}'.format(len(col_info['cusip'])))

cusip_reports = [
    ('cusip',         'Number of unique securities by cusip               {:>10,d}'),
    ('cusip_short',   'Number of unique securities by short cusip         {:>10,d}'),
    ('cusip_short_f', 'Number of unique securities by short cusip D/E     {:>10,d}'),
]
for column, template in cusip_reports:
    col_names = col_info[column]
    print(template.format(len(col_names.unique())))

### Overview of cleaning by permno

In [None]:
col_names = col_info['permno']
has_permno = col_names.notna()

# Share of total holdings (in percent) in securities that have a permno
perc = col_info.loc[has_permno, 'sum'].sum() / col_info['sum'].sum() * 100

print('Number of securities                               {:>6,d}'.format(len(col_names)))
print('Number of unique securities by permno              {:>6,d}'.format(len(col_names.unique())))
print('Percent of holdings in securities with permno      {:>6.2f}%'.format(perc))



## Distribution of cusip and crsp_company_key

In [None]:
print('Percentage of identifiers missing:')
# NOTE(review): col_info_c is not defined anywhere in the visible notebook — confirm where it comes from.
# The denominator is taken from the same frame as the counts so the percentages are consistent.
col_info_c[['crsp_company_key','cusip_short_f','permno','permco','ticker']].isna().sum() / col_info_c.shape[0] * 100

In [None]:
print('Percentage of identifiers missing:')
# Same figures restricted to the rows selected by `mask`.
# DataFrame.sum() is used directly; np.sum on a DataFrame goes through the deprecated axis=None path.
# NOTE(review): col_info_c is not defined anywhere in the visible notebook — confirm where it comes from.
col_info_c.loc[mask,['crsp_company_key','cusip_short_f','permno','permco','ticker']].isna().sum() / col_info_c.loc[mask].shape[0] * 100

## Holdings Tests

In [None]:
# Indicator version of holdings_ft: same sparsity pattern, every stored value set to 1.0
holdings_b = sparse.csr_matrix(holdings_ft, copy=True)
holdings_b.data = np.ones_like(holdings_ft.data, dtype=float)

In [None]:
def most_common_stocks(top_n=100):
    """Return the securities with the largest column totals in ``holdings_b``.

    Reads the notebook-level objects ``holdings_b`` (sparse matrix),
    ``row_info_f`` and ``col_info_f`` (DataFrames).

    Parameters
    ----------
    top_n : int, default 100
        Number of securities to return.

    Returns
    -------
    pandas.DataFrame
        One row per security, ordered by descending ``data`` (the column
        total), with the descriptive columns taken from ``col_info_f`` and
        ``percent`` = ``data`` divided by the number of rows of
        ``row_info_f``, times 100.
    """
    # Change if you want to sum percentages instead of boolean values
    holdings_coo = holdings_b.tocoo()

    # Only `data` is aggregated. Row-level attributes (year, lipper_class, ...)
    # are not needed for a per-security total, and summing them would add
    # meaningless columns to the result.
    totals = (pd.DataFrame({'col' : holdings_coo.col,
                            'data' : holdings_coo.data})
              .groupby(by = ['col'])[['data']]
              .sum())

    no_unique_funds = row_info_f.shape[0]

    sum_col = (totals
               .sort_values('data',ascending = False)
               # joined on the index: assumes col_info_f is indexed by column position
               .join(col_info_f[['security_name','col','crsp_company_key','cusip','permno','permco']],how='left')
               .assign(percent = lambda x:  x['data'] / no_unique_funds * 100)
               .drop(columns=['col'])
               .reset_index(drop=True)
               .head(top_n))

    print(
        'Most held stocks: ','\n\n'
        'Number of funds:    {}'.format(no_unique_funds),'\n'
    )

    return sum_col

In [None]:
# Display the table returned by most_common_stocks
most_common_stocks()

In [None]:
# Column totals of holdings_ft as a one-column frame named 'total'
holdings_summary = pd.DataFrame(holdings_ft.sum(0).T, columns=['total'])
holdings_summary.shape