<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Other-functions" data-toc-modified-id="Other-functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Other functions</a></span><ul class="toc-item"><li><span><a href="#Filter" data-toc-modified-id="Filter-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Filter</a></span></li></ul></li><li><span><a href="#macth-number-of-occurences-to-col_info" data-toc-modified-id="macth-number-of-occurences-to-col_info-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>macth number of occurences to col_info</a></span></li><li><span><a href="#Clean" data-toc-modified-id="Clean-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Clean</a></span></li><li><span><a href="#Replace" data-toc-modified-id="Replace-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Replace</a></span></li><li><span><a href="#Combining-securities-with-same-cusip-/-permno-/-permco" data-toc-modified-id="Combining-securities-with-same-cusip-/-permno-/-permco-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Combining securities with same cusip / permno / permco</a></span></li><li><span><a href="#Inspecting-effect-of-cleaning" data-toc-modified-id="Inspecting-effect-of-cleaning-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Inspecting effect of cleaning</a></span><ul class="toc-item"><li><span><a href="#Overview-of-cleaning-the-security-data-by-name" data-toc-modified-id="Overview-of-cleaning-the-security-data-by-name-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Overview of cleaning the security data by name</a></span></li><li><span><a href="#Overview-of-cleaning-by-cusip" data-toc-modified-id="Overview-of-cleaning-by-cusip-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Overview of cleaning by cusip</a></span></li><li><span><a href="#Overview-of-cleaning-by-cusip" data-toc-modified-id="Overview-of-cleaning-by-cusip-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>Overview of cleaning by 
cusip</a></span></li></ul></li><li><span><a href="#Distribution-of-cusip-and-crsp_company_key" data-toc-modified-id="Distribution-of-cusip-and-crsp_company_key-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Distribution of cusip and crsp_company_key</a></span></li></ul></div>

Findings:

Nearly all 'ord' stocks don't have a permno or permco

Some stocks appear twice (at least based on name)

# Analysis of holdings

Description:



In [None]:
import feather
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.preprocessing import normalize, minmax_scale

import matplotlib.pyplot as plt

In [None]:
# --- Fund returns ----------------------------------------------------------
# 'caldt' is renamed to 'report_dt' and a calendar 'year' column is derived
# from it in a single chain.
path = '../data/processed/returns.feather'
returns = (
    feather.read_dataframe(path)
    .rename(columns={'caldt': 'report_dt'})
    .assign(year=lambda frame: frame['report_dt'].dt.year)
)

# --- Row metadata (used below to select rows of the holdings matrix) --------
path = '../data/processed/row_info.feather'
row_info = feather.read_dataframe(path)

# --- Column metadata (one entry per column of the holdings matrix) ----------
path = '../data/processed/col_info.feather'
col_info = feather.read_dataframe(path)

# --- Sparse holdings matrix --------------------------------------------------
path = '../data/processed/holdings.npz'
holdings = sparse.load_npz(path)

# --- Link table (merged on lpermno / gvkey at the end of the notebook) -------
path = '../data/raw/ccm_link.feather'
ccm_link = feather.read_dataframe(path)

In [None]:
# Dimensions of the sparse holdings matrix as loaded.
holdings.shape

In [None]:
# Dimensions of the row metadata table as loaded.
row_info.shape

In [None]:
# Dimensions of the column metadata table as loaded.
col_info.shape

## Other functions

### Filter

In [None]:
def filter_data(year, preprocessing, verbose = False):
    """Restrict the notebook-level data to the funds reporting in ``year``.

    Reads the module-level ``row_info``, ``col_info``, ``returns`` and
    ``holdings`` objects; none of them is modified.

    Parameters
    ----------
    year : int
        Value matched against ``row_info['year']``.
    preprocessing : {'none', 'l1', 'l2'}
        Row normalisation applied to the filtered holdings matrix
        ('none' returns the filtered values unchanged).
    verbose : bool, default False
        Print shapes and the date window.

    Returns
    -------
    tuple
        ``(row_info_f, col_info_f, returns_f, holdings_ft, holdings_b,
        begin_date, end_date)`` where ``holdings_ft`` is the filtered
        (optionally normalised) matrix and ``holdings_b`` has the same
        sparsity pattern with every stored entry set to 1. ``row`` / ``col``
        in the returned info tables are renumbered to match both matrices.

    Raises
    ------
    ValueError
        If ``preprocessing`` is not one of the supported values, if
        ``row_info`` has no row for ``year``, or if ``col_info`` does not
        have one row per column of ``holdings``.
    """
    # Fail early: previously an unknown value surfaced only at the very end
    # as an UnboundLocalError on `holdings_ft`.
    if preprocessing not in ('none', 'l1', 'l2'):
        raise ValueError(
            "preprocessing must be 'none', 'l1' or 'l2', got {!r}".format(preprocessing))

    # The column mask below is built from `holdings` and applied to
    # `col_info`, so the two must line up (they do not if the global
    # `col_info` was rebound to an already filtered table).
    if len(col_info) != holdings.shape[1]:
        raise ValueError(
            'col_info has {} rows but holdings has {} columns'.format(
                len(col_info), holdings.shape[1]))

    row_info_f = row_info[row_info['year'] == year]
    if row_info_f.empty:
        raise ValueError('No rows in row_info for year {}'.format(year))

    # The report date of the first matching row starts the window.
    # NOTE(review): assumes all rows of the year share that date - confirm.
    begin_date = row_info_f['report_dt'].iloc[0]
    end_date = begin_date + pd.DateOffset(years=1) # 1 year offset

    # Filter returns: inside [begin_date, end_date] and for the selected funds
    fundnos = row_info_f['crsp_fundno'].unique()
    in_window = returns['report_dt'].between(begin_date, end_date)
    returns_f = returns[in_window & returns['crsp_fundno'].isin(fundnos)].copy()

    # Change return of month for which holdings apply to 0
    returns_f.loc[returns_f['report_dt'] == begin_date, 'mret'] = 0

    # Drop all funds with first return observation after starting date:
    # their first row (in stored order) does not carry the 0 set above.
    # Funds without any return row in the window are not affected.
    first_obs = returns_f.drop_duplicates('crsp_fundno')
    drop_fundnos = first_obs.loc[first_obs['mret'] != 0, 'crsp_fundno']
    returns_f = returns_f[~returns_f['crsp_fundno'].isin(drop_fundnos)]
    row_info_f = row_info_f[~row_info_f['crsp_fundno'].isin(drop_fundnos)]

    # Filter holdings accordingly and drop every security that is not held
    # by at least one of the remaining funds
    holdings_f = holdings[row_info_f['row'].to_numpy()]

    # Same sparsity pattern, every stored entry set to 1
    holdings_b = sparse.csr_matrix(holdings_f, copy=True)
    holdings_b.data = np.ones(len(holdings_b.data))

    # Column sums on the sparse matrix; no dense copy of the matrix needed
    held_by = np.asarray(holdings_b.sum(axis=0)).ravel()
    col_mask = held_by >= 1

    # reset rows and columns
    row_info_f = row_info_f.reset_index(drop=True)
    row_info_f = row_info_f.assign(row = row_info_f.index)

    col_info_f = col_info[col_mask].reset_index(drop=True)
    col_info_f = col_info_f.assign(col = col_info_f.index)

    holdings_f = holdings_f.tocsc()[:,col_mask].tocsr()
    holdings_b = holdings_b.tocsc()[:,col_mask].tocsr()

    ## Preprocessing
    if preprocessing == 'none':
        holdings_ft = holdings_f
    else:
        holdings_ft = normalize(holdings_f, norm = preprocessing)

    if (verbose):
        print('Shape of holdings:          {:6,d} / {:6,d}'.format(holdings_ft.shape[0],holdings_ft.shape[1]))

        print('Numer of unique securities:      {:10,d}'.format(col_info_f.shape[0]))
        print('Numer of unique funds:           {:10,d}'.format(row_info_f.shape[0]))

        print('Begin date:                      {}'.format(begin_date.date()))
        print('End date:                        {}'.format(end_date.date()))

    return(row_info_f, col_info_f, returns_f, holdings_ft, holdings_b, begin_date, end_date)

In [None]:
# Filter everything to year 2016 without normalising the holdings and print
# the summary (verbose=True).
row_info_f, col_info_f, returns_f, holdings_ft, holdings_b, begin_date, end_date = filter_data(2016,'none',verbose=True)

In [None]:
# Work on an independent copy: with a plain assignment both names refer to
# the same DataFrame, so the column assignments in the following cells would
# silently change `col_info_f` as well.
# NOTE(review): this rebinds the global `col_info` that `filter_data` reads;
# reload col_info from disk before calling `filter_data` again.
col_info = col_info_f.copy()

## Match number of occurrences to col_info

In [None]:
# Column sums of the 0/1 holdings matrix, i.e. how many stored holdings each
# security has. np.asarray(...).ravel() yields a flat vector whether the
# sparse sum comes back as a 1 x n matrix or as a 1-D array, so the result is
# always one value per security with the default 0..n-1 index.
sec_sums = pd.Series(np.asarray(holdings_b.sum(axis=0)).ravel(), name='sum')

In [None]:
# Attach the per-security counts; pandas aligns the values on the index, so
# this relies on col_info still carrying its 0..n-1 index.
col_info['sum'] = sec_sums

In [None]:
# Dimensions of the working column table.
col_info.shape

In [None]:
# Browse 30 securities, starting at position t of the name-sorted table.
t = 2000
col_info.sort_values('security_name').iloc[t:t+30,:]

In [None]:
# Total of `sum` for rows 0..split and for the rows after it.
# `.loc[:6079]` and `.loc[6079:]` are both end-inclusive, so the previous
# version counted row 6079 in both totals; positional slices do not overlap.
# NOTE(review): 6079 is an unexplained hand-picked row position - confirm it
# still marks the intended boundary for the current data.
split = 6079
sums = col_info.reset_index(drop=True)['sum']
print(sums.iloc[:split + 1].sum())
print(sums.iloc[split + 1:].sum())

## Clean

In [None]:
# Remove a trailing ' ORD' from the security names. regex=True is required:
# from pandas 2.0 on str.replace treats the pattern as a literal string by
# default, so ' ORD$' would never match. A trailing ' DR' is not stripped
# here (that replacement was disabled in the original cell).
col_info['security_name'] = col_info['security_name'].str.replace(' ORD$', '', regex=True)

# First seven characters of the cusip.
# NOTE(review): astype(str) turns missing cusips into the text 'nan', so all
# of them end up with the same short cusip - confirm this is acceptable.
col_info['cusip_short'] = col_info['cusip'].astype(str).str[:7]

# Generalise the last character: trailing upper-case letter -> 'D', trailing
# digit -> 'E'. Letters are replaced first, so the freshly written 'E' cannot
# be rewritten to 'D' and no temporary '@' placeholder is needed (the
# placeholder approach also rewrote any genuine '@' inside a cusip).
col_info['cusip_short_f'] = (col_info['cusip_short']
                             .str.replace('[A-Z]$', 'D', regex=True)
                             .str.replace('[0-9]$', 'E', regex=True))

# Seventh character of the cusip.
# NOTE(review): 'type' is not part of the column selection below and is
# therefore discarded again - add it to the list if it is needed.
col_info['type'] = col_info['cusip_short'].str[6:7]

# Keep the identifier columns only; .copy() makes the result an independent
# frame so later column assignments do not act on a slice.
col_info = col_info[['security_name', 'crsp_company_key',
                     'cusip', 'cusip_short', 'cusip_short_f',
                     'permno', 'permco', 'ticker', 'sum']].copy()

In [None]:
# Spot check: securities whose name contains 'DONAL'. na=False maps missing
# names to "no match"; without it a single missing name makes the mask
# non-boolean and the row selection fails.
col_info[col_info['security_name'].str.contains('DONAL', na=False)]

## Replace

In [None]:
# Order the securities by cusip, then permno, and number them in that order.
# `col` is therefore the position in this sorted table.
col_info = (
    col_info
    .sort_values(['cusip', 'permno'])
    .reset_index(drop=True)
    .assign(col=lambda frame: frame.index)
)

In [None]:
# `col_n`: for every security, the `col` of the first row (in the current
# order) that has the same `cusip_short_f`.
# transform('first') returns one value per original row. The previous
# groupby(...).agg('first') returned one row per group, indexed by
# `cusip_short_f`, which cannot be aligned with col_info's integer index.
col_info['col_n'] = col_info.groupby('cusip_short_f')['col'].transform('first')

## Combining securities with same cusip / permno / permco

In [None]:
# Remove the helper column `col_old` if it is present. No cell in this
# notebook creates it, so a plain drop raises KeyError on a fresh
# Restart & Run All; errors='ignore' makes the cell safe to (re-)run.
col_info = col_info.drop(columns='col_old', errors='ignore')

In [None]:
# NOTE(review): this only builds a GroupBy object and displays its repr; no
# aggregation is applied and nothing is stored - the combining step for this
# section looks unfinished.
col_info.groupby('cusip')

## Inspecting effect of cleaning

### Overview of cleaning the security data by name

In [None]:
# Security names: number of rows versus number of distinct names.
col_names = col_info['security_name']
for label, count in (
    ('Number of securities                           {:<6,d}', len(col_names)),
    ('Number of unique securities                    {:<6,d}', len(col_names.unique())),
):
    print(label.format(count))

In [None]:
# Also strip trailing ' ADR' and ' DR' markers before counting distinct
# names. regex=True is required: from pandas 2.0 on the pattern is a literal
# string by default and the '$' anchor would never match.
col_names = col_names.str.replace(' ADR$', '', regex=True)
col_names = col_names.str.replace(' DR$', '', regex=True)
print('Number of unique securities without ORD        {:<6,d}'.format(col_names.unique().shape[0]))

In [None]:
# Securities whose name contains 'fund' in any capitalisation. na=False
# treats missing names as "no match" instead of breaking the boolean mask.
a = col_names[col_names.str.contains('Fund', case = False, na = False)]
print('Number of securities with fund in the name              {:<6,d}'.format(a.shape[0]))

### Overview of cleaning by cusip

In [None]:
# Number of rows, then number of distinct values for each cusip variant.
col_names = col_info['cusip']
print('Number of securities                               {:>10,d}'.format(len(col_names)))

for column, label in (
    ('cusip',         'Number of unique securities by cusip               {:>10,d}'),
    ('cusip_short',   'Number of unique securities by short cusip         {:>10,d}'),
    ('cusip_short_f', 'Number of unique securities by short cusip D/E     {:>10,d}'),
):
    col_names = col_info[column]
    print(label.format(len(col_names.unique())))

### Overview of cleaning by permno

In [None]:
# permno: number of rows versus number of distinct values.
col_names = col_info['permno']
for label, count in (
    ('Number of securities                            {:>6,d}', len(col_names)),
    ('Number of unique securities by permno           {:>6,d}', len(col_names.unique())),
):
    print(label.format(count))

In [None]:
# Generate sparse holdings matrix with boolean values instead of more precise percent_tna values
# (a copy of holdings_ft in CSR format whose stored entries are all set to 1).
# This rebinds the `holdings_b` returned by filter_data.
holdings_b = sparse.csr_matrix(holdings_ft, copy=True)
holdings_b.data = np.ones(len(holdings_ft.data))

In [None]:
def most_common_stocks(top_n=100):
    """Rank securities by how many funds hold them.

    Reads the notebook-level ``holdings_b``, ``row_info_f`` and
    ``col_info_f`` and prints the number of funds.

    Parameters
    ----------
    top_n : int, default 100
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``data`` (sum of the stored holdings values per security),
        ``security_name``, ``crsp_company_key``, ``cusip``, ``permno``,
        ``permco`` and ``percent`` (``data`` relative to the number of rows
        in ``row_info_f``, times 100), sorted by ``data`` in descending order.
    """
    # Change if you want to sum percentages instead of boolean values
    holdings_coo = holdings_b.tocoo()

    no_unique_funds = row_info_f.shape[0]

    # Only the per-column sum of `data` is needed. The former merge with
    # row_info_f added year / lipper_class columns that were summed as well
    # (including the text column) and then thrown away.
    per_security = (pd.DataFrame({'col' : holdings_coo.col,
                                  'data' : holdings_coo.data})
                    .groupby('col', as_index=False)['data']
                    .sum()
                    .sort_values('data', ascending = False))

    id_columns = ['security_name','col','crsp_company_key','cusip','permno','permco']

    # Match on the `col` key itself rather than on col_info_f's row index,
    # so the lookup does not depend on the index happening to equal `col`.
    sum_col = (per_security
               .merge(col_info_f[id_columns], how='left', on='col')
               .assign(percent = lambda x: x['data'] / no_unique_funds * 100)
               .drop(columns=['col'])
               .reset_index(drop=True)
               .head(top_n))

    print(
        'Most held stocks: ','\n\n'
        'Number of funds:    {}'.format(no_unique_funds),'\n'
    )

    return sum_col

In [None]:
# Display the ranking of the most widely held securities.
most_common_stocks()

In [None]:
# Column totals of holdings_ft, transposed into a one-column table named
# 'total'; the cell displays the shape of that table.
holdings_summary = pd.DataFrame(holdings_ft.sum(0).T)
holdings_summary.columns = ['total']
holdings_summary.shape

## Distribution of cusip and crsp_company_key

In [None]:
# Show one randomly chosen row; no random_state is set, so the row differs
# between runs.
col_info.sample()

In [None]:
print('Percentage of identifiers missing:')
# Share of missing values per identifier column. The mean of the boolean
# isna() table is the missing fraction of each column; this avoids
# np.sum(DataFrame), which forwards axis=None to DataFrame.sum and whose
# per-column result for that call is deprecated in recent pandas versions.
col_info[['crsp_company_key','cusip','permno','permco']].isna().mean() * 100

In [None]:
print('Percentage of stocks with same cusip appearing twice:')
# Share of rows whose cusip occurs on more than one row. The previous
# expression (rows after drop_duplicates / all rows) measured the opposite:
# the share of distinct cusips.
# NOTE(review): missing cusips compare equal to each other here and are
# therefore counted as repeated - exclude them first if that is not wanted.
col_info.duplicated(subset='cusip', keep=False).mean() * 100

In [None]:
print('Percentage of stocks with same SHORT cusip appearing twice:')
# First six characters of the cusip (the `cusip_short` column created in the
# Clean section uses seven).
col_info['short_cusip'] = col_info['cusip'].astype(str).str[:6]
# Share of rows whose six-character cusip occurs on more than one row. The
# previous expression (rows after drop_duplicates / all rows) measured the
# share of distinct values instead.
# NOTE(review): astype(str) turns missing cusips into the text 'nan', so they
# all count as repeated - exclude them first if that is not wanted.
col_info.duplicated(subset='short_cusip', keep=False).mean() * 100

In [None]:
# Display the working column table.
col_info

In [None]:
# Browse 100 securities, starting at position t of the name-sorted table.
t = 10000
col_info.sort_values('security_name').iloc[t:t+100,:]

In [None]:
# Spot check: securities whose name contains 'ALLERGAN'. na=False maps
# missing names to "no match"; without it a single missing name makes the
# mask non-boolean and the row selection fails.
col_info[col_info['security_name'].str.contains('ALLERGAN', na=False)]

In [None]:
# Display the security names.
col_info['security_name']

In [None]:
# Display the security names with repeated names removed (first occurrence kept).
col_info['security_name'].drop_duplicates()

In [None]:
# Dimensions of the filtered column table returned by filter_data.
col_info_f.shape

In [None]:
# Count the securities without a permco, then list them.
permco_missing = col_info_f['permco'].isna()
print(permco_missing.sum())
col_info_f[permco_missing]

In [None]:
# Number of securities without a crsp_company_key.
col_info_f['crsp_company_key'].isna().sum()

In [None]:
# Cast the identifier columns to float before they are used as merge keys
# below (permno is matched against ccm_link's lpermno).
# NOTE(review): presumably done so both sides of the merge share a dtype - confirm.
col_info_f['permco'] = col_info_f['permco'].astype(float)
col_info_f['permno'] = col_info_f['permno'].astype(float)
ccm_link['gvkey'] = ccm_link['gvkey'].astype(float)

In [None]:
# Peek at the first rows of the link table.
ccm_link.head()

In [None]:
# NOTE(review): repeats the gvkey cast already applied two cells above.
ccm_link['gvkey'] = ccm_link['gvkey'].astype(float)

In [None]:
# Keep only link rows that carry an lpermno, the key used in the merge below.
ccm_link_f = ccm_link.dropna(subset = ['lpermno'])

In [None]:
# Attach gvkey to every security via permno == lpermno (left merge, so
# securities without a link are kept).
# Only the (gvkey, lpermno) pair is taken from the link table; if the same
# pair sits on several link rows, merging the raw rows repeats the security
# once per row. Deduplicating the pairs first removes that fan-out.
# NOTE(review): a permno linked to several different gvkeys still yields
# several rows - confirm whether that case needs handling.
col_info_fm = col_info_f.merge(
    ccm_link_f[['gvkey','lpermno']].drop_duplicates(),
    how = 'left', left_on = ['permno'], right_on = ['lpermno'])

In [None]:
# Spot check: all link rows of gvkey 5047.
ccm_link_f[ccm_link_f['gvkey'] == 5047.0]

In [None]:
# Display the merged table.
col_info_fm

In [None]:
# Dimensions of the merged table once fully identical rows are removed.
col_info_fm.drop_duplicates().shape