<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#FUNCTIONS---Run-at-start" data-toc-modified-id="FUNCTIONS---Run-at-start-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>FUNCTIONS - Run at start</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Filter-data" data-toc-modified-id="Filter-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Filter data</a></span></li></ul></li><li><span><a href="#Setup-und-Algo" data-toc-modified-id="Setup-und-Algo-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup und Algo</a></span></li><li><span><a href="#Analysis-of-resulting-classifications" data-toc-modified-id="Analysis-of-resulting-classifications-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Analysis of resulting classifications</a></span><ul class="toc-item"><li><span><a href="#Error-volatility" data-toc-modified-id="Error-volatility-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Error volatility</a></span></li><li><span><a href="#Distribution-of-classes-per-classification-technique" data-toc-modified-id="Distribution-of-classes-per-classification-technique-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Distribution of classes per classification technique</a></span></li><li><span><a href="#Transition-tables" data-toc-modified-id="Transition-tables-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Transition tables</a></span></li><li><span><a href="#Comparing-mean-return-per-class-for-the-different-classification-techniques" data-toc-modified-id="Comparing-mean-return-per-class-for-the-different-classification-techniques-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Comparing mean return per class for the different classification techniques</a></span></li></ul></li><li><span><a href="#Sanity-checks" data-toc-modified-id="Sanity-checks-4"><span 
class="toc-item-num">4&nbsp;&nbsp;</span>Sanity checks</a></span><ul class="toc-item"><li><span><a href="#Analysing-individual-portfolios" data-toc-modified-id="Analysing-individual-portfolios-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Analysing individual portfolios</a></span></li><li><span><a href="#Inspecting-individual-nearest-neighbors" data-toc-modified-id="Inspecting-individual-nearest-neighbors-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Inspecting individual nearest neighbors</a></span></li></ul></li><li><span><a href="#Multiprocessing" data-toc-modified-id="Multiprocessing-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Multiprocessing</a></span></li></ul></div>

# Classification

In [1]:
import feather
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product

from scipy import sparse

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

# Progress bar
from ipywidgets import FloatProgress
from IPython.display import display

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

## FUNCTIONS - Run at start

### Load Data

In [2]:
# --- Returns -----------------------------------------------------------
# The date column `caldt` is renamed to `report_dt` and a `year` column is
# derived from it, all in one chained expression.
path = '../data/processed/returns.feather'
returns = (
    feather.read_dataframe(path)
    .rename(columns={'caldt': 'report_dt'})
    .assign(year=lambda frame: frame['report_dt'].dt.year)
)

# --- row_info: one record per holdings-matrix row ----------------------
path = '../data/processed/row_info.feather'
row_info = feather.read_dataframe(path)

# --- col_info: one record per holdings-matrix column -------------------
path = '../data/processed/col_info.feather'
col_info = feather.read_dataframe(path)

# --- Holdings: sparse matrix stored in scipy's .npz format -------------
path = '../data/processed/holdings.npz'
holdings = sparse.load_npz(path)

### Filter data 

In [3]:
# Now filter everything
#######################

def filter_data(year):
    """Select one year's fund reports plus the matching returns and holdings.

    Reads the notebook-level objects ``row_info``, ``returns`` and
    ``holdings``; none of them is modified.

    Parameters
    ----------
    year : int
        Value matched against the ``year`` column of ``row_info``.

    Returns
    -------
    row_info_f : pandas.DataFrame
        Rows of ``row_info`` for ``year`` with a fresh 0..n-1 index.
    returns_f : pandas.DataFrame
        Rows of ``returns`` for the selected funds with ``report_dt`` between
        ``begin_date`` and ``end_date`` (both inclusive). ``mret`` is set to
        0 on ``begin_date``.
    holdings_ft : scipy sparse matrix
        Holdings rows picked through ``row_info_f['row']``, with columns whose
        sum is zero removed and each row rescaled by ``normalize`` (default
        settings).
    begin_date, end_date
        Start and end of the return window.

    Raises
    ------
    ValueError
        If ``row_info`` has no rows for ``year``.
    """
    # query() already returns a new frame, so no up-front copy of the full
    # table and no in-place reset on a filtered slice are needed.
    row_info_f = row_info.query('year == @year').reset_index(drop=True)
    if row_info_f.empty:
        raise ValueError('No rows in row_info for year {}'.format(year))

    # NOTE(review): the window starts at the report date of the FIRST row of
    # the year -- presumably all rows of a year share one date or the table
    # is sorted; confirm.
    begin_date = row_info_f.loc[0, 'report_dt']
    end_date = begin_date + pd.DateOffset(years=1, months=1, days=5)

    # Filter returns to the selected funds and the return window
    crsp_fundno_unique = row_info_f['crsp_fundno'].unique()
    query = '''report_dt >= @begin_date and report_dt <= @end_date and crsp_fundno in @crsp_fundno_unique'''
    returns_f = returns.query(query).copy()  # copy: the slice is written to below

    # Change return of month for which holdings apply to 0
    returns_f.loc[returns_f['report_dt'] == begin_date, 'mret'] = 0

    # Filter holdings accordingly and delete all empty columns
    holdings_f = holdings[row_info_f['row'].values]
    col_sums = np.asarray(holdings_f.sum(axis=0)).ravel()
    holdings_f = holdings_f[:, col_sums != 0]

    ## Preprocessing
    holdings_ft = normalize(holdings_f)

    return(row_info_f, returns_f, holdings_ft, begin_date, end_date)

In [4]:
def classify(row_info_f, holdings_f, param, verbose=True, batch_size=2):
    """Label funds with k-nearest-neighbours, once in full and once iteratively.

    Parameters
    ----------
    row_info_f : pandas.DataFrame
        Must contain a ``lipper_class`` column; row i describes row i of
        ``holdings_f``.
    holdings_f : matrix
        Feature matrix passed to the classifier, one row per fund.
    param : mapping-like (dict or pandas.Series)
        Needs the keys ``n_neighbors``, ``distance_param`` (used as the ``p``
        argument of the classifier) and ``perc_rows_used`` (share of rows to
        re-label: 1 = every row once, 2 = every row twice, ...).
    verbose : bool, default True
        Show a progress bar widget.
    batch_size : int, default 2
        Number of rows that are held out and re-labelled per iteration. The
        default reproduces the original pair-wise behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_lipper`` (original labels), ``model_knn_iterative``
        and ``model_knn_full``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    #### Setup ####
    # int() accepts numpy scalars as well as plain Python numbers, whereas
    # .astype(int) only exists on numpy/pandas values.
    neigh = KNeighborsClassifier(n_neighbors = int(param['n_neighbors']),
                                 p           = int(param['distance_param']),
                                 n_jobs      = 1,
                                 weights     = 'uniform')
    # Data
    X = holdings_f
    n_rows = X.shape[0]
    y = list(row_info_f['lipper_class'].values)

    # Result dataframe; the iterative column starts out as the original labels
    style_df = pd.DataFrame({
        'model_lipper' : y})
    style_df['model_knn_iterative'] = style_df['model_lipper']

    #### Full ####
    # Fit on all rows and predict all rows at once
    neigh.fit(X,y)
    style_df.loc[:,'model_knn_full'] = neigh.predict(X)

    #### Iterative ####
    n_rows_chosen = int(round(n_rows * param['perc_rows_used']))

    # Shuffle the row positions once, then repeat that order as often as
    # needed so that any perc_rows_used is supported.
    order = np.arange(n_rows)
    np.random.shuffle(order)
    n_repeats = max(1, int(np.ceil(n_rows_chosen / n_rows)))
    order = np.tile(order, n_repeats)[:n_rows_chosen]

    progress = FloatProgress(min=0, max=n_rows_chosen) if verbose else None
    if verbose:
        display(progress)

    all_rows = np.arange(n_rows)
    # Slicing in steps keeps the final, possibly shorter batch; pairing the
    # rows with zip(it, it) silently dropped the last row of an odd count.
    for start in range(0, n_rows_chosen, batch_size):
        batch = order[start:start + batch_size]
        mask_oos = np.isin(all_rows, batch)  # rows that get re-labelled now
        mask_is = ~mask_oos                  # rows used for fitting

        # Fit on everything except the held-out rows, using the labels as
        # updated by earlier iterations
        X_sub = X[mask_is]
        y_sub = style_df.loc[mask_is,'model_knn_iterative'].values.tolist()
        neigh.fit(X_sub,y_sub)

        # Predict and save the labels of the held-out rows
        style_df.loc[mask_oos,'model_knn_iterative'] = neigh.predict(X[mask_oos])
        if verbose:
            progress.value += len(batch)

    return(style_df)

In [5]:
def calc_styleadj_returns(row_info_f,returns_f,style_df):
    """Compute style-adjusted fund returns for every classification model.

    Parameters
    ----------
    row_info_f : pandas.DataFrame
        Needs the columns ``crsp_fundno`` and ``report_dt``; row i belongs to
        row i of ``style_df``.
    returns_f : pandas.DataFrame
        Needs the columns ``crsp_fundno``, ``report_dt`` and ``mret``.
    style_df : pandas.DataFrame
        Needs the columns ``model_lipper``, ``model_knn_full`` and
        ``model_knn_iterative``.

    Returns
    -------
    returns_m : pandas.DataFrame
        One row per fund, date and model with the columns ``crsp_fundno``,
        ``report_dt``, ``model``, ``style``, ``fund_ret``, ``style_ret``,
        ``style_cum`` and ``error`` (``fund_ret - style_ret``).
    style_returns : pandas.DataFrame
        Mean return (``mret``) and its cumulative product (``cum_ret``) per
        model, style and date.
    """
    # Listed here instead of read from a notebook-level variable so the
    # function does not depend on a cell that is defined later.
    model_cols = ['model_lipper', 'model_knn_full', 'model_knn_iterative']
    key_cols = ['crsp_fundno', 'report_dt']

    # concat predicted styles to row_info; both are aligned by position, so
    # drop whatever index labels they carry
    row_info_m = pd.concat([row_info_f.reset_index(drop=True),
                            style_df.reset_index(drop=True)],
                           axis = 1)

    # merge predicted styles onto returns
    returns_m = returns_f.merge(row_info_m[key_cols + model_cols],
                                how='left',
                                on=key_cols)

    # melt the different style columns per model into one (from wide to long)
    returns_m = pd.melt(returns_m,
                        id_vars=key_cols + ['mret'],
                        value_vars=model_cols,
                        var_name='model',
                        value_name='style')

    # Carry each fund's style forward to the dates without a report, then
    # drop rows that still have a missing value.
    # NOTE(review): the forward fill follows the row order of returns_f --
    # presumably chronological per fund; confirm.
    returns_m['style'] = returns_m.groupby(['model','crsp_fundno'])['style'].ffill()
    returns_m = returns_m.dropna()

    # Calc mean return per style
    style_returns = (returns_m
                         .groupby(['model','style','report_dt'])['mret']
                         .mean()
                         .reset_index()
    )

    # Calc cumret per style; the grouped cumprod keeps the row index, so the
    # result lines up with style_returns without any positional tricks
    style_returns['cum_ret'] = (style_returns['mret']
                                    .add(1)
                                    .groupby([style_returns['model'], style_returns['style']])
                                    .cumprod()
    )

    # Merge style returns onto fund returns and calc tracking error
    returns_m = (returns_m
                     .rename(columns = {'mret' : 'fund_ret'})
                     .merge(style_returns,
                            how ='left',
                            on = ['model','style','report_dt'])
                     .assign(error = lambda df: df['fund_ret'] - df['mret'])
                     .rename(columns = {'mret' : 'style_ret',
                                        'cum_ret' : 'style_cum'})
    )

    returns_m = returns_m[['crsp_fundno', 'report_dt', 'model', 'style',
                           'fund_ret', 'style_ret', 'style_cum', 'error']]

    return(returns_m, style_returns)

In [6]:
def error_vola_deciles(returns_m):
    """Return the deciles of per-fund error volatility for every model.

    For each (fund, model) pair the standard deviation of ``error`` is taken
    and multiplied by 100. The 0.1 ... 0.9 quantiles of these values are then
    computed per model and laid out with one row per model and one column per
    quantile.
    """
    decile_levels = np.round(np.arange(0.1,1,0.1),2)

    def _deciles(group):
        # Quantiles of the single `error` column of one model
        return group.quantile(decile_levels)

    per_fund = returns_m.groupby(['crsp_fundno','model'])['error'].std()
    per_fund = per_fund.reset_index()
    per_fund['error'] = per_fund['error'] * 100

    stacked = per_fund.groupby('model')[['error']].apply(_deciles)
    # reset_index names the unnamed quantile level `level_1`
    long_form = stacked.reset_index()
    return long_form.pivot(index='model', columns='level_1', values='error')

In [7]:
def full_algo(param):
    """Run the whole pipeline for one parameter combination.

    ``param`` is indexed by the keys ``year``, ``perc_rows_used``,
    ``distance_param`` and ``n_neighbors``. The data of ``param['year']`` is
    filtered, classified and turned into style-adjusted returns; the result is
    the error-volatility decile table with the parameters and the number of
    funds attached as extra columns.
    """
    # Fixed seed so that every call draws the same random numbers
    np.random.seed(0)

    row_info_f, returns_f, holdings_f, _begin, _end = filter_data(param['year'])
    style_df = classify(row_info_f, holdings_f, param, verbose=False)
    returns_m, _style_returns = calc_styleadj_returns(row_info_f,returns_f,style_df)
    deciles = error_vola_deciles(returns_m)

    # Attach the run's settings so results of several runs can be stacked
    run_info = {
        'year'           : param['year'],
        'count'          : row_info_f.shape[0],
        'perc_rows_used' : param['perc_rows_used'],
        'distance_param' : param['distance_param'],
        'n_neighbors'    : param['n_neighbors'],
    }
    for column, value in run_info.items():
        deciles[column] = value

    return(deciles)

In [8]:
def expand_grid(dictionary):
    """Build a DataFrame with one row per combination of the given values.

    The keys of ``dictionary`` become the column names; the rows are the
    Cartesian product of its value lists in ``itertools.product`` order.
    """
    combinations = list(product(*dictionary.values()))
    column_names = list(dictionary.keys())
    return pd.DataFrame(combinations, columns=column_names)

## Setup und Algo

In [9]:
# ---- Options ----------------------------------------------------------
# Class label to work with: choose lipper_class, style_class or cap_class.
# NOTE(review): classify() reads the 'lipper_class' column directly, so this
# option does not appear to be consulted anywhere in the visible code.
style_class = 'lipper_class'

# Names of the label columns, one per classification model.
# Do not change, only names.
cols = [
    'model_lipper',
    'model_knn_full',
    'model_knn_iterative',
]

In [10]:
# Every year from 2010 to 2017; kept for reference, the grid below uses a
# shorter list.
full = list(range(2010, 2018))

# Candidate values per parameter
param_options = {
    'year'           : [2016, 2017],
    'perc_rows_used' : [1],       # 1: all funds are reclassified once, 2: ... twice, etc.
    'distance_param' : [1, 2],    # 1: manhattan distance, 2: euclidian distance
    'n_neighbors'    : [5, 10],   # Number of neighbors to use in k-nn algorithm
}

# One row per parameter combination, plus the same content as nested lists
param_grid = expand_grid(param_options)
param_list = param_grid.values.tolist()

In [11]:
%%time

list_df = []
for i, row in param_grid.iterrows():
    no_rows = param_grid.shape[0]
    params = row
    
    temp = full_algo(params)
    list_df.append(temp)
    
    print('Finished model nr:  {} - {:6.2f}%'.format(i+1, (i+1) / no_rows * 100))
    
results = pd.concat(list_df)

Finished model nr:  1 -  12.50%
Finished model nr:  2 -  25.00%
Finished model nr:  3 -  37.50%
Finished model nr:  4 -  50.00%
Finished model nr:  5 -  62.50%
Finished model nr:  6 -  75.00%
Finished model nr:  7 -  87.50%
Finished model nr:  8 - 100.00%
CPU times: user 2min 29s, sys: 2.56 s, total: 2min 32s
Wall time: 2min 31s


In [14]:
# Show only the rows of the iteratively re-labelled k-NN model
iterative_only = ''' model == 'model_knn_iterative' '''
results.query(iterative_only)

level_1,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,year,count,perc_rows_used,distance_param,n_neighbors
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
model_knn_iterative,0.579172,0.702454,0.846563,0.954365,1.073291,1.22836,1.423165,1.703982,2.190481,2016,1922,1,1,5
model_knn_iterative,0.598628,0.751369,0.890174,1.008805,1.158555,1.299374,1.489972,1.778203,2.243525,2016,1922,1,1,10
model_knn_iterative,0.519595,0.636672,0.741081,0.82265,0.922371,1.040916,1.206072,1.419449,1.832776,2016,1922,1,2,5
model_knn_iterative,0.506521,0.625734,0.73113,0.812776,0.913696,1.041587,1.188808,1.376832,1.809036,2016,1922,1,2,10
model_knn_iterative,0.584785,0.739715,0.892977,0.990125,1.088547,1.195278,1.339054,1.543127,1.86472,2017,1819,1,1,5
model_knn_iterative,0.615481,0.801746,0.953468,1.043319,1.137766,1.245076,1.367628,1.556924,1.873986,2017,1819,1,1,10
model_knn_iterative,0.502842,0.614457,0.703502,0.799158,0.879259,0.976466,1.106778,1.282233,1.618285,2017,1819,1,2,5
model_knn_iterative,0.487451,0.60407,0.688007,0.780615,0.861954,0.960559,1.096075,1.268455,1.60393,2017,1819,1,2,10


In [None]:
# Average the error-volatility deciles over the years for every parameter
# combination, weighting each year by its number of funds.
#
# The weights are matched to rows through the `year` value and the grouping
# uses the parameter columns themselves. Concatenating separately indexed
# pieces (first-occurrence row labels vs. group numbers) attached the deciles
# to the wrong parameter rows.
N_DECILES = 9  # the first nine columns of `results` hold the 0.1 ... 0.9 deciles
param_cols = ['model', 'perc_rows_used', 'distance_param', 'n_neighbors']

decile_cols = list(results.columns[:N_DECILES])
results_flat = results.reset_index()  # turns the `model` index into a column

# One weight per year: that year's fund count divided by the total count
weights = results_flat.groupby('year')['count'].first()
weights = weights / weights.sum()

# Scale every row by the weight of its year, then add up the years within
# each parameter combination
deciles = (results_flat[decile_cols]
               .mul(results_flat['year'].map(weights), axis=0)
               .groupby([results_flat[name] for name in param_cols])
               .sum()
               .reset_index())

deciles

## Analysis of resulting classifications

### Error volatility

In [None]:
# `full_algo` keeps its intermediate frames local, so `returns_m`, `style_df`
# and `style_returns` do not exist at notebook level after the grid run.
# Rebuild them here for ONE parameter combination so this section runs on a
# fresh kernel.
# NOTE(review): the last row of `param_grid` is an arbitrary pick -- choose
# the configuration you want to inspect.
analysis_param = param_grid.iloc[-1]

np.random.seed(0)  # same seed as full_algo
row_info_f, returns_f, holdings_f, begin_date, end_date = filter_data(analysis_param['year'])
style_df = classify(row_info_f, holdings_f, analysis_param, verbose=False)
returns_m, style_returns = calc_styleadj_returns(row_info_f, returns_f, style_df)

# Standard deviation of the error per fund and model, scaled by 100
error_vol = (returns_m
                 .groupby(['crsp_fundno','model'])['error']
                 .std()
                 .reset_index())

error_vol['error'] = error_vol['error'] * 100
error_vol.groupby('model')['error'].describe()

In [None]:
# Overlay one histogram of `error` per model on a single axis; only values
# below 5 are drawn.
fig, ax = plt.subplots(figsize=(20,5))
below_cutoff = ''' model == @col and error < 5'''
for col in cols:
    model_errors = error_vol.query(below_cutoff)['error']
    model_errors.hist(ax = ax, label = col, bins = 100, alpha = 0.5)

ax.set_xlim(-1,5)
fig.legend()
plt.show()

### Distribution of classes per classification technique

In [None]:
# Requires `style_df` (one label column per model) at notebook level.

# Share of funds whose k-NN label equals the Lipper label
print('Overlap between Lipper class and: \n')
overlap_reports = [
    ('Knn full prediction:         {:2.2f}%', 'model_knn_full'),
    ('Knn iterative prediction:    {:2.2f}%', 'model_knn_iterative'),
]
n_funds = len(style_df.index)
for template, model_col in overlap_reports:
    n_equal = np.sum(style_df['model_lipper'] == style_df[model_col])
    print(template.format(n_equal / n_funds * 100))

# Relative frequency of every class per model, reshaped to long format
data = style_df.apply(pd.Series.value_counts, normalize = True)
model_columns = list(data.columns)
data = (data
            .assign(style = data.index)
            .melt(id_vars = 'style', value_vars = model_columns))

sns.set()
plt.figure(figsize=(18,10))
g = sns.barplot(data = data, y = 'style', x = 'value', hue = 'variable')

plt.title('Style distribution')
plt.ylabel('Style')
plt.xlabel('Percentage')

plt.show()

### Transition tables

In [None]:
# Cross-tabulate Lipper labels against the iterative k-NN labels as shares of
# all funds (with margins), shown in percent with two decimals.
transition = pd.crosstab(style_df['model_lipper'],
                         style_df['model_knn_iterative'],
                         margins=True,
                         normalize='all')
(transition * 100).round(2)

### Comparing mean return per class for the different classification techniques

In [None]:
# One panel per model: cumulative return of every class over time
fig, ax = plt.subplots(figsize=(20,5),ncols=3, sharey='row')
for panel, col in zip(ax, cols):
    sns.lineplot(data = style_returns.query(''' model == @col '''),
                 x='report_dt', y='cum_ret', hue='style', ax=panel)

ax[0].set_ylabel('Cumulative return per class')

# Panel titles are the model names; tilt the date labels for readability
for panel, panel_title in zip(ax, cols):
    panel.set_title(panel_title, fontsize = 16)
    panel.set_xlabel('')
    for label in panel.get_xticklabels():
        label.set_rotation(45)

# Keep only the legend of the first panel
for panel in ax[1:3]:
    panel.get_legend().remove()

plt.show()

## Sanity checks

### Analysing individual portfolios

In [None]:
# First rows where column `true` is 'V' while column `iterative_5` is 'G'.
# NOTE(review): `comparing_df` is not created anywhere in the visible part of
# the notebook -- it must be defined before this cell can run; confirm where
# it comes from (presumably a comparison of true vs. predicted labels).
comparing_df.query('''true == 'V' and iterative_5 == 'G' ''').head()

In [None]:
# Largest holdings of one fund in 2017.
# `row_info` is a required argument of most_common_stocks_fund and was
# missing from the call. The function is defined in the cell below, so that
# cell has to be run first.
crsp_fundno = 18307
most_common_stocks_fund(crsp_fundno=crsp_fundno, row_info=row_info, year=2017)

In [None]:
def most_common_stocks_fund(crsp_fundno,row_info,year):
    """Print a short summary and return the ten largest holdings of a fund.

    Reads the notebook-level objects ``holdings`` (sparse matrix) and
    ``col_info`` (needs the columns ``col`` and ``security_name``).

    Parameters
    ----------
    crsp_fundno
        Fund identifier matched against ``row_info['crsp_fundno']``.
    row_info : pandas.DataFrame
        Needs the columns ``year``, ``row`` and ``crsp_fundno``; ``row`` is
        used to pick rows of ``holdings``.
    year
        Value matched against ``row_info['year']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``security_name`` and ``percent``, at most ten rows, sorted by
        ``percent`` in descending order. ``percent`` is the mean of the stored
        holdings values of a security over the fund's rows in that year.
    """
    my_filter = '''year == @year and crsp_fundno == @crsp_fundno '''
    fund_rows = row_info.query(my_filter)
    no_unique_funds = fund_rows.shape[0]

    # Only the fund's own rows are converted, not the whole holdings matrix
    fund_holdings = holdings[fund_rows['row'].values].tocoo()
    entries = pd.DataFrame({'col'  : fund_holdings.col,
                            'data' : fund_holdings.data})

    # Security names are matched on the `col` value rather than on the
    # positional index of col_info
    sum_col = (entries
               .groupby('col', as_index=False)['data']
               .mean()
               .sort_values('data',ascending = False)
               .merge(col_info[['col','security_name']],how='left',on='col')
               .rename(columns={'data' : 'percent'})
               [['security_name','percent']]
               .reset_index(drop=True)
               .head(10))

    # NOTE(review): the header line shows the third column of row_info for
    # this fund -- presumably the fund name; confirm.
    fund_any_year = row_info.query('crsp_fundno == @crsp_fundno')
    fund_label = fund_any_year.iloc[0,2] if not fund_any_year.empty else 'n/a'

    print(
        'Average of most held stocks for one fund in one year: ','\n\n'
        '{}'.format(fund_label),'\n\n'
        'crsp_fundno:                            {}'.format(crsp_fundno),'\n'
        'Number of observations in that year:    {}'.format(no_unique_funds))

    return sum_col

### Inspecting individual nearest neighbors

In [None]:
# Query the fitted model for the neighbours of the row at position 1234.
# NOTE(review): `neigh`, `X` and `n_neighbors` are not defined at notebook
# level in the visible code (`neigh` and `X` only exist inside classify), so
# they must be created before this cell can run.
neigh.kneighbors(X[1234],n_neighbors = n_neighbors)

In [None]:
def name_nearestneighbors(row_info,neigh,i,n_neighbors = 5,features = None):
    """Print the name and number of fund ``i`` and the names of its neighbours.

    Parameters
    ----------
    row_info : pandas.DataFrame
        Needs the columns ``fund_name`` and ``crsp_fundno``; looked up by
        index label with ``.loc``.
    neigh
        Fitted model offering ``kneighbors(x, n_neighbors=...)``.
    i
        Index label in ``row_info`` and position in the feature matrix.
    n_neighbors : int, default 5
        Number of neighbours requested from ``neigh``. The first one returned
        is skipped in the printout, so ``n_neighbors - 1`` names are shown.
    features : matrix, optional
        Feature matrix to take row ``i`` from. If omitted, the notebook-level
        variable ``X`` is used.

    NOTE(review): the positions returned by ``kneighbors`` are used as index
    labels of ``row_info`` -- this assumes ``row_info`` carries a 0..n-1 index
    in the same order as the data the model was fitted on; confirm.
    """
    data = X if features is None else features

    print('Name:')
    print(row_info.loc[i].fund_name)
    print(row_info.loc[i].crsp_fundno)
    print('\nNearest Neighbors:')
    nn_index = neigh.kneighbors(data[i],n_neighbors = n_neighbors)[1].flatten()
    nn_names = row_info.loc[nn_index].fund_name.values

    # Skip the first hit -- presumably the queried fund itself
    for name in nn_names[1:]:
        print(name)

In [None]:
# Print fund 1234 and its nearest neighbours.
# NOTE(review): `neigh` is not defined at notebook level in the visible code
# (it only exists inside classify); a fitted model must be created first.
name_nearestneighbors(row_info,neigh,i = 1234, n_neighbors = 5)

In [None]:
# NOTE(review): the call omitted the required `row_info` and `year`
# arguments; 2017 mirrors the earlier example call -- confirm the intended year.
crsp_fundno = 36608
most_common_stocks_fund(crsp_fundno=crsp_fundno, row_info=row_info, year=2017)

In [None]:
# NOTE(review): the call omitted the required `row_info` and `year`
# arguments; 2017 mirrors the earlier example call -- confirm the intended year.
crsp_fundno = 3690
most_common_stocks_fund(crsp_fundno=crsp_fundno, row_info=row_info, year=2017)

## Multiprocessing

In [None]:
# Argument list for `pool.starmap`: one 1-tuple per parameter combination, so
# every worker call becomes full_algo(<parameter row>). The rows are taken
# from `param_grid` (not `param_list`) because full_algo looks its parameters
# up by name, e.g. param['year'].
a = [(row,) for _, row in param_grid.iterrows()]

In [None]:

import multiprocessing

# starmap unpacks every element of `a` into the positional arguments of
# full_algo, so each element has to be a 1-tuple holding one parameter row.
# The `with` block shuts the worker processes down once the results are in.
# NOTE(review): functions defined in a notebook can only be sent to workers
# when processes are started by forking; with the "spawn" start method move
# full_algo and its helpers into an importable module.
with multiprocessing.Pool(processes=4) as pool:
    result_list = pool.starmap(full_algo, a)
