<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#FUNCTIONS---Run-at-start" data-toc-modified-id="FUNCTIONS---Run-at-start-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>FUNCTIONS - Run at start</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Filter" data-toc-modified-id="Filter-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Filter</a></span></li><li><span><a href="#Classify" data-toc-modified-id="Classify-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Classify</a></span></li></ul></li><li><span><a href="#Setup-und-Algo" data-toc-modified-id="Setup-und-Algo-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup und Algo</a></span><ul class="toc-item"><li><span><a href="#Multiprocessing" data-toc-modified-id="Multiprocessing-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Multiprocessing</a></span></li></ul></li><li><span><a href="#Analysis-of-resulting-classifications" data-toc-modified-id="Analysis-of-resulting-classifications-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Analysis of resulting classifications</a></span><ul class="toc-item"><li><span><a href="#Error-volatility" data-toc-modified-id="Error-volatility-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Error volatility</a></span></li><li><span><a href="#Distribution-of-classes-per-classification-technique" data-toc-modified-id="Distribution-of-classes-per-classification-technique-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Distribution of classes per classification technique</a></span></li><li><span><a href="#Transition-tables" data-toc-modified-id="Transition-tables-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Transition tables</a></span></li><li><span><a href="#Comparing-mean-return-per-class-for-the-different-classification-techniques" 
data-toc-modified-id="Comparing-mean-return-per-class-for-the-different-classification-techniques-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Comparing mean return per class for the different classification techniques</a></span></li></ul></li><li><span><a href="#Sanity-checks" data-toc-modified-id="Sanity-checks-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Sanity checks</a></span><ul class="toc-item"><li><span><a href="#Analysing-individual-portfolios" data-toc-modified-id="Analysing-individual-portfolios-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Analysing individual portfolios</a></span></li><li><span><a href="#Inspecting-individual-nearest-neighbors" data-toc-modified-id="Inspecting-individual-nearest-neighbors-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Inspecting individual nearest neighbors</a></span></li><li><span><a href="#Tests" data-toc-modified-id="Tests-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Tests</a></span></li></ul></li></ul></div>

# Classification 

In [None]:
import feather
import numpy as np
import pandas as pd
from scipy import sparse

import seaborn as sns
import matplotlib.pyplot as plt

from multiprocessing import Pool
from itertools import product

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error

# Progress bar
from ipywidgets import FloatProgress
from IPython.display import display

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

import logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s - %(message)s')

## FUNCTIONS - Run at start

### Load Data

In [None]:
### Returns: monthly fund returns, keyed by report date with a helper year column
path = '../data/processed/returns.feather'
returns = (feather.read_dataframe(path)
               .rename(columns = {'caldt' : 'report_dt'})
               .assign(year = lambda df: df['report_dt'].dt.year))

### row_info: one row of metadata per row of the holdings matrix
path = '../data/processed/row_info_f.feather'
row_info = feather.read_dataframe(path)

### col_info: one row of metadata per column of the holdings matrix
path = '../data/processed/col_info_f.feather'
col_info = feather.read_dataframe(path)

### Holdings: sparse fund-by-security matrix
path = '../data/processed/holdings_f.npz'
holdings = sparse.load_npz(path)

### Filter

In [None]:
def filter_data(param, verbose = False):
    """Select one cross-section of funds with matching returns and holdings.

    Reads the notebook-level ``row_info``, ``returns`` and ``holdings``
    objects; none of them is modified.

    Parameters
    ----------
    param : pandas.DataFrame
        Parameter frame whose row ``0`` is read. ``year`` is a year or the
        string ``'full'`` (no year filter). ``preprocessing`` is ``1``
        (``'l1'``), ``2`` (``'l2'``) or ``0`` / ``'none'``; if the column is
        absent the holdings are returned without normalisation.
    verbose : bool, default False
        Print the number of funds and securities and the date window.

    Returns
    -------
    tuple
        ``(row_info_f, returns_f, holdings_ft, begin_date, end_date)``.
        ``row_info_f`` carries a fresh 0..n-1 index so that its positions
        line up with the rows of ``holdings_ft``.

    Raises
    ------
    ValueError
        If the preprocessing code is unknown or no fund matches ``year``.
    """
    year = param.loc[0,'year']

    # Accept both the numeric grid codes and the norm names themselves.
    norm_by_code = {0: 'none', 1: 'l1', 2: 'l2',
                    'none': 'none', 'l1': 'l1', 'l2': 'l2'}
    code = param.loc[0,'preprocessing'] if 'preprocessing' in param.columns else 'none'
    if code not in norm_by_code:
        raise ValueError('Unknown preprocessing code: {!r}'.format(code))
    preprocessing = norm_by_code[code]

    row_info_f = row_info.copy()
    if not (isinstance(year, str) and year == 'full'):    # 'full' keeps the whole sample
        row_info_f = row_info_f.query('year == @year')
    if row_info_f.empty:
        raise ValueError('No rows in row_info for year {!r}'.format(year))

    # The window starts at the report date of the first selected row.
    begin_date = row_info_f.iloc[0,:]['report_dt']
    end_date = begin_date + pd.DateOffset(years=1) # 1 year offset

    # Filter returns to the window and to the selected funds
    crsp_fundno_unique = row_info_f['crsp_fundno'].unique()
    query = '''report_dt >= @begin_date and report_dt <= @end_date and crsp_fundno in @crsp_fundno_unique'''
    returns_f = returns.query(query).copy()

    # Change return of month for which holdings apply to 0
    returns_f.loc[returns_f['report_dt'] == begin_date, 'mret'] = 0

    # Drop all funds with first return observation after starting date
    # (their first remaining row does not carry the zero written above).
    drop_fundnos = returns_f.drop_duplicates('crsp_fundno').query('mret != 0')['crsp_fundno']
    returns_f = returns_f.query('crsp_fundno not in @drop_fundnos')
    # Reset the index AFTER dropping funds: downstream code pairs row_info_f
    # with position-indexed results, which needs a gap-free 0..n-1 index.
    row_info_f = (row_info_f
                      .query('crsp_fundno not in @drop_fundnos')
                      .reset_index(drop = True))

    # Filter holdings accordingly and delete all securities with less than two observations
    holdings_f = sparse.csr_matrix(holdings[row_info_f['row'].to_numpy()])
    # Stored entries per column, counted without densifying the matrix.
    col_mask = holdings_f.getnnz(axis=0) >= 2
    holdings_f = holdings_f.tocsc()[:,col_mask].tocsr()

    ## Preprocessing
    if preprocessing == 'none':
        holdings_ft = holdings_f
    else:
        holdings_ft = normalize(holdings_f, norm = preprocessing)

    if (verbose):
        print('Numer of unique funds:           {:10,d}'.format(row_info_f.shape[0]))
        print('Numer of unique securities:      {:10,d}'.format(holdings_ft.shape[1]))
        print('Begin date:                      {}'.format(begin_date.date()))
        print('End date:                        {}'.format(end_date.date()))

    return(row_info_f, returns_f, holdings_ft, begin_date, end_date)

### Classify

In [None]:
def classify(row_info_f, holdings_f, param, verbose=False):
    """Relabel funds with k-nearest-neighbour classifiers on their holdings.

    Two variants are produced next to the original Lipper label:

    * ``model_knn_loo``: leave-one-out prediction, always trained on the
      original labels of all other funds.
    * ``model_knn_iterative``: funds are visited in a shuffled order and each
      prediction overwrites the fund's label before the next fit, so later
      predictions see earlier relabellings.

    Parameters
    ----------
    row_info_f : pandas.DataFrame
        Must contain ``lipper_class``; rows are paired with ``holdings_f`` by
        position.
    holdings_f : sparse matrix
        Feature matrix, one row per fund.
    param : pandas.DataFrame
        Row ``0`` supplies ``n_neighbors``, ``distance_param`` (Minkowski
        ``p``), ``weights`` (``1``/``'distance'`` or ``2``/``'uniform'``) and
        ``perc_rows_used`` (number of relabelling passes, may be fractional).
    verbose : bool, default False
        Show a progress bar for the iterative pass.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_lipper``, ``model_knn_loo``, ``model_knn_iterative``
        with a 0..n-1 index.

    Raises
    ------
    ValueError
        If the ``weights`` code is unknown.
    """
    np.random.seed(1)  # fixed seed: the visiting order below is reproducible

    weight_by_code = {1: 'distance', 2: 'uniform',
                      'distance': 'distance', 'uniform': 'uniform'}
    weight_code = param.loc[0,'weights']
    if weight_code not in weight_by_code:
        raise ValueError('Unknown weights code: {!r}'.format(weight_code))
    weights = weight_by_code[weight_code]

    #### Setup ####
    # Classifier
    neigh = KNeighborsClassifier(n_neighbors = int(param.loc[0,'n_neighbors']),
                                 p           = int(param.loc[0,'distance_param']),
                                 weights     = weights,
                                 n_jobs      = 1)

    # Data
    X = holdings_f
    y = row_info_f['lipper_class'].values
    n_rows = holdings_f.shape[0]

    # Result dataframe
    style_df = pd.DataFrame({'model_lipper' : y})

    #### Leave-one-out ####
    loo_labels = np.empty(n_rows, dtype=y.dtype)
    for train_index, test_index in LeaveOneOut().split(X):
        neigh.fit(X[train_index], y[train_index])
        loo_labels[test_index] = neigh.predict(X[test_index])
    style_df['model_knn_loo'] = loo_labels

    #### Iterative ####
    # Visiting order: one shuffled permutation, repeated for as many passes as
    # perc_rows_used asks for, then cut to the requested number of steps.
    order = np.arange(n_rows)
    np.random.shuffle(order)
    n_rows_chosen = int(round(n_rows * param.loc[0,'perc_rows_used']))
    n_passes = max(1, -(-n_rows_chosen // n_rows))  # ceiling division
    order = np.tile(order, n_passes)[:n_rows_chosen]

    progress = None
    if(verbose):
        progress = FloatProgress(min=0, max=n_rows_chosen)
        display(progress)

    # Work on a plain array; labels are updated in place as the loop advances.
    iterative_labels = style_df['model_lipper'].to_numpy(copy=True)
    all_rows = np.arange(n_rows)

    for i in order:
        mask_oos = all_rows == i   # the single row that gets predicted
        mask_is = ~mask_oos        # every other row is used for fitting

        # Fit knn model on all but the chosen row, using the current labels
        neigh.fit(X[mask_is], iterative_labels[mask_is])

        # Predict and save label for the chosen row
        iterative_labels[mask_oos] = neigh.predict(X[mask_oos])
        if progress is not None:
            progress.value += 1

    style_df['model_knn_iterative'] = iterative_labels

    return(style_df)

In [None]:
def calc_styleadj_returns(row_info_f,returns_f,style_df):
    """Compute style returns and fund-level tracking errors per model.

    Parameters
    ----------
    row_info_f : pandas.DataFrame
        One row per fund with ``crsp_fundno`` and ``report_dt``.
    returns_f : pandas.DataFrame
        Fund returns with ``crsp_fundno``, ``report_dt`` and ``mret``.
    style_df : pandas.DataFrame
        Columns ``model_lipper``, ``model_knn_loo``, ``model_knn_iterative``;
        row *k* belongs to row *k* of ``row_info_f`` (matched by position).

    Returns
    -------
    tuple of pandas.DataFrame
        ``returns_m`` in long format with columns ``crsp_fundno``,
        ``report_dt``, ``model``, ``style``, ``fund_ret``, ``style_ret``,
        ``style_cum``, ``error`` and ``style_returns`` with columns
        ``model``, ``style``, ``report_dt``, ``mret``, ``cum_ret``.
    """
    style_cols = ['model_lipper', 'model_knn_loo', 'model_knn_iterative']
    key_cols = ['crsp_fundno', 'report_dt']

    # Attach predicted styles to row_info by position; both indexes are reset
    # so that a gapped index on either side cannot misalign the labels.
    row_info_m = pd.concat([row_info_f.reset_index(drop=True),
                            style_df.reset_index(drop=True)],
                           axis = 1)

    # merge predicted styles onto returns
    returns_m = returns_f.merge(row_info_m[key_cols + style_cols],
                                how='left',
                                on=key_cols)

    # melt the different style columns per model into one (from wide to long)
    returns_m = pd.melt(returns_m,
                        id_vars=key_cols + ['mret'],
                        value_vars=style_cols,
                        var_name='model',
                        value_name='style')

    # Styles are only known on the report date: carry them forward within
    # each (model, fund) and drop rows that still have missing values.
    returns_m['style'] = (returns_m
                              .groupby(['model','crsp_fundno'])['style']
                              .ffill())
    returns_m = returns_m.dropna()

    # Calc mean return per style
    style_returns = (returns_m
                         .groupby(['model','style','report_dt'], as_index=False)['mret']
                         .mean())

    # Calc cumret per style; the result keeps style_returns' index, so the
    # assignment aligns by label instead of by position.
    style_returns['cum_ret'] = (style_returns['mret']
                                    .add(1)
                                    .groupby([style_returns['model'], style_returns['style']])
                                    .cumprod())

    # Merge style returns onto fund returns and calc tracking error
    returns_m = (returns_m
                     .rename(columns = {'mret' : 'fund_ret'})
                     .merge(style_returns,
                            how = 'left',
                            on = ['model','style','report_dt'])
                     .assign(error = lambda df: df['fund_ret'] - df['mret'])
                     .rename(columns = {'mret' : 'style_ret',
                                        'cum_ret' : 'style_cum'}))

    returns_m = returns_m[['crsp_fundno', 'report_dt', 'model', 'style',
                           'fund_ret', 'style_ret', 'style_cum', 'error']]

    return(returns_m, style_returns)

In [None]:
def error_vola_deciles(returns_m):
    """Return the deciles of per-fund tracking-error volatility for each model.

    The standard deviation of ``error`` is taken per (fund, model), scaled to
    percent, and the 0.1 ... 0.9 quantiles across funds are laid out with one
    row per model and one column per quantile.
    """
    decile_levels = np.round(np.arange(0.1,1,0.1),2)

    # Volatility of the tracking error per fund and model, in percent
    per_fund = returns_m.groupby(['crsp_fundno','model'])['error'].std().reset_index()
    per_fund['error'] = per_fund['error'] * 100

    # Quantiles across funds; reset_index exposes the quantile level as 'level_1'
    by_model = per_fund.groupby('model')[['error']]
    long_deciles = by_model.apply(lambda grp : grp.quantile(decile_levels)).reset_index()

    error_vol = long_deciles.pivot(columns='level_1',values='error',index='model')
    return(error_vol)

In [None]:
def weighted_average_score_new(param_grid, relevant_params, measures):
    """Aggregate per-year scores into one weighted score per parameter set.

    Parameters
    ----------
    param_grid : pandas.DataFrame
        One row per run. The code reads the columns ``model``,
        ``preprocessing``, ``year`` and ``count`` plus everything named in
        ``relevant_params`` and ``measures``.
    relevant_params : list
        Columns that identify a parameter combination (grouped into ``param_id``).
    measures : list
        Score columns to be averaged.

    Returns
    -------
    pandas.DataFrame
        The non-measure columns (one row per ``relevant_params`` combination,
        without ``year`` and ``count``) merged with the weighted measures on
        ``param_id``.
    """

    # NOTE(review): the result of reset_index() is discarded, so this line
    # leaves param_grid unchanged.
    param_grid.reset_index()
    # Missing values are replaced by 0 before grouping and summing.
    param_grid = param_grid.fillna(value=0)
    param_grid['param_id'] = (param_grid
                                  .groupby(relevant_params)
                                  .ngroup())
    
    # Fix for issue with same param_id for lipper rows:
    # rows of the 'lipper' model get negative ids (-1, -2, ...), one per
    # preprocessing value, so they cannot collide with the ngroup ids above.
    lipper_rows = param_grid.loc[param_grid['model'] == 'lipper',:].copy()
    lipper_rows['param_id'] = lipper_rows.groupby(['preprocessing']).ngroup()
    lipper_rows['param_id'] = (lipper_rows['param_id'] + 1) * -1
    param_grid.loc[param_grid['model'] == 'lipper'] = lipper_rows

    scores = param_grid[measures]
    params_only = param_grid.drop(columns = measures)
    
    # One weight per distinct (year, count) pair: count divided by total count.
    weights = (param_grid[['year','count']]
                                  .drop_duplicates()
                                  .assign(weight = lambda x: x['count'] / np.sum(x['count'])))
    weights = weights[['weight']].values

    # NOTE(review): weights is an (n, 1) array multiplied against each group by
    # position; presumably every param_id group must hold exactly one row per
    # (year, count) pair in the same order as weights - confirm with callers.
    scores = scores.groupby(params_only['param_id']).apply(lambda x: np.sum(x * weights))

    params_only = (params_only
                      .drop_duplicates(relevant_params)
                      .drop(columns = ['year','count']))

    result = params_only.merge(scores, how = 'left', on = 'param_id')
    
    return(result)

In [None]:
def weighted_average_score(results):
    """Average the decile columns over years, weighted by fund count.

    Parameters
    ----------
    results : pandas.DataFrame
        Concatenated output of ``full_algo``: the first nine columns are the
        decile values, the index holds the model name, and the remaining
        columns are ``year``, ``count`` and the run parameters.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (``model``, ``perc_rows_used``,
        ``distance_param``, ``n_neighbors``, ``weights`` and, when present,
        ``preprocessing``) with a ``param_id`` column and the nine weighted
        decile columns. ``year`` and ``count`` are removed.

    Notes
    -----
    Each distinct ``(year, count)`` pair gets the weight
    ``count / sum(count)``. Weights are attached to rows by their year and
    count, so the result does not depend on the row order inside a group.
    """
    n_deciles = 9
    decile_cols = list(results.columns[:n_deciles])
    flat = results.reset_index()

    # 'preprocessing' is part of a run's identity whenever it was recorded;
    # leaving it out would blend runs with different normalisations.
    key_cols = ['model','perc_rows_used', 'distance_param', 'n_neighbors', 'weights']
    if 'preprocessing' in flat.columns:
        key_cols.append('preprocessing')

    year_weights = (flat[['year','count']]
                        .drop_duplicates()
                        .assign(weight = lambda x: x['count'] / np.sum(x['count'])))
    # A left merge keeps the row order of `flat`, giving one weight per row.
    row_weight = (flat[['year','count']]
                      .merge(year_weights, how = 'left', on = ['year','count'])['weight']
                      .to_numpy())

    param_id = flat.groupby(key_cols).ngroup().rename('param_id')

    deciles = (flat[decile_cols]
                   .mul(row_weight, axis = 0)
                   .groupby(param_id)
                   .sum()
                   .reset_index())

    param_grid = (flat
                      .drop(columns = decile_cols)
                      .assign(param_id = param_id)
                      .drop_duplicates(key_cols)
                      .drop(columns = ['year','count']))

    result = param_grid.merge(deciles, how = 'left', on = 'param_id')

    return(result)

In [None]:
def full_algo(year,perc_rows_used,distance_param,n_neighbors,weights,preprocessing):
    """Run filter -> classify -> style returns -> error deciles for one setting.

    Parameters
    ----------
    year : int or str
        Year of the cross-section, or ``'full'`` for the whole sample.
    perc_rows_used : int or float
        Number of relabelling passes of the iterative classifier.
    distance_param : int
        Minkowski ``p`` of the k-nn distance.
    n_neighbors : int
        Number of neighbours.
    weights : int
        ``1`` for ``'distance'``, ``2`` for ``'uniform'``.
    preprocessing : int
        ``1`` for ``'l1'``, ``2`` for ``'l2'``.

    Returns
    -------
    pandas.DataFrame
        Output of ``error_vola_deciles`` with the run parameters and the
        number of funds (``count``) appended as columns. ``weights`` and
        ``preprocessing`` are stored under their readable names.
    """
    # Build the frame column by column so each parameter keeps its own dtype;
    # packing them into one array would turn everything into strings as soon
    # as year is 'full'.
    param = pd.DataFrame({'year'           : [year],
                          'perc_rows_used' : [perc_rows_used],
                          'distance_param' : [distance_param],
                          'n_neighbors'    : [n_neighbors],
                          'weights'        : [weights],
                          'preprocessing'  : [preprocessing]})

    # Readable labels for the result table; unknown codes are passed through.
    weights_label       = {1: 'distance', 2: 'uniform'}.get(weights, weights)
    preprocessing_label = {1: 'l1', 2: 'l2'}.get(preprocessing, preprocessing)

    np.random.seed(0)

    row_info_f, returns_f, holdings_f, _, _ = filter_data(param)
    logging.debug('Data loaded and filtered')

    style_df = classify(row_info_f, holdings_f, param, verbose=False)
    logging.debug('Classification complete')

    returns_m, _ = calc_styleadj_returns(row_info_f, returns_f, style_df)
    logging.debug('Style calculated')

    deciles = error_vola_deciles(returns_m)
    logging.debug('Deciles calculated')
    logging.info('Algo completed')

    deciles['year']           = param.loc[0,'year']
    deciles['count']          = row_info_f.shape[0]
    deciles['perc_rows_used'] = param.loc[0,'perc_rows_used']
    deciles['distance_param'] = param.loc[0,'distance_param']
    deciles['n_neighbors']    = param.loc[0,'n_neighbors']
    deciles['weights']        = weights_label
    deciles['preprocessing']  = preprocessing_label

    return(deciles)

In [None]:
def expand_grid(dictionary):
    """Return a frame with one row per combination of the dictionary's value lists.

    Columns follow the key order of ``dictionary``; rows follow the order of
    ``itertools.product`` (last key varies fastest).
    """
    combinations = list(product(*dictionary.values()))
    return(pd.DataFrame(combinations, columns=dictionary.keys()))

## Setup und Algo

In [None]:
#### Options #####
##################
# Label column to classify on: lipper_class, style_class or cap_class.
# NOTE(review): classify() reads row_info_f['lipper_class'] directly, so this
# option does not appear to be consumed by the functions above - confirm.
style_class      = 'lipper_class' # Choose lipper_class, style_class or cap_class
# Names of the three label columns, in plotting order. They must match the
# column names produced by classify(); change them only together with it.
cols             = ['model_lipper',
                    'model_knn_loo',
                    'model_knn_iterative'] # Do not change, only names

In [None]:
# Years of the full sample; handy to paste into `year` below, not read by the grid itself.
full = [2010,2011,2012,2013,2014,2015]
param_grid = dict(year           = [2010], # Integer year, or the string 'full' for the whole sample
                  perc_rows_used = [1],         # 1: all funds are reclassified once, 2: ... twice, etc.
                  distance_param = [2],       # 1: Manhattan distance, 2: Euclidean distance
                  n_neighbors    = [10],        # Number of neighbors to use in k-nn algorithm
                  weights        = [1],          # 1: 'distance', 2: 'uniform'
                  preprocessing  = [2]       # 1: 'l1' normalisation, 2: 'l2' normalisation
                 )

# Expand to one row per parameter combination, then to plain tuples in the
# argument order expected by full_algo.
param_grid = expand_grid(param_grid)
param_tuples = list(param_grid.itertuples(index=False,name=None))

print('Number of iterations:       {}'.format(param_grid.shape[0]))

### Multiprocessing

In [None]:
%%time
# Run full_algo once per parameter tuple in a pool of worker processes;
# starmap unpacks each tuple into the function's positional arguments.
with Pool() as pool:
    results = pool.starmap(full_algo, param_tuples)
# Stack the per-run frames into a single frame (the list is replaced).
results = pd.concat(results)
print('Finished')

In [None]:
results

In [None]:
data = weighted_average_score(results).sort_values(0.5).drop(columns='param_id')
data_graph = data.query('''distance_param == 2 and weights == 'distance' ''')

In [None]:
data.drop_duplicates(['model',0.5])

In [None]:
sns.lineplot(data = data, y = 0.5, x = 'n_neighbors', hue = 'model')

## Analysis of resulting classifications

### Error volatility

In [None]:
# Years of the full sample; handy to paste into `year` below.
full = [2010,2011,2012,2013,2014,2015,2016]
param_grid = dict(year           = [2017],     # Integer or string 'full' for the whole sample
                  perc_rows_used = [3],          # 1: all funds are reclassified once, 2: ... twice, etc.
                  distance_param = [1],          # 1: Manhattan distance, 2: Euclidean distance
                  n_neighbors    = [10],         # Number of neighbors to use in k-nn algorithm
                  weights        = [1],          # 1: 'distance', 2: 'uniform'
                  preprocessing  = [2]           # 1: 'l1', 2: 'l2'; read by filter_data and classify
                 )

# A single-row parameter frame; the functions below read row 0.
param = expand_grid(param_grid)

In [None]:
# Select the funds, returns and holdings for the parameters chosen above.
row_info_f, returns_f, holdings_f, begin_date, end_date = filter_data(param)

# One label per fund and model (Lipper, k-nn leave-one-out, k-nn iterative).
style_df = classify(row_info_f, holdings_f, param, verbose=False)

# Style returns and each fund's tracking error against its assigned style.
returns_m, style_returns = calc_styleadj_returns(row_info_f, returns_f, style_df)

# Deciles of tracking-error volatility per model.
temp = error_vola_deciles(returns_m)

In [None]:
# Tracking-error volatility per fund and model, expressed in percent
error_vol = (returns_m
                 .groupby(['crsp_fundno','model'])['error']
                 .std()
                 .mul(100)
                 .reset_index())

# Summary statistics of that volatility for each model
error_vol.groupby('model')['error'].describe()

In [None]:
# Overlaid histograms of tracking-error volatility (below 3%) for each model
fig, ax = plt.subplots(figsize=(20,5))
for col in cols:
    in_range = (error_vol['model'] == col) & (error_vol['error'] < 3)
    error_vol.loc[in_range, 'error'].hist(ax = ax, label = col, bins = 100, alpha = 0.5)

ax.set_xlim(0,3)
fig.legend()
plt.show()

### Distribution of classes per classification technique

In [None]:
# Share of funds whose k-nn label equals their Lipper label
print('Overlap between Lipper class and: \n')
overlap_templates = (('model_knn_loo',       'Knn full prediction:         {:2.2f}%'),
                     ('model_knn_iterative', 'Knn iterative prediction:    {:2.2f}%'))
for model_col, template in overlap_templates:
    n_equal = np.sum(style_df['model_lipper'] == style_df[model_col])
    print(template.format(n_equal / len(style_df.index) * 100))

# Relative frequency of every style per model, reshaped to long format
shares = style_df.apply(pd.Series.value_counts, normalize = True)
data = (shares
            .assign(style = shares.index)
            .melt(id_vars = 'style', value_vars = shares.columns))

sns.set()
plt.figure(figsize=(18,10))
g = sns.barplot(data = data, y = 'style', x = 'value', hue = 'variable')

plt.title('Style distribution')
plt.ylabel('Style')
plt.xlabel('Percentage')

plt.show()

### Transition tables

In [None]:
# Joint distribution of Lipper vs. iterative k-nn labels, in percent of all funds
(pd.crosstab(style_df['model_lipper'], style_df['model_knn_iterative'],
             margins=True, normalize='all')
   .mul(100)
   .round(1))

### Comparing mean return per class for the different classification techniques

In [None]:
# One panel per model: cumulative mean return of every style over time
fig, ax = plt.subplots(figsize=(20,5),ncols=3, sharey='row')

for i, (panel, col) in enumerate(zip(ax, cols)):
    sns.lineplot(data = style_returns.query(''' model == @col '''),
                 x='report_dt', y='cum_ret', hue='style', ax=panel)

    # Panel title is the model name; the date axis needs no label
    panel.set_title(col, fontsize = 16)
    panel.set_xlabel('')
    for label in panel.get_xticklabels():
        label.set_rotation(45)

    # Keep the legend on the first panel only
    if i > 0:
        panel.get_legend().remove()

ax[0].set_ylabel('Cumulative return per class')

plt.show()

## Sanity checks

### Analysing individual portfolios

In [None]:
# NOTE(review): `comparing_df` is not created anywhere in the visible part of
# this notebook, so this cell presumably depends on earlier kernel state - confirm.
comparing_df.query('''true == 'V' and iterative_5 == 'G' ''').head()

In [None]:
crsp_fundno = 18307
# Pass the fund metadata explicitly together with the fund number and year.
most_common_stocks_fund(crsp_fundno=crsp_fundno, row_info=row_info, year=2017)

In [None]:
def most_common_stocks_fund(crsp_fundno,row_info=None,year=None):
    """Show the ten securities with the largest average weight in one fund.

    Reads the notebook-level ``holdings`` and ``col_info`` objects.

    Parameters
    ----------
    crsp_fundno : int
        Fund identifier to inspect.
    row_info : pandas.DataFrame, optional
        Fund metadata with ``year``, ``row`` and ``crsp_fundno`` columns.
        Defaults to the notebook-level ``row_info``.
    year : int, optional
        Restrict to observations of this year; ``None`` uses all years.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows with ``security_name`` and ``percent`` (mean of the
        stored holding values per security), largest first.

    Raises
    ------
    IndexError
        If ``crsp_fundno`` does not occur in ``row_info`` at all.
    """
    row_info_l = globals()['row_info'] if row_info is None else row_info

    fund_matches = row_info_l.query('crsp_fundno == @crsp_fundno')
    if fund_matches.empty:
        raise IndexError('crsp_fundno {} not found in row_info'.format(crsp_fundno))
    # Third column of row_info - presumably the fund name; TODO confirm.
    fund_label = fund_matches.iloc[0,2]

    my_filter = 'crsp_fundno == @crsp_fundno'
    if year is not None:
        my_filter = 'year == @year and ' + my_filter
    fund_rows = row_info_l.query(my_filter)
    no_unique_funds = fund_rows.shape[0]

    # Slice the fund's rows first so only its entries are expanded.
    fund_holdings = holdings[fund_rows['row'].to_numpy()].tocoo()

    # NOTE(review): security names are looked up by matching the holdings
    # column number against col_info's index, as before - confirm that
    # col_info is indexed by column number.
    sum_col = (pd.DataFrame({'col'  : fund_holdings.col,
                             'data' : fund_holdings.data})
               .groupby(by = ['col'])[['data']]
               .mean()
               .sort_values('data',ascending = False)
               .join(col_info[['security_name']],how='left')
               .rename(columns = {'data' : 'percent'})
               [['security_name','percent']]
               .reset_index(drop=True)
               .head(10))

    print(
        'Average of most held stocks for one fund in one year: ','\n\n'
        '{}'.format(fund_label),'\n\n'
        'crsp_fundno:                            {}'.format(crsp_fundno),'\n'
        'Number of observations in that year:    {}'.format(no_unique_funds))

    return sum_col

### Inspecting individual nearest neighbors

In [None]:
# NOTE(review): `neigh`, `X` and `n_neighbors` are not assigned at notebook
# level in the visible cells (`neigh` and `X` only exist inside classify()),
# so this cell presumably relies on leftover kernel state - confirm.
neigh.kneighbors(X[1234],n_neighbors = n_neighbors)

In [None]:
def name_nearestneighbors(row_info,neigh,i,n_neighbors = 5,features = None):
    """Print a fund's name and number followed by the names of its neighbours.

    Parameters
    ----------
    row_info : pandas.DataFrame
        Fund metadata with ``fund_name`` and ``crsp_fundno``; looked up by
        index label.
    neigh : fitted nearest-neighbour estimator
        Object exposing ``kneighbors(x, n_neighbors=...)``.
    i : int
        Label in ``row_info`` and row position in ``features`` of the fund.
    n_neighbors : int, default 5
        Number of neighbours requested from ``neigh``.
    features : matrix, optional
        Feature matrix that ``neigh`` was fitted on. When omitted, the
        notebook-level ``X`` is used.

    Notes
    -----
    The first returned neighbour is skipped when printing; presumably it is
    the fund itself - TODO confirm for the fitted data.
    """
    if features is None:
        features = X  # legacy behaviour: read the notebook-level matrix

    fund = row_info.loc[i]
    print('Name:')
    print(fund.fund_name)
    print(fund.crsp_fundno)
    print('\nNearest Neighbors:')

    # kneighbors returns (distances, indices); only the indices are needed
    nn_index = neigh.kneighbors(features[i],n_neighbors = n_neighbors)[1].flatten()
    nn_names = row_info.loc[nn_index].fund_name.values

    for name in nn_names[1:]:
        print(name)
In [None]:
name_nearestneighbors(row_info,neigh,i = 1234, n_neighbors = 5)

In [None]:
crsp_fundno = 36608
# Only crsp_fundno is passed here; row_info and year have to be covered by
# defaults of most_common_stocks_fund (TODO confirm against its signature).
most_common_stocks_fund(crsp_fundno=crsp_fundno)

In [None]:
crsp_fundno = 3690
# Only crsp_fundno is passed here; row_info and year have to be covered by
# defaults of most_common_stocks_fund (TODO confirm against its signature).
most_common_stocks_fund(crsp_fundno=crsp_fundno)

### Tests

In [None]:
row_info_m = row_info_f.copy()
returns_m = returns_f.copy()

In [None]:
# concat predicted styles to row_info
row_info_m = pd.concat([row_info_m,style_df],axis = 1)

In [None]:
# merge predicted styles onto returns
# (the leave-one-out column produced by classify() is 'model_knn_loo')
returns_m = returns_m.merge(row_info_m[[
                                'crsp_fundno', 'report_dt', 'model_lipper', 'model_knn_loo',
                                'model_knn_iterative'
                                ]],
                                how='left',
                                on=['crsp_fundno', 'report_dt'])

In [None]:
row_info_m.shape

In [None]:
count = returns_m.groupby(['crsp_fundno']).count()['report_dt']

count.value_counts()

count.sort_values().head(40)

In [None]:
row_info_m.query('''  crsp_fundno == 29626 ''')

In [None]:
# melt the different style columns per model into one (from wide to long)
returns_m = pd.melt(returns_m,
                               id_vars=['crsp_fundno', 'report_dt', 'mret'],
                               value_vars=cols,
                               var_name='model',
                               value_name='style')

In [None]:
# Carry each fund's style forward within every (model, fund) group.
# GroupBy.ffill returns a series on returns_m's own index, so the assignment
# aligns row by row; missing values are dropped in a later cell.
returns_m['style'] = (returns_m
                          .groupby(['model','crsp_fundno'])['style']
                          .ffill())

In [None]:
returns_m.query('''crsp_fundno == 29626''')

In [None]:
returns_m.shape

In [None]:
returns_m = returns_m.dropna()

In [None]:
# Calc mean return per style
style_returns = (returns_m
                                .groupby(['model','style','report_dt'])
                                .mean()
                                .reset_index()
                                .drop(columns='crsp_fundno')
)

In [None]:
    # Calc cumret per style: running product of (1 + mean return) within each
    # (model, style). The result keeps style_returns' index, so it aligns by
    # label and also works when there is only a single group.
    style_returns['cum_ret'] = (style_returns['mret']
                                    .add(1)
                                    .groupby([style_returns['model'], style_returns['style']])
                                    .cumprod())

In [None]:
    # Merge style returns onto fund returns and calc tracking error
    returns_m = (returns_m
                                    .rename(columns = {'mret' : 'fund_ret'}) 
                                    .merge(style_returns,
                                                how = 'left',
                                                on = ['model','style','report_dt'])
                                    .assign(error = lambda df: df['fund_ret'] - df['mret'])
                                    .rename(columns = {'mret' : 'style_ret',
                                                       'cum_ret' : 'style_cum'}) 
    )

In [None]:
    returns_m = returns_m[['crsp_fundno', 'report_dt', 'model', 'style',
                           'fund_ret', 'style_ret', 'style_cum', 'error']]