# Classification

## Description

In [83]:
import feather
import numpy as np
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix, classification_report, SCORERS

import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Setup

In [159]:
# Convert to date format
begin_date = '2017-01-01' 
end_date = '2018-01-01'

## Load Data

In [160]:
### Returns
path = '../data/processed/returns.feather'
returns = feather.read_dataframe(path)

returns = returns.query('caldt >= @begin_date and caldt <= @end_date')

### row_info
path = '../data/processed/row_info.feather'
row_info = feather.read_dataframe(path)

row_info = row_info.query('report_dt >= @begin_date and report_dt <= @end_date')

row_info.reset_index(inplace=True,drop=True)
row_info['row'] = row_info.index

### col_info
path = '../data/processed/col_info.feather'
col_info = feather.read_dataframe(path)

### Holdings
path = '../data/processed/holdings.npz'
holdings = sparse.load_npz(path)

holdings = holdings[row_info.row.values]

print('Shape of row_info information')
print(row_info.shape)
print('Shape of holding information')
print(holdings.shape)

Shape of row_info information
(3685, 8)
Shape of holding information
(3685, 48022)


In [161]:
## Knn 

In [162]:
X = holdings
y = list(row_info['lipper_class'].values)

In [163]:
comparing_df = pd.DataFrame({
    'true' : y,
    'predicted' : np.nan
})

In [164]:
comparing_df.head()

Unnamed: 0,true,predicted
0,LCVE,
1,LCVE,
2,LCVE,
3,LCVE,
4,MCCE,


In [165]:
print(X.shape)
print(len(y))

(3685, 48022)
3685


In [91]:
neigh = KNeighborsClassifier(n_neighbors=30, n_jobs = -1)
neigh.fit(holdings,y) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
                     weights='uniform')

In [92]:
comparing_df.predicted = neigh.predict(X)

In [93]:
round(pd.crosstab(comparing_df.true, comparing_df.predicted, margins=True, normalize='all') * 100,2)

predicted,EIEI,LCCE,LCGE,LCVE,MCCE,MCGE,MCVE,MLCE,MLGE,MLVE,SCCE,SCGE,SCVE,All
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EIEI,1.98,2.61,0.03,0.49,0.0,0.0,0.0,1.06,0.0,0.08,0.9,0.08,0.0,7.22
LCCE,0.08,11.23,0.22,0.22,0.05,0.0,0.0,0.24,0.0,0.0,0.41,0.0,0.0,12.46
LCGE,0.0,1.95,9.31,0.0,0.0,0.0,0.0,0.03,0.11,0.0,0.11,0.0,0.0,11.51
LCVE,0.9,1.82,0.0,3.55,0.0,0.0,0.0,0.49,0.0,0.65,0.11,0.0,0.0,7.52
MCCE,0.0,0.0,0.0,0.0,0.87,0.0,0.05,0.33,0.0,0.0,3.8,0.0,0.0,5.05
MCGE,0.03,0.0,0.0,0.0,0.0,3.91,0.0,0.03,0.0,0.0,2.23,0.24,0.0,6.43
MCVE,0.0,0.0,0.0,0.0,0.14,0.0,0.14,0.08,0.0,0.0,1.79,0.0,0.11,2.25
MLCE,0.08,3.83,0.03,0.05,0.05,0.0,0.0,3.8,0.03,0.03,1.98,0.0,0.0,9.88
MLGE,0.0,1.49,3.23,0.0,0.0,1.22,0.0,1.11,0.22,0.0,1.17,0.05,0.0,8.49
MLVE,0.16,0.54,0.03,0.41,0.0,0.0,0.03,1.0,0.0,1.79,0.62,0.0,0.0,4.59


In [94]:
pd.crosstab(comparing_df.true, comparing_df.predicted, margins=True)

predicted,EIEI,LCCE,LCGE,LCVE,MCCE,MCGE,MCVE,MLCE,MLGE,MLVE,SCCE,SCGE,SCVE,All
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EIEI,73,96,1,18,0,0,0,39,0,3,33,3,0,266
LCCE,3,414,8,8,2,0,0,9,0,0,15,0,0,459
LCGE,0,72,343,0,0,0,0,1,4,0,4,0,0,424
LCVE,33,67,0,131,0,0,0,18,0,24,4,0,0,277
MCCE,0,0,0,0,32,0,2,12,0,0,140,0,0,186
MCGE,1,0,0,0,0,144,0,1,0,0,82,9,0,237
MCVE,0,0,0,0,5,0,5,3,0,0,66,0,4,83
MLCE,3,141,1,2,2,0,0,140,1,1,73,0,0,364
MLGE,0,55,119,0,0,45,0,41,8,0,43,2,0,313
MLVE,6,20,1,15,0,0,1,37,0,66,23,0,0,169


In [95]:
print(classification_report(comparing_df.true, comparing_df.predicted))

              precision    recall  f1-score   support

        EIEI       0.61      0.27      0.38       266
        LCCE       0.48      0.90      0.62       459
        LCGE       0.73      0.81      0.76       424
        LCVE       0.75      0.47      0.58       277
        MCCE       0.78      0.17      0.28       186
        MCGE       0.76      0.61      0.68       237
        MCVE       0.62      0.06      0.11        83
        MLCE       0.46      0.38      0.42       364
        MLGE       0.62      0.03      0.05       313
        MLVE       0.70      0.39      0.50       169
        SCCE       0.35      0.99      0.52       431
        SCGE       0.90      0.44      0.59       314
        SCVE       0.81      0.22      0.34       162

    accuracy                           0.53      3685
   macro avg       0.66      0.44      0.45      3685
weighted avg       0.63      0.53      0.49      3685



In [96]:
print(classification_report(comparing_df.true, comparing_df.predicted))

              precision    recall  f1-score   support

        EIEI       0.61      0.27      0.38       266
        LCCE       0.48      0.90      0.62       459
        LCGE       0.73      0.81      0.76       424
        LCVE       0.75      0.47      0.58       277
        MCCE       0.78      0.17      0.28       186
        MCGE       0.76      0.61      0.68       237
        MCVE       0.62      0.06      0.11        83
        MLCE       0.46      0.38      0.42       364
        MLGE       0.62      0.03      0.05       313
        MLVE       0.70      0.39      0.50       169
        SCCE       0.35      0.99      0.52       431
        SCGE       0.90      0.44      0.59       314
        SCVE       0.81      0.22      0.34       162

    accuracy                           0.53      3685
   macro avg       0.66      0.44      0.45      3685
weighted avg       0.63      0.53      0.49      3685



In [97]:
comparing_df.query('''true == 'LCCE' and predicted == 'SCCE' ''')

Unnamed: 0,true,predicted
46,LCCE,SCCE
327,LCCE,SCCE
328,LCCE,SCCE
551,LCCE,SCCE
552,LCCE,SCCE
931,LCCE,SCCE
932,LCCE,SCCE
1030,LCCE,SCCE
1487,LCCE,SCCE
1497,LCCE,SCCE


28284

In [158]:
no = 2343
crsp_fundno = row_info.reset_index().loc[no].crsp_fundno

most_common_stocks_fund(year=2017, crsp_fundno='28284')

Average of most held stocks for one fund in one year:  

Guggenheim Funds Trust: Guggenheim StylePlus - Large Core Fund; Class A Shares 

crsp_fundno:                            28284 
Year:                                   2017 
Number of observations in that year:    2


Unnamed: 0,security_name,percent
0,ISHARES TRUST,5.511852
1,CEPHALON INC,3.16945
2,FASTENAL COMPANY,2.638195
3,SYMANTEC CORP,2.613235
4,ACCREDO HEALTH INC,2.447079
5,PIPER JAFFRAY COMPANIES,2.341384
6,FILENET CORP,2.331019
7,D S T SYSTEMS INC DEL,2.230707
8,PROVIDIAN FINANCIAL CORP,2.216896
9,LAM RESH CORP,2.213977


In [153]:
def most_common_stocks_fund(year,crsp_fundno):
    "This prints a passed string into this function"
    # Enter date for which most common holdings are calculated
    year = year
    crsp_fundno = crsp_fundno
    row_info_l = row_info.copy()

    holdings_coo = holdings.tocoo()

    df_sparse = pd.DataFrame({'row'  : holdings_coo.row,
                              'col'  : holdings_coo.col,
                              'data' : holdings_coo.data})

    row_info_l = row_info_l.assign(year = row_info_l['report_dt'].dt.year)
    df_sparse = df_sparse.merge(row_info_l[['year','row','crsp_fundno']],how='left',on='row')
    my_filter = '''year == @year and crsp_fundno == @crsp_fundno '''
    no_unique_funds = row_info_l.query(my_filter).shape[0]

    sum_col = (df_sparse
               .query(my_filter)
               .groupby(by = ['col'])
               .mean()
               .sort_values('data',ascending = False)
               .join(col_info[['security_name','col']],how='left')
               .assign(percent = lambda x:  x.data)
               .drop(columns=['row','data','col','year','crsp_fundno'])
               .reset_index(drop=True)
               .head(10))
    
    print(
        'Average of most held stocks for one fund in one year: ','\n\n'
        '{}'.format(row_info.query('crsp_fundno == @crsp_fundno').iloc[0,2]),'\n\n'
        'crsp_fundno:                            {}'.format(crsp_fundno),'\n'
        'Year:                                   {}'.format(year),'\n'
        'Number of observations in that year:    {}'.format(no_unique_funds))

    return sum_col