# Classification

## Description

In [1]:
import feather
import numpy as np
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix, classification_report, SCORERS

import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Setup

In [2]:
# Convert to date format
begin_date = '2017-01-01' 
end_date = '2018-01-01'

## Load Data

In [3]:
### Returns
path = '../data/processed/returns.feather'
returns = feather.read_dataframe(path)

returns = returns.query('caldt >= @begin_date and caldt <= @end_date')

### row_info
path = '../data/processed/row_info.feather'
row_info = feather.read_dataframe(path)

row_info = row_info.query('report_dt >= @begin_date and report_dt <= @end_date')

row_info.reset_index(inplace=True,drop=True)
row_info['row'] = row_info.index

### col_info
path = '../data/processed/col_info.feather'
col_info = feather.read_dataframe(path)

### Holdings
path = '../data/processed/holdings.npz'
holdings = sparse.load_npz(path)

holdings = holdings[row_info.row.values]

print('Shape of row_info information')
print(row_info.shape)
print('Shape of holding information')
print(holdings.shape)

Shape of row_info information
(3988, 8)
Shape of holding information
(3988, 56820)


In [4]:
## Knn 

In [5]:
X = holdings
y = list(row_info['lipper_class'].values)

In [6]:
comparing_df = pd.DataFrame({
    'true' : y,
    'predicted' : np.nan
})

In [7]:
comparing_df.head()

Unnamed: 0,true,predicted
0,LCVE,
1,LCVE,
2,LCVE,
3,LCVE,
4,MCCE,


In [8]:
print(X.shape)
print(len(y))

(3988, 56820)
3988


In [9]:
neigh = KNeighborsClassifier(n_neighbors=30, n_jobs = -1)
neigh.fit(holdings,y) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
                     weights='uniform')

In [10]:
comparing_df.predicted = neigh.predict(X)

In [11]:
round(pd.crosstab(comparing_df.true, comparing_df.predicted, margins=True, normalize='all') * 100,2)

predicted,EIEI,LCCE,LCGE,LCVE,MCCE,MCGE,MCVE,MLCE,MLGE,MLVE,SCCE,SCGE,SCVE,All
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EIEI,1.23,3.39,0.6,0.03,0.03,0.05,0.03,0.23,0.03,0.0,1.53,0.15,0.03,7.3
LCCE,0.35,8.43,0.8,0.13,0.0,0.05,0.05,0.48,0.15,0.0,1.65,0.2,0.08,12.36
LCGE,0.28,5.29,2.66,0.13,0.0,0.03,0.03,0.5,0.15,0.03,2.01,0.25,0.0,11.33
LCVE,0.45,3.86,0.8,0.33,0.0,0.08,0.05,0.28,0.13,0.05,1.53,0.05,0.0,7.6
MCCE,0.18,2.91,0.48,0.05,0.05,0.15,0.0,0.33,0.08,0.0,1.1,0.13,0.0,5.44
MCGE,0.23,3.51,0.48,0.13,0.0,0.33,0.08,0.18,0.05,0.0,1.18,0.1,0.03,6.27
MCVE,0.1,1.25,0.23,0.0,0.0,0.05,0.05,0.08,0.0,0.03,0.43,0.1,0.0,2.31
MLCE,0.43,5.17,0.85,0.13,0.05,0.05,0.08,1.4,0.1,0.03,1.33,0.1,0.0,9.7
MLGE,0.4,4.46,0.95,0.1,0.03,0.05,0.03,0.28,0.53,0.08,1.35,0.23,0.0,8.48
MLVE,0.4,2.26,0.5,0.05,0.0,0.03,0.05,0.18,0.1,0.15,0.78,0.2,0.0,4.69


In [12]:
pd.crosstab(comparing_df.true, comparing_df.predicted, margins=True)

predicted,EIEI,LCCE,LCGE,LCVE,MCCE,MCGE,MCVE,MLCE,MLGE,MLVE,SCCE,SCGE,SCVE,All
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EIEI,49,135,24,1,1,2,1,9,1,0,61,6,1,291
LCCE,14,336,32,5,0,2,2,19,6,0,66,8,3,493
LCGE,11,211,106,5,0,1,1,20,6,1,80,10,0,452
LCVE,18,154,32,13,0,3,2,11,5,2,61,2,0,303
MCCE,7,116,19,2,2,6,0,13,3,0,44,5,0,217
MCGE,9,140,19,5,0,13,3,7,2,0,47,4,1,250
MCVE,4,50,9,0,0,2,2,3,0,1,17,4,0,92
MLCE,17,206,34,5,2,2,3,56,4,1,53,4,0,387
MLGE,16,178,38,4,1,2,1,11,21,3,54,9,0,338
MLVE,16,90,20,2,0,1,2,7,4,6,31,8,0,187


In [13]:
print(classification_report(comparing_df.true, comparing_df.predicted))

              precision    recall  f1-score   support

        EIEI       0.25      0.17      0.20       291
        LCCE       0.16      0.68      0.26       493
        LCGE       0.26      0.23      0.25       452
        LCVE       0.25      0.04      0.07       303
        MCCE       0.33      0.01      0.02       217
        MCGE       0.30      0.05      0.09       250
        MCVE       0.10      0.02      0.04        92
        MLCE       0.29      0.14      0.19       387
        MLGE       0.34      0.06      0.10       338
        MLVE       0.40      0.03      0.06       187
        SCCE       0.21      0.36      0.27       466
        SCGE       0.22      0.06      0.09       332
        SCVE       0.00      0.00      0.00       180

    accuracy                           0.20      3988
   macro avg       0.24      0.14      0.13      3988
weighted avg       0.24      0.20      0.16      3988



In [14]:
print(classification_report(comparing_df.true, comparing_df.predicted))

              precision    recall  f1-score   support

        EIEI       0.25      0.17      0.20       291
        LCCE       0.16      0.68      0.26       493
        LCGE       0.26      0.23      0.25       452
        LCVE       0.25      0.04      0.07       303
        MCCE       0.33      0.01      0.02       217
        MCGE       0.30      0.05      0.09       250
        MCVE       0.10      0.02      0.04        92
        MLCE       0.29      0.14      0.19       387
        MLGE       0.34      0.06      0.10       338
        MLVE       0.40      0.03      0.06       187
        SCCE       0.21      0.36      0.27       466
        SCGE       0.22      0.06      0.09       332
        SCVE       0.00      0.00      0.00       180

    accuracy                           0.20      3988
   macro avg       0.24      0.14      0.13      3988
weighted avg       0.24      0.20      0.16      3988



In [15]:
comparing_df.query('''true == 'LCCE' and predicted == 'SCCE' ''')

Unnamed: 0,true,predicted
141,LCCE,SCCE
170,LCCE,SCCE
319,LCCE,SCCE
320,LCCE,SCCE
327,LCCE,SCCE
361,LCCE,SCCE
362,LCCE,SCCE
385,LCCE,SCCE
386,LCCE,SCCE
423,LCCE,SCCE


In [18]:
no = 2343
crsp_fundno = row_info.reset_index().loc[no].crsp_fundno

most_common_stocks_fund(year=2017, crsp_fundno='28284')

Average of most held stocks for one fund in one year:  

Guggenheim Funds Trust: Guggenheim StylePlus - Large Core Fund; Class A Shares 

crsp_fundno:                            28284 
Year:                                   2017 
Number of observations in that year:    2


Unnamed: 0,security_name,percent
0,ECHOSTAR COMMUNICATIONS CORP NEW,3.358534
1,CONOCOPHILLIPS,3.339998
2,INTEGRA LIFESCIENCES HLDNGS CORP,3.32785
3,CHEVRON CORP NEW,3.23
4,EXXON MOBIL CORP,2.889999
5,JPMORGAN CHASE & CO,2.849998
6,CYPRESS SEMICONDUCTOR CORP,2.720417
7,PHARMACEUTICAL RESOURCES INC,2.624249
8,BANK OF AMERICA CORP,2.619999
9,CEPHALON INC,2.54199


In [17]:
def most_common_stocks_fund(year,crsp_fundno):
    "This prints a passed string into this function"
    # Enter date for which most common holdings are calculated
    year = year
    crsp_fundno = crsp_fundno
    row_info_l = row_info.copy()

    holdings_coo = holdings.tocoo()

    df_sparse = pd.DataFrame({'row'  : holdings_coo.row,
                              'col'  : holdings_coo.col,
                              'data' : holdings_coo.data})

    row_info_l = row_info_l.assign(year = row_info_l['report_dt'].dt.year)
    df_sparse = df_sparse.merge(row_info_l[['year','row','crsp_fundno']],how='left',on='row')
    my_filter = '''year == @year and crsp_fundno == @crsp_fundno '''
    no_unique_funds = row_info_l.query(my_filter).shape[0]

    sum_col = (df_sparse
               .query(my_filter)
               .groupby(by = ['col'])
               .mean()
               .sort_values('data',ascending = False)
               .join(col_info[['security_name','col']],how='left')
               .assign(percent = lambda x:  x.data)
               .drop(columns=['row','data','col','year','crsp_fundno'])
               .reset_index(drop=True)
               .head(10))
    
    print(
        'Average of most held stocks for one fund in one year: ','\n\n'
        '{}'.format(row_info.query('crsp_fundno == @crsp_fundno').iloc[0,2]),'\n\n'
        'crsp_fundno:                            {}'.format(crsp_fundno),'\n'
        'Year:                                   {}'.format(year),'\n'
        'Number of observations in that year:    {}'.format(no_unique_funds))

    return sum_col