# Classification

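## Description

Fit a k-nearest-neighbours classifier that predicts the `lipper_class` label of each summary row from the matching row of the sparse holdings matrix, restricted to reports dated after 2018-01-01, and compare the predicted classes with the true ones.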

In [40]:
import feather
import numpy as np
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix, classification_report, SCORERS

import matplotlib.pyplot as plt
import matplotlib.cm as cm

## Setup

In [2]:
# Analysis window as ISO date strings; they are compared against parsed
# datetime columns in the filtering cells below.
# NOTE(review): only `begin_date` is used in the cells shown here; `end_date`
# is defined but never applied as an upper bound — confirm that is intended.
begin_date = '2018-01-01' 
end_date = '2019-01-01'

## Load Data

### Returns

In [3]:
# Read the processed returns table from its feather file.
path = '../data/processed/returns_s.feather'
returns = feather.read_dataframe(path)
# Display (rows, columns) as a quick sanity check of what was loaded.
returns.shape

(1627288, 4)

In [4]:
# Parse `caldt` into a datetime `date` column, then keep only the rows dated
# on or after `begin_date` (inclusive lower bound).
returns = (
    returns
    .assign(date=pd.to_datetime(returns['caldt'], format='%Y-%m-%d'))
    .loc[lambda frame: frame['date'] >= begin_date]
)

### Summary

In [5]:
# Read the per-report holdings summary table from its feather file.
path = '../data/processed/holdings_summary_s.feather'
summary = feather.read_dataframe(path)
# Display (rows, columns) as a quick sanity check of what was loaded.
summary.shape

(163848, 9)

In [6]:
# Parse `report_dt` into a datetime `date` column and build a boolean row mask
# for reports dated strictly after `begin_date`. The mask is kept in
# `date_mask` because the same rows are later selected from the holdings matrix.
# NOTE(review): the returns table is filtered with `>=` on the same bound while
# this uses `>` — confirm which boundary is intended.
summary = summary.assign(
    date=pd.to_datetime(summary['report_dt'], format='%Y-%m-%d')
)
date_mask = summary['date'] > begin_date
summary = summary.loc[date_mask]
summary.shape

(6155, 10)

### Holdings

In [7]:
# Load the holdings as a scipy sparse matrix from its .npz file.
# NOTE(review): rows are presumably aligned one-to-one with `summary` (the
# summary-derived `date_mask` is applied to this matrix below) — confirm.
path = '../data/processed/holdings_s.npz'
holdings = sparse.load_npz(path)
# Display (rows, columns) as a quick sanity check of what was loaded.
holdings.shape

(163848, 60533)

In [8]:
# Keep only the holdings rows whose summary row passed the date filter, so the
# two objects stay row-aligned.
# The length guard makes this cell safe to re-run: once filtered, `holdings`
# no longer has the mask's length and indexing it again with the full-length
# mask would fail.
if holdings.shape[0] == len(date_mask):
    holdings = holdings[date_mask.values]

# Whether freshly filtered or already filtered, the row count must equal the
# number of rows the mask selects; otherwise the notebook state is stale.
assert holdings.shape[0] == int(date_mask.sum()), (
    'holdings rows are out of sync with date_mask; reload holdings and summary'
)

In [9]:
# Report both shapes after date filtering.
print('Shape of summary information')
print(summary.shape)
print('Shape of holding information')
print(holdings.shape)

# Labels come from `summary` and features from `holdings`, so a row-count
# mismatch here would silently misalign them; fail loudly instead of relying
# on a visual comparison of the printed shapes.
assert summary.shape[0] == holdings.shape[0], (
    'summary and holdings row counts differ after date filtering'
)

Shape of summary information
(6155, 10)
Shape of holding information
(6155, 60533)


## k-Nearest Neighbours (KNN)

In [13]:
# Features: the date-filtered sparse holdings matrix, used as-is.
X = holdings
# Targets: the `lipper_class` label of every summary row, as a plain list.
y = summary['lipper_class'].tolist()

In [27]:
# Side-by-side frame of true labels and (not yet computed) predictions; the
# `predicted` column starts as NaN and is filled in after the model is fitted.
comparing_df = pd.DataFrame({'true': y}).assign(predicted=np.nan)

In [28]:
# Preview the first rows: `true` is populated, `predicted` is still NaN here.
comparing_df.head()

Unnamed: 0,true,predicted
0,LCVE,
1,LCVE,
2,LCVE,
3,LCVE,
4,LCVE,


In [15]:
# Show the feature-matrix shape and the number of labels, one per line.
print(X.shape, len(y), sep='\n')

(6155, 60533)
6155


In [55]:
# k-NN classifier with 30 neighbours; n_jobs=-1 runs the neighbour search in
# parallel on all available cores.
neigh = KNeighborsClassifier(n_neighbors=30, n_jobs=-1)
# Fit on `X` (not `holdings` directly) so that training and the later
# `neigh.predict(X)` call are guaranteed to use the same feature matrix, even
# if `X` is replaced by a transformed version of the holdings.
neigh.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
           weights='uniform')

In [56]:
# NOTE(review): the model was fitted on this same matrix, so these are
# in-sample predictions; the tables and reports that follow measure training
# fit, not out-of-sample performance.
# Bracket assignment always writes the column, whereas attribute assignment
# only does so when the column already exists.
comparing_df['predicted'] = neigh.predict(X)

In [58]:
# Confusion matrix of true vs. predicted class, expressed as a percentage of
# all observations (with row/column totals) and rounded to two decimals.
pd.crosstab(
    comparing_df['true'],
    comparing_df['predicted'],
    margins=True,
    normalize='all',
).mul(100).round(2)

predicted,LCCE,LCGE,LCVE,MCCE,MCGE,MCVE,MLCE,MLGE,MLVE,SCCE,SCGE,SCVE,All
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LCCE,7.15,0.1,0.18,0.02,0.0,0.0,6.14,0.0,0.34,0.16,0.0,0.0,14.09
LCGE,0.49,11.23,0.0,0.0,0.02,0.0,0.63,0.08,0.0,0.24,0.0,0.0,12.69
LCVE,0.36,0.0,3.83,0.0,0.0,0.0,0.93,0.0,2.52,0.06,0.0,0.0,7.7
MCCE,0.03,0.02,0.0,4.86,0.0,0.1,0.34,0.0,0.0,1.53,0.0,0.0,6.87
MCGE,0.06,0.0,0.0,0.8,3.31,0.0,0.08,0.0,0.0,1.36,0.08,0.0,5.7
MCVE,0.0,0.0,0.0,0.73,0.0,0.75,0.47,0.0,0.0,0.55,0.0,0.0,2.5
MLCE,0.19,0.03,0.0,0.11,0.0,0.0,8.66,0.03,0.03,1.33,0.0,0.0,10.4
MLGE,0.1,2.62,0.0,0.57,0.58,0.0,2.14,1.46,0.0,0.52,0.0,0.0,7.99
MLVE,0.0,0.0,0.16,0.19,0.0,0.02,1.77,0.0,3.05,0.15,0.0,0.02,5.36
SCCE,0.02,0.0,0.0,0.31,0.0,0.0,0.0,0.0,0.0,13.79,0.0,0.0,14.12


In [59]:
# Same confusion matrix as raw counts, with row/column totals in `All`.
pd.crosstab(
    index=comparing_df['true'],
    columns=comparing_df['predicted'],
    margins=True,
)

predicted,LCCE,LCGE,LCVE,MCCE,MCGE,MCVE,MLCE,MLGE,MLVE,SCCE,SCGE,SCVE,All
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LCCE,440,6,11,1,0,0,378,0,21,10,0,0,867
LCGE,30,691,0,0,1,0,39,5,0,15,0,0,781
LCVE,22,0,236,0,0,0,57,0,155,4,0,0,474
MCCE,2,1,0,299,0,6,21,0,0,94,0,0,423
MCGE,4,0,0,49,204,0,5,0,0,84,5,0,351
MCVE,0,0,0,45,0,46,29,0,0,34,0,0,154
MLCE,12,2,0,7,0,0,533,2,2,82,0,0,640
MLGE,6,161,0,35,36,0,132,90,0,32,0,0,492
MLVE,0,0,10,12,0,1,109,0,188,9,0,1,330
SCCE,1,0,0,19,0,0,0,0,0,849,0,0,869


In [60]:
# Per-class precision / recall / F1 for the predictions stored in comparing_df.
print(
    classification_report(
        y_true=comparing_df['true'],
        y_pred=comparing_df['predicted'],
    )
)

              precision    recall  f1-score   support

        LCCE       0.84      0.51      0.63       867
        LCGE       0.80      0.88      0.84       781
        LCVE       0.92      0.50      0.65       474
        MCCE       0.62      0.71      0.66       423
        MCGE       0.84      0.58      0.69       351
        MCVE       0.87      0.30      0.44       154
        MLCE       0.41      0.83      0.55       640
        MLGE       0.93      0.18      0.31       492
        MLVE       0.51      0.57      0.54       330
        SCCE       0.46      0.98      0.63       869
        SCGE       0.96      0.22      0.35       527
        SCVE       0.94      0.06      0.12       247

   micro avg       0.60      0.60      0.60      6155
   macro avg       0.76      0.53      0.53      6155
weighted avg       0.73      0.60      0.58      6155



In [47]:
# NOTE(review): this cell duplicates the previous one, but its execution count
# (In [47]) is lower than the cells above and its saved output differs, so the
# output was produced by an earlier run with a different state. It will not be
# reproduced by Restart & Run All — re-run it or remove the cell.
print(classification_report(comparing_df.true, comparing_df.predicted))

              precision    recall  f1-score   support

        LCCE       0.86      0.72      0.79       867
        LCGE       0.86      0.92      0.89       781
        LCVE       0.90      0.69      0.78       474
        MCCE       0.60      0.78      0.68       423
        MCGE       0.85      0.68      0.76       351
        MCVE       0.87      0.31      0.46       154
        MLCE       0.52      0.78      0.63       640
        MLGE       0.87      0.48      0.62       492
        MLVE       0.64      0.56      0.60       330
        SCCE       0.59      0.97      0.74       869
        SCGE       1.00      0.52      0.69       527
        SCVE       0.94      0.47      0.63       247

   micro avg       0.72      0.72      0.72      6155
   macro avg       0.79      0.66      0.69      6155
weighted avg       0.78      0.72      0.72      6155

