In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
import pickle

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the pre-split train/test arrays from the pickle file 'data'.
# NOTE: pickle.load can execute arbitrary code -- only load this file if it
# comes from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open('data', 'rb') as data_file:
    X_train, X_test, y_train, y_test = pickle.load(data_file)

In [3]:
def custom_score(x1, x2, alpha):
    """Return ``(x1 + x2) / (|x1 - x2| + alpha)``.

    The value grows with the sum of the two inputs and shrinks as they move
    apart; ``alpha`` is added to the absolute difference in the denominator.
    """
    total = x1 + x2
    spread = np.abs(x1 - x2) + alpha
    return total / spread

def custom_score_norm(x1, x2, alpha=1):
    """Return ``custom_score(x1, x2, alpha)`` divided by ``custom_score(1, 1, alpha)``.

    Dividing by the score of the pair (1, 1) means that ``x1 == x2 == 1``
    maps to exactly 1.
    """
    raw = custom_score(x1, x2, alpha)
    reference = custom_score(1, 1, alpha)
    return raw / reference

def get_metric(y_pred, y_test, ret_C=False):
    """Print precision/recall for the -1 and +1 classes and return a combined score.

    A 3x3 confusion matrix over the labels ``[-1, 0, 1]`` is built with
    ``y_test`` as the true labels (rows) and ``y_pred`` as the predictions
    (columns).  Only the four corner cells are used, so samples whose true or
    predicted label is 0 do not enter any of the ratios.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels; flattened to 1-D before use.
    y_test : array-like
        True labels; flattened to 1-D before use.
    ret_C : bool, default False
        When true, the confusion matrix is returned together with the score.

    Returns
    -------
    score, or ``(C, score)`` when ``ret_C`` is true
        ``score`` is ``custom_score_norm`` applied to the two per-class F1
        values.

    Notes
    -----
    A ratio whose denominator is zero (e.g. a class that is never predicted)
    is reported as 0.0 instead of NaN; a NaN would make the ``int(...)``
    conversions in the printed summary raise ``ValueError``.
    """
    def _ratio(numerator, denominator):
        # Zero denominator -> 0.0 rather than NaN (see Notes above).
        return numerator / denominator if denominator else 0.0

    # np.asarray(...).ravel() accepts any array-like, not only objects that
    # expose a .flatten() method.
    C = confusion_matrix(np.asarray(y_test).ravel(), np.asarray(y_pred).ravel(),
                         labels=[-1, 0, 1])

    r_p = _ratio(C[0, 0], C[0, 0] + C[2, 0])  # rejects precision
    c_p = _ratio(C[2, 2], C[0, 2] + C[2, 2])  # convergence precision

    r_r = _ratio(C[0, 0], C[0, 0] + C[0, 2])  # rejects recall
    c_r = _ratio(C[2, 2], C[2, 0] + C[2, 2])  # convergence recall

    f1_r = _ratio(2*(r_p*r_r), r_p + r_r)  # f1 reject
    f1_c = _ratio(2*(c_p*c_r), c_p + c_r)  # f1 convergence

    score = custom_score_norm(f1_r, f1_c)

    # Percentages are truncated (not rounded) to whole numbers.
    print(f'rejection precision: {int(r_p*100)}, conversion precision: {int(c_p*100)}')
    print(f'rejection recall: {int(r_r*100)}, conversion recall: {int(c_r*100)}')
    if ret_C:
        return C, score
    return score

# Random Forest

## Running grid-search CV for this model requires extensive memory, so I build the RandomForest with mostly default parameters.

In [4]:
# Baseline forest: 10 trees, all cores, class weights balanced.
# random_state is fixed so the fitted model (and the metrics derived from it)
# can be reproduced on a fresh kernel.
# The variable name (sic) is kept as-is because later cells reference it.
rf_multi_calss = RandomForestClassifier(
    n_estimators=10,
    n_jobs=-1,
    verbose=0,
    class_weight='balanced',
    random_state=42,
)
rf_multi_calss.fit(X_train, y_train)

In [5]:
# Predict on the test split with the forest fitted in the previous cell, then
# print the confusion matrix and the combined score.
y_pred = rf_multi_calss.predict(X_test)
C, score = get_metric(y_pred, y_test, ret_C=True)
print(C, score)

rejection precision: 84, conversion precision: 52
rejection recall: 98, conversion recall: 9
[[284756  78560   5143]
 [ 77236 910305    842]
 [ 51222  11778   5591]] 0.3082685625223688


In [6]:
# Fraction of rows in y_pred that contain at least one +1 prediction.
(y_pred == 1).any(axis=1).sum() / len(y_pred)

0.10896969552409688

In [10]:
# Both factors are hard-coded; they appear to be the rounded share of rows
# with a positive prediction and the conversion precision printed above.
positive_share = 0.1
conversion_precision = 0.52
gain_perc = positive_share * conversion_precision
gain_perc

0.052000000000000005

# Precision is high but recall is very low. This means that the model's positive predictions can be trusted, but it recovers only a small fraction of the possible convergences.

In [8]:
# Larger forest: 100 trees, otherwise the same settings as the 10-tree run.
# random_state is fixed so the fitted model (and the metrics derived from it)
# can be reproduced on a fresh kernel.
# NOTE: this rebinds rf_multi_calss, replacing the previously fitted model.
rf_multi_calss = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    verbose=0,
    class_weight='balanced',
    random_state=42,
)
rf_multi_calss.fit(X_train, y_train)

In [9]:
# Predict on the test split with the forest fitted in the previous cell, then
# print the confusion matrix and the combined score.
y_pred = rf_multi_calss.predict(X_test)
C, score = get_metric(y_pred, y_test, ret_C=True)
print(C, score)
# Fraction of rows in y_pred that contain at least one +1 prediction.
(y_pred == 1).any(axis=1).sum() / len(y_pred)

rejection precision: 84, conversion precision: 90
rejection recall: 99, conversion recall: 6
[[290764  77332    363]
 [ 53889 934456     38]
 [ 53952  11079   3560]] 0.2864520174821178


0.02904029863206478

In [11]:
# Both factors are hard-coded; they appear to be the rounded share of rows
# with a positive prediction and the conversion precision printed above.
positive_share = 0.029
conversion_precision = 0.9
gain_perc = positive_share * conversion_precision
gain_perc

0.0261

# Moreover, we get positive predictions for less than 3% of the data. Even though about 90% of them are correct, we gain more from the KNN model.