Find the confusion matrix

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

# Load the table of ground-truth labels ('true') and predicted labels ('pred').
df_class = pd.read_csv('classification.csv')

# Placeholder values; every counter is overwritten a few lines below.
tp = fp = tn = fn = 0

# Encode each (true, pred) pair as a single code so the four outcomes are
# distinguishable: the true label carries weight 2, the prediction weight 1.
#   3 -> true=1, pred=1 (TP)    0 -> true=0, pred=0 (TN)
#   2 -> true=1, pred=0 (FN)    1 -> true=0, pred=1 (FP)
df_class['sum'] = df_class['true'] * 2 + df_class['pred']

# Count the rows that fall into each outcome.
tp = (df_class['sum'] == 3).sum()
tn = (df_class['sum'] == 0).sum()
fn = (df_class['sum'] == 2).sum()
fp = (df_class['sum'] == 1).sum()

# Confusion matrix: rows are the classifier's answer a(x), columns the true label y.
conf_mat = pd.DataFrame(
    {'y = 1': [tp, fn], 'y = 0': [fp, tn]},
    index=['a(x) = 1', 'a(x) = 0'],
)
conf_mat.head()
# Optional: save the answer (TP FP FN TN) to a text file
# txt_file = open('conf_mat_ans.txt', 'w')
# arr2save = [tp, fp, fn, tn]
# for ii in arr2save:
#     txt_file.write('%g ' % np.round(ii, 2))
# txt_file.close()

Compute the quality metrics of the classifier:
$$accuracy = \frac{TP+TN}{TP+FP+FN+TN} $$
$$precision = \frac{TP}{TP+FP} $$
$$recall = \frac{TP}{TP+FN} $$
$$F\text{-}score = \frac{2 \cdot precision \cdot recall}{precision + recall} $$

In [2]:
# Quality metrics derived from the confusion-matrix counts tp, fp, fn, tn.
accuracy = (tp + tn) / (tp + fp + fn + tn)  # share of all answers that are correct
precision = tp / (tp + fp)                  # share of predicted positives that are real
recall = tp / (tp + fn)                     # share of real positives that were found
f_score = 2 * precision * recall / (precision + recall)  # harmonic mean of the two

# Collect the four values in one labelled Series for display.
metrics = pd.Series(
    {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f_score': f_score,
    },
    name='Score',
)
metrics.head()
# Optional: save the answer (accuracy precision recall f_score) to a text file
# txt_file = open('metrics_ans.txt', 'w')
# arr2save = [metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f_score']]
# for ii in arr2save:
#     txt_file.write('%g ' % np.round(ii, 2))
# txt_file.close()


accuracy     0.535000
precision    0.558442
recall       0.421569
f_score      0.480447
Name: Score, dtype: float64

In [3]:
# Load the true labels together with the score columns of the four classifiers.
scores = pd.read_csv('scores.csv')

# Area under the ROC curve for each classifier, in the order
# logreg, svm, knn, tree (columns 'score_<name>').
logreg, svm, knn, tree = (
    roc_auc_score(scores['true'], scores['score_' + model])
    for model in ('logreg', 'svm', 'knn', 'tree')
)
print(
    'AUC-ROC scores:\nlogreg = %g \nsvm = %g \nknn = %g \ntree = %g'
    % (logreg, svm, knn, tree)
)
# Optional: save the answer (name of the second column of `scores`) to a text file
# txt_file = open('auc_roc_ans.txt', 'w')
# txt_file.write(scores.columns[1])
# txt_file.close()

AUC-ROC scores:
logreg = 0.719188 
svm = 0.708683 
knn = 0.635154 
tree = 0.691927


In [4]:
def max_prec(pr, min_recall=0.7):
    """
    Find the max value of precision among points with recall >= min_recall.

    Parameters
    ----------
    pr : sequence
        Indexable whose element 0 holds the precision values and element 1
        the matching recall values (for example the tuple returned by
        ``precision_recall_curve``). Any further elements are ignored.
    min_recall : float, optional
        Lowest recall a point may have to be taken into account.
        Defaults to 0.7.

    Returns
    -------
    The largest precision among the points whose recall is at least
    ``min_recall``.

    Raises
    ------
    ValueError
        If no point reaches ``min_recall``.
    """
    # asarray lets plain Python lists work as well as NumPy arrays.
    precision = np.asarray(pr[0])
    recall = np.asarray(pr[1])

    # Keep only the precision values whose recall is high enough.
    eligible = precision[recall >= min_recall]
    if eligible.size == 0:
        # Fail with an explicit message instead of NumPy's generic
        # "zero-size array to reduction operation" error.
        raise ValueError('no point on the curve has recall >= %g' % min_recall)

    return eligible.max()

# Precision-recall curve of every classifier, in the order logreg, svm, knn, tree.
pr_logreg, pr_svm, pr_knn, pr_tree = (
    precision_recall_curve(scores['true'], scores[column])
    for column in ('score_logreg', 'score_svm', 'score_knn', 'score_tree')
)

# Result of max_prec for each curve; position i corresponds to the i-th
# classifier in the order above.
pr_max_acc = pd.Series(
    [max_prec(curve) for curve in (pr_logreg, pr_svm, pr_knn, pr_tree)]
)
# Optional: save the answer (column name of the best classifier) to a text file.
# The +1 skips the first column of `scores`.
# txt_file = open('max_prec_ans.txt', 'w')
# txt_file.write(scores.columns[pr_max_acc.argmax()+1])
# txt_file.close()