In [121]:
import ast

import numpy as np
import pandas as pd
from joblib import load
from scipy.sparse import vstack
from scipy.stats import sem
from sklearn.metrics import precision_score, roc_auc_score
from sklearn.model_selection import cross_validate, RepeatedKFold
from sklearn.naive_bayes import MultinomialNB

In [82]:
# Load the pre-vectorized samples (a joblib dump) and the matching label table.
# NOTE(review): presumably `data` holds one sparse matrix per document — confirm.
data = load('./data/glaw/data_vectorized_0.sav')
# index_col=0: the first CSV column becomes the DataFrame index. The 'labels'
# column arrives as text such as "[0, 0, 1]" and is parsed in the next cell.
df_labels = pd.read_csv('./data/glaw/labels.csv', index_col=0)
print(len(data))
df_labels.head()

429


Unnamed: 0,labels
0,"[0, 0, 0, 1, 0]"
1,"[0, 0, 1, 0]"
2,"[0, 0, 1]"
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
4,"[1, 0, 0, 0]"


In [83]:
# Parse the stringified label lists (e.g. "[0, 0, 1]") into real Python lists.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# CSV cell cannot execute arbitrary code the way eval() would. Cells that are
# already parsed are passed through, which keeps this cell safe to re-run.
df_labels['labels'] = df_labels['labels'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

# Keep only documents with more than one candidate label.
multi_candidate_mask = df_labels['labels'].apply(lambda l: len(l) > 1).to_numpy()
df_labels = df_labels[multi_candidate_mask]
labels = df_labels['labels'].to_numpy()

# Convert to an array so the same boolean mask can be applied to the samples,
# keeping `data` and `labels` aligned position by position.
data = np.array(data)
data = data[multi_candidate_mask]
len(data), len(labels)

(419, 419)

In [85]:
def reveal_spase(sparse_list):
    """Stack a sequence of sparse matrices row-wise into a single CSR matrix.

    :param sparse_list: sequence of sparse matrices with the same number of columns
    :return: one CSR matrix containing the rows of every input, in order
    """
    stacked = vstack(sparse_list, format='csr')
    return stacked

def reveal_np(np_list):
    """Join a sequence of array-likes end to end along the first axis.

    :param np_list: sequence of arrays or lists (e.g. one label list per document)
    :return: a single numpy array holding all elements in their original order
    """
    # axis=0 is np.concatenate's default, so it is left implicit here.
    flattened = np.concatenate(np_list)
    return flattened

def reveal_set(data_, labels_):
    """Flatten per-document features and labels into one training set.

    :param data_: sequence of sparse matrices, passed to `reveal_spase`
    :param labels_: sequence of label sequences, passed to `reveal_np`
    :return: tuple `(features, labels)` — the stacked CSR matrix and the concatenated label array
    """
    flat_features = reveal_spase(data_)
    flat_labels = reveal_np(labels_)
    return flat_features, flat_labels

In [94]:
class CustomEstimator:
    """Adapter that lets sklearn cross-validation split on whole documents.

    Every sample handed to `fit` is one document: a sparse matrix paired with
    a list of labels for its rows. `fit` flattens all documents into a single
    training set (via `reveal_set`) before delegating to the wrapped model.

    :param model: wrapped classifier; must provide `fit(X, y)`
    """

    def __init__(self, model=None):
        self.model = model

    def fit(self, X, Y=None):
        """Flatten the documents and fit the wrapped model on the result.

        :param X: sequence of sparse matrices, one per document
        :param Y: sequence of per-document label sequences, aligned with `X`
        :return: self, so calls can be chained as sklearn expects
        :raises ValueError: if no model was supplied or `Y` is missing
        """
        # Fail early with a clear message instead of an opaque error from
        # deep inside numpy or an AttributeError on None.
        if self.model is None:
            raise ValueError("CustomEstimator has no model to fit")
        if Y is None:
            raise ValueError("CustomEstimator.fit requires labels (Y)")

        train_data_revealed, train_labels_revealed = reveal_set(X, Y)
        self.model.fit(train_data_revealed, train_labels_revealed)

        return self

    def get_params(self, deep=False):
        """Return the constructor parameters (used by sklearn's `clone`).

        :param deep: accepted for sklearn compatibility; nested parameters of
            the wrapped model are not expanded
        :return: dict with the single key 'model'
        """
        return {'model': self.model}

    def set_params(self, **params):
        """Set constructor parameters; counterpart of `get_params`.

        :param params: parameter names mapped to new values; only 'model' is valid
        :return: self
        :raises ValueError: if an unknown parameter name is given
        """
        for name, value in params.items():
            if name != 'model':
                raise ValueError(
                    "Invalid parameter %r for CustomEstimator; valid parameters are: ['model']" % name
                )
            self.model = value
        return self

In [95]:
def evaluate(estimator, test_data_list, test_labels_list):
    """Score `estimator` on each document and macro-average the results.

    Within every document the row with the highest positive-class probability
    is predicted as the single positive and all other rows as negative, so
    the per-document precision is 1 when that row is truly positive, else 0.

    :param estimator: fitted wrapper exposing the classifier as `estimator.model`
        (the classifier must provide `predict_proba`)
    :param test_data_list: sequence of sparse matrices, one per document
    :param test_labels_list: sequence of per-document label lists, aligned with `test_data_list`
    :return: dict with keys 'precision' and 'roc-auc', each the mean of the per-document scores
    """
    precision_batch = []
    roc_auc_batch = []
    for test_data_batch, test_labels_batch in zip(test_data_list, test_labels_list):
        # Column 1 is taken as the positive class.
        # NOTE(review): assumes the model's classes are ordered [0, 1] — confirm.
        positive_class_proba = estimator.model.predict_proba(test_data_batch)[:, 1]

        # Top-1 decision: exactly one row per document is labelled positive.
        predicted_labels_batch = np.zeros(len(positive_class_proba), dtype=int)
        predicted_labels_batch[np.argmax(positive_class_proba)] = 1

        precision_batch.append(precision_score(test_labels_batch, predicted_labels_batch))
        # ROC-AUC is not defined for a document whose labels contain only one
        # class (e.g. a single-row document); exclude such documents upstream.
        roc_auc_batch.append(roc_auc_score(test_labels_batch, positive_class_proba))

    return {
        'precision': np.average(precision_batch),
        'roc-auc': np.average(roc_auc_batch)
    }

In [141]:
def repeated_cross_validation(estimator, X, y, k=10, r=7, random_state=42, verbose=1):
    """Run repeated k-fold cross-validation scored with `evaluate`.

    :param estimator: estimator to clone and fit on every split
    :param X: samples to split
    :param y: targets aligned with `X`
    :param k: number of folds per repetition
    :param r: number of repetitions
    :param random_state: seed for the fold shuffling
    :param verbose: verbosity level forwarded to `cross_validate`
    :return: the result dict of `cross_validate`, including train scores
    """
    splitter = RepeatedKFold(n_splits=k, n_repeats=r, random_state=random_state)
    return cross_validate(
        estimator,
        X,
        y,
        scoring=evaluate,
        cv=splitter,
        return_train_score=True,
        error_score='raise',
        n_jobs=-1,
        verbose=verbose,
    )

In [139]:
def statistics_from_cv(cv_result):
    """Summarize per-split test scores as mean and standard error.

    :param cv_result: mapping with the entries 'test_precision' and 'test_roc-auc',
        each a sequence of per-split scores
    :return: dict with 'precision mean', 'precision sem', 'roc-auc mean' and 'roc-auc sem'
    """
    per_split_precision = cv_result['test_precision']
    per_split_roc_auc = cv_result['test_roc-auc']

    summary = {}
    summary['precision mean'] = np.mean(per_split_precision)
    summary['precision sem'] = sem(per_split_precision)
    summary['roc-auc mean'] = np.mean(per_split_roc_auc)
    summary['roc-auc sem'] = sem(per_split_roc_auc)
    return summary

In [144]:
def enumerate_repeats(estimator, X, y, from_=1, to=11, k=10, random_state=42):
    """Print a table of CV statistics for an increasing number of repeats.

    One row is printed for every repeat count in `range(from_, to)`.

    :param estimator: estimator forwarded to `repeated_cross_validation`
    :param X: samples
    :param y: targets aligned with `X`
    :param from_: first repeat count (inclusive)
    :param to: last repeat count (exclusive)
    :param k: number of folds per repetition
    :param random_state: seed forwarded to `repeated_cross_validation`
    """
    columns = ('r', 'roc-auc mean', 'roc-auc sem', 'precision mean', 'precision sem')
    print("%3s %15s %15s %15s %15s" % columns)
    for n_repeats in range(from_, to):
        summary = statistics_from_cv(
            repeated_cross_validation(estimator, X, y, random_state=random_state, r=n_repeats, k=k, verbose=0)
        )
        # Every column after 'r' is also a key of the summary dict.
        row = (n_repeats,) + tuple(summary[column] for column in columns[1:])
        print("%3d %15.10f %15.10f %15.10f %15.10f" % row)

In [133]:
# Baseline classifier: multinomial Naive Bayes wrapped in CustomEstimator so
# that it can be cross-validated on whole documents. MultinomialNB is imported
# with the other dependencies in the import cell at the top of the notebook.
model = MultinomialNB()
estimator_mnb = CustomEstimator(model)

In [146]:
# Sweep the number of CV repeats (r = 1..10 with the default from_=1, to=11)
# and print mean / standard error of both metrics for each repeat count.
enumerate_repeats(estimator_mnb, data, labels)

  r    roc-auc mean     roc-auc sem  precision mean   precision sem
  1    0.9659664152    0.0022094528    0.9235772358    0.0093113416
  2    0.9660046844    0.0018321349    0.9235191638    0.0081205899
  3    0.9659507979    0.0015389978    0.9235965931    0.0063405159
  4    0.9665331883    0.0013284168    0.9236062718    0.0052300760
  5    0.9661960984    0.0013316978    0.9236469222    0.0048312151
  6    0.9662468052    0.0013263490    0.9240224545    0.0044977337
  7    0.9662695981    0.0012379529    0.9246557159    0.0041470142
  8    0.9664114042    0.0012118391    0.9251306620    0.0039806504
  9    0.9663713052    0.0011858848    0.9252419667    0.0037947436
 10    0.9664186934    0.0011844335    0.9257955865    0.0037138816


In [145]:
# Single run with the default settings (k=10 folds x r=7 repeats = 70 fits);
# verbose=2 makes the parallel backend report its progress.
cv_results = repeated_cross_validation(estimator_mnb, data, labels, verbose=2)
stats = statistics_from_cv(cv_results)
stats

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    8.1s finished


{'precision mean': 0.9246557159449146,
 'precision sem': 0.004147014163838121,
 'roc-auc mean': 0.9662695980812572,
 'roc-auc sem': 0.0012379528829737313}