# Windowed predictors
Evaluating scikit-learn estimators using windows

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from utils import get_arrays

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Let pandas show up to 150 rows before truncating a displayed frame.
pd.options.display.max_rows = 150

Load datasets

In [None]:
# Read the train and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Distinct values of the 'id' column in the training split, in order of first appearance.
train_ids = pd.unique(train_df['id'])

In [None]:
# Number of training rows per id, in the same order as train_ids.
# value_counts() tallies every id in a single pass instead of re-filtering the
# whole frame once per id. reindex() restores the train_ids order and fills 0
# for any id that value_counts() does not report (e.g. a missing/NaN id), which
# matches what an equality filter on that id would have counted.
entries = train_df['id'].value_counts().reindex(train_ids, fill_value=0).values

# Percentage of ids that have fewer than i rows, for i = 1..10.
for i in range(1, 11):
    print(f'{i:2d}: {100 * np.count_nonzero(entries < i) / len(entries):6.2f}%')

Instantiating report table:

In [None]:
# Empty results table: one row per (classifier name, window size) pair,
# holding the two metrics recorded for that pair.
index_cols = ['name', 'window_size']
metric_cols = ['accuracy', 'AUC']
clf_table = pd.DataFrame(columns=index_cols + metric_cols).set_index(index_cols)

Converting test labels and sample weight to arrays

In [None]:
# Boolean mask of the test rows that actually carry a 'disengage' label.
test_idx = test_df['disengage'].notna()

# Labels and per-row weights ('num_hit') restricted to those labelled rows;
# both keep the original test_df index.
y_test = test_df.loc[test_idx, 'disengage']
sample_weight = test_df.loc[test_idx, 'num_hit']

Running models

In [None]:
# For every window size from 1 to 20, fit each candidate classifier on the
# windowed training data, evaluate it on the labelled test rows, print its
# classification report and record accuracy / AUC in clf_table.
for window_size in range(1, 21):
    # get_arrays comes from the project's utils module. From its use here it
    # returns a 3-tuple; the first item is used as row labels into the test
    # index, the second as the feature matrix and the third as the targets
    # -- TODO confirm against utils.get_arrays.
    _, X_train, y_train = get_arrays(train_df, window_size)
    test_rows, X_test, _ = get_arrays(test_df, window_size)

    print(f'Showing results with window_size = {window_size}')

    # Estimators are rebuilt for every window size, so each fit starts from a
    # freshly constructed model.
    classifiers = [
        ('Random Forest', RandomForestClassifier(random_state=0)),
        ('Logistic Regression', LogisticRegression(random_state=0)),
        ('Perceptron', Perceptron(random_state=0)),
        ('Gaussian Naive Bayes', GaussianNB()),
        ('KNearest Neighbours, K = 15', KNeighborsClassifier(15)),
        ('Multilayer Perceptron', MLPClassifier(solver='lbfgs', alpha=1e-5,
                                                hidden_layer_sizes=(5, 2),
                                                random_state=1)),
    ]

    for clf_name, clf in classifiers:
        clf.fit(X_train, y_train)
        raw_pred = clf.predict(X_test)

        # Start from "predict 0" for every labelled test row, then overwrite
        # the rows that get_arrays produced features for. Rows without a
        # prediction therefore stay at 0.
        pred = pd.Series(0, index=y_test.index)
        pred.loc[test_rows] = raw_pred

        report = classification_report(y_test, pred, sample_weight=sample_weight)
        print(f'{clf_name}:')
        print(report)

        # Weight the ROC curve by num_hit, as the classification report above
        # and both baseline cells already do; without it the AUC column would
        # mix weighted and unweighted values.
        # NOTE(review): the curve is built from predicted class labels, not
        # from scores/probabilities, so it has a single operating point.
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred,
                                                 sample_weight=sample_weight, pos_label=1)
        auc = metrics.auc(fpr, tpr)
        # NOTE(review): accuracy is left unweighted here, matching the baseline
        # cells; the accuracy printed in the report above is weighted.
        acc = metrics.accuracy_score(y_test, pred)
        clf_table.loc[(clf_name, window_size), ['accuracy', 'AUC']] = acc, auc

Running baseline methods:
- Every HIT implies that the user will not disengage soon (predict 0 for every row)

In [None]:
# Constant baseline: predict class 0 for every labelled test row.
pred = np.zeros(y_test.shape)

# ROC/AUC of the constant prediction, weighted by num_hit.
fpr, tpr, thresholds = metrics.roc_curve(
    y_test, pred, sample_weight=sample_weight, pos_label=1
)
auc = metrics.auc(fpr, tpr)

# Unweighted accuracy of the same constant prediction.
acc = metrics.accuracy_score(y_test, pred)

# Baselines have no window size, so '--' is stored in that index level.
clf_table.loc[('All zeroes', '--'), ['accuracy', 'AUC']] = acc, auc

- Every HIT implies that the user will disengage soon (predict 1 for every row)

In [None]:
# Constant baseline: predict class 1 for every labelled test row.
pred = np.ones(y_test.shape)

# ROC/AUC of the constant prediction, weighted by num_hit.
fpr, tpr, thresholds = metrics.roc_curve(
    y_test, pred, sample_weight=sample_weight, pos_label=1
)
auc = metrics.auc(fpr, tpr)

# Unweighted accuracy of the same constant prediction.
acc = metrics.accuracy_score(y_test, pred)

# Baselines have no window size, so '--' is stored in that index level.
clf_table.loc[('All ones', '--'), ['accuracy', 'AUC']] = acc, auc

In [None]:
# Show the accuracy / AUC comparison table. Leaving it as the cell's last
# expression lets Jupyter render it as a formatted table instead of plain text.
clf_table