In [19]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import *
import dill as pkl

In [48]:
def get_gap(g, h, x_val, y_val, x_val_predictions):
    """
    Evaluates the hypothesis model on the group and compares it to the current model's perfromance on the group.
    """
    loss_fn = sk.metrics.mean_squared_error

    indices_val = g(x_val).astype('bool')
    if indices_val.sum() <= 1:
        return False

    model_error_val = loss_fn(y_val[indices_val], x_val_predictions[indices_val])
    hypothesis_preds_val = h(x_val[indices_val])
    hypothesis_error_val = loss_fn(y_val[indices_val], hypothesis_preds_val)

    return model_error_val, hypothesis_error_val, indices_val.sum()/len(indices_val)

In [49]:
def save_pkl(g, h, suffix):
    with open(f'g-{suffix}.pkl', 'wb') as file:
        pkl.dump(g, file)

    with open(f'h-{suffix}.pkl', 'wb') as file:
        pkl.dump(h, file)

In [50]:
x_train = pd.read_csv('training_data.csv') 
y_train = np.genfromtxt('training_labels.csv', delimiter=',', dtype = float)
print(f'x_train.shape: {x_train.shape}')

y_mean = y_train.mean()
y_std = y_train.std()

x_train.shape: (340134, 21)


In [51]:
prediction_path = 'training_predictions.csv'
x_predictions = np.loadtxt(prediction_path, delimiter=",", dtype=str).astype('float64')
print(f'x_predictions.shape: {x_predictions.shape}')

x_predictions.shape: (340134,)


In [52]:
classifiers = [
    sk.neural_network.MLPClassifier(max_iter=100),
    sk.ensemble.AdaBoostClassifier(),
    sk.ensemble.GradientBoostingClassifier(),
    sk.naive_bayes.BernoulliNB(),
]

def get_best_classifier(X, y):
    best_score = float('inf')
    best_classifier = None
    for clf in classifiers:
        clf.fit(X, y)
        score = clf.score(X, y)
        if score < best_score:
            best_score = score
            best_classifier = clf
    
    print(best_classifier)
    return best_classifier


In [63]:
regressors = [
    sk.neural_network.MLPRegressor(max_iter=1000, verbose=True),
    sk.ensemble.AdaBoostRegressor(),
    sk.ensemble.GradientBoostingRegressor(),
    sk.linear_model.ElasticNet(),
    sk.linear_model.TweedieRegressor(),
]

def get_best_regressor(X, y):
    best_score = float('inf')
    best_regressor = None
    for reg in regressors:
        print(reg)
        reg.fit(X, y)
        score = reg.score(X, y)
        if score < best_score:
            best_score = score
            best_regressor = reg
    
    print(best_regressor)
    return best_regressor


In [54]:
def get_g_clustering(X, y, i):
    """
    Defines groups based on K-Means clustering on the one-hot features.
    """
    enc = sk.preprocessing.OneHotEncoder().fit(X)
    X = enc.transform(X).toarray()
    clustering = sk.cluster.KMeans(n_clusters=24, n_init='auto', random_state=42).fit(X)

    def g(X):
        X = enc.transform(X).toarray()
        predictions = clustering.predict(X)
        return predictions == i

    return g, enc


In [59]:
def get_g_meta(X, y, x_train_predictions):
    """
    Defines group as the top 10% worst errors based on predictions lifted from training_predictions.csv.
    """
    errors = np.abs(x_train_predictions - y)
    threshold = np.sort(errors)[-int(len(errors)*0.1)]
    g_y = errors >= threshold
    print(g_y.sum())

    enc = sk.preprocessing.OneHotEncoder().fit(X)
    # poly = sk.preprocessing.PolynomialFeatures(2).fit(enc.transform(X))
    def transform(X):
        X = enc.transform(X).toarray() if enc else X
        # X = poly.transform(X) if poly else X
        return X

    X = transform(X)
    g_clf = get_best_classifier(X, g_y)
    print(g_clf.predict(X).sum())
    
    def g(X):
        X = transform(X)
        return g_clf.predict(X)
    return g, transform


In [60]:
def get_h(X, y, g, transform=None):
    indices = g(X)
    print(indices.sum())

    X = transform(X) if transform else X
    clf = get_best_regressor(X[indices], (y[indices] - y_mean) / y_std)

    def h(X):
        X = transform(X) if transform else X
        return clf.predict(X) * y_std + y_mean

    return h

In [61]:
x_train_subset, x_val, y_train_subset, y_val, x_predictions_subset, x_predictions_val = sk.model_selection.train_test_split(x_train, y_train, x_predictions, test_size=.15, random_state=42)

get_g = get_g_meta
g, transform = get_g(x_train_subset, y_train_subset, x_predictions_subset)
h = get_h(x_train_subset, y_train_subset, g, transform)

28911
BernoulliNB()
14591
14591
MLPRegressor(max_iter=10000, verbose=True)
Iteration 1, loss = 0.47147533
Iteration 2, loss = 0.40131034
Iteration 3, loss = 0.38948811
Iteration 4, loss = 0.38268502
Iteration 5, loss = 0.37480038
Iteration 6, loss = 0.36860001
Iteration 7, loss = 0.35830349
Iteration 8, loss = 0.34848020
Iteration 9, loss = 0.33781585
Iteration 10, loss = 0.32679664
Iteration 11, loss = 0.31457134
Iteration 12, loss = 0.30125425
Iteration 13, loss = 0.28877046
Iteration 14, loss = 0.27783167
Iteration 15, loss = 0.26474479
Iteration 16, loss = 0.25286565
Iteration 17, loss = 0.24200414
Iteration 18, loss = 0.23378465
Iteration 19, loss = 0.22500981
Iteration 20, loss = 0.21510652
Iteration 21, loss = 0.20813317
Iteration 22, loss = 0.20075035
Iteration 23, loss = 0.19287889
Iteration 24, loss = 0.18732835
Iteration 25, loss = 0.18226378
Iteration 26, loss = 0.17523313
Iteration 27, loss = 0.17246938
Iteration 28, loss = 0.16479091
Iteration 29, loss = 0.16005802
Iterat

In [64]:
print(f'Found {g(x_train_subset).sum()} in group from x_train_subset')
print()

model_error, h_error, fraction = get_gap(g, h, x_val, y_val, x_predictions_val)
print(f'Baseline error: {model_error}')
print(f'Hypothesis group error: {h_error}')
print(f'Error gap: {model_error - h_error}')
print(f'Group fraction in x_val: {fraction}')
print()

Found 14591 in group from x_train_subset

Baseline error: 399555865.28285867
Hypothesis group error: 564862844.4618349
Error gap: -165306979.17897624
Group fraction in x_val: 0.05105740773407028



In [100]:
g, enc = get_g(x_train, y_train, x_predictions)
h = get_h(x_train, y_train, g, enc)
save_pkl(g, h, f'meta')

34013
14993
14993
