In [1]:
# Imports
import numpy as np
import json
import lcdb
from sklearn.linear_model import LinearRegression, LogisticRegression
import sklearn.model_selection
from directencoder import DirectEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import pandas as pd
import scipy
from tqdm import tqdm

In [2]:
# Data Init
db = pd.read_csv('data/database-accuracy.csv')
monotonicity_violations = pd.read_csv('data/monotonicityviolations_logloss.csv')
encoder = DirectEncoder(2)

outer_seed = 0
inner_seed_index = 0
num_seeds = 5
outer_seeds = list(range(0,10))
inner_seeds = list(range(num_seeds * inner_seed_index, num_seeds * (inner_seed_index + 1)))

In [3]:
def calc_slopes_by_train(anchors, openml_id, learner_name, outer_seeds, inner_seeds, plotting=False):
    X, y = lcdb.get_dataset(openml_id)
    slopes = []
    n = len(anchors)
    
    # Code from LCDB paper
    learner_params = {}
    if learner_name == "SVC_linear":
        learner_name = "sklearn.svm.LinearSVC"
    elif learner_name == "SVC_poly":
        learner_name = "sklearn.svm.SVC"
        learner_params = {"kernel": "poly"}
    elif learner_name == "SVC_rbf":
        learner_name = "sklearn.svm.SVC"
        learner_params = {"kernel": "rbf"}
    elif learner_name == "SVC_sigmoid":
        learner_name = "sklearn.svm.SVC"
        learner_params = {"kernel": "sigmoid"}
    
    
    for idx,anchor in enumerate(anchors):
        points = None
        if idx == 0:
            points = range(anchor, anchor+21)
        if idx == n - 1:
            points = range(anchor-20, anchor+1)
        else:
            points = range(anchor-10, anchor+11)
        points = np.array(points).reshape(-1,1)
        err_lin_reg = []
        err_lin_points = []
        for anch in points:
            measurements = []
            for outer_seed in outer_seeds:
                for inner_seed in inner_seeds:
                    try:
                        info = lcdb.get_entry(learner_name, {}, X,y, anch[0], outer_seed, inner_seed)
                        y_pred = encoder.decode_label_vector(info['y_hat_train'])
                        y_true = encoder.decode_label_vector(info['y_train'])
                        
                        err = 1 - metrics.accuracy_score(y_true, y_pred)
                        err_lin_reg.append(err)
                        err_lin_points.append(anch)
                    except Exception as e:
                        print("Invalid " + str(openml_id) + " " + learner_name + " " + e)
#             err_lin_reg.append(np.mean(measurements))
            
        lin_reg = LinearRegression()
        lin_reg.fit(err_lin_points, err_lin_reg)
        
#         a = lin_reg.coef_[0]
#         b=0
#         fst = a + points[0] + b
#         lst = a + points[19] + b

        if plotting == True:
            plt.plot(points, lin_reg.predict(points), color='k')

        slopes.append(lin_reg.coef_[0])
    return slopes

In [4]:
def plot_curve(anchors, errors, color='green'):

    print(anchors)
    print(errors)
    n = min(len(anchors), len(errors))
    plt.scatter(anchors[0:n], errors[0:n], color=color)
    plt.plot(anchors[0:n], errors[0:n], color=color, label='monotone')

def plot_monotonicity(anchors, err, openml_id, learner_name, slopes=None):
    fig,ax = plt.subplots()
    non_monotone_flag = False    
    max_violation_anchor = monotonicity_violations[(monotonicity_violations['openmlid'] == openml_id) & (monotonicity_violations['learner'] == learner_name)]['max_violation_anchor'].iloc[0]
    
    if slopes == None:
        return
    
    errors = [np.mean(x) for x in err]
    plot_curve(anchors, errors)
    
    for idx, anchor in enumerate(anchors):
        if idx >= len(slopes) or idx >= len(errors):
            continue
            
        if slopes[idx] >= 0:
            plt.scatter(anchor, errors[idx], color='red')
#             non_monotone_flag = True
            if (idx < len(anchors)- 1 and slopes[idx+1] >= 0):
                plt.plot([anchor, anchors[idx+1]], [errors[idx], errors[idx+1]], color='red')
                non_monotone_flag = True
            
        if idx > 0 and errors[idx] > errors[idx - 1]:
            plt.plot([anchor, anchors[idx-1]], [errors[idx], errors[idx-1]], color='red')
        if idx < len(anchors) - 1 and errors[idx] < errors[idx + 1]:
            plt.plot([anchor, anchors[idx+1]], [errors[idx], errors[idx+1]], color='red')
#                 non_monotone_flag = True
    
    if(max_violation_anchor > 0):
        plt.scatter(max_violation_anchor, errors[anchors.index(max_violation_anchor)], color='yellow', label='max violation')
    
    plt.title(f'LC of learner {learner_name} on dataset_id {openml_id}')
    red_patch = mpatches.Patch(color='red', label='non-monotone', linewidth=2)
    handles, labels = ax.get_legend_handles_labels()
    handles.append(red_patch)
    plt.legend(handles=handles)
    
    
#     plt.legend()
    if non_monotone_flag == True:
        plt.savefig('plots/non-monotone/' + str(openml_id) + ' ' + learner_name + '.png')
    else:
        plt.savefig('plots/monotone/' + str(openml_id) + ' ' + learner_name + '.png')
    print(slopes)
    return

In [5]:
openml_id = 60
learner_name = 'sklearn.linear_model.LogisticRegression'

curve = lcdb.get_curve(openml_id, learner_name) # Default metric == accuracy
errors = [[1-y for y in x] for x in curve[1]] # Transform accuracy in error rate

slopes = calc_slopes_by_train(curve[0], openml_id, learner_name, outer_seeds, inner_seeds)

KeyboardInterrupt: 

In [None]:
plot_monotonicity(curve[0], errors, openml_id, learner_name, slopes)

In [6]:
df_total = pd.read_pickle('data/df_total.gz')
df_total[df_total['curve_model'] == 'exp3'].iloc[0]

openmlid                                                             3
learner                                                     SVC_linear
max_anchor_seen                                                     45
prediction           [0.7560428000087255, 0.7560428000087468, 0.756...
beta                 [-61.52217771734362, 2.2248593245887633, 0.756...
fails_init                                                           0
fails_fit                                                            0
MSE_trn                                                       0.002501
MSE_tst                                                       0.034619
MSE_tst_last                                                  0.045592
L1_trn                                                        0.044906
L1_tst                                                        0.183154
L1_tst_last                                                   0.213524
n                                                                    4
curve_

In [7]:
data = df_total[(df_total['curve_model'] == 'exp3') & (df_total['max_anchor_seen'] == 2048)]

x = [x[0] for x in data['beta']]
y = [x[1] for x in data['beta']]
z = [x[2] for x in data['beta']]

betas = pd.DataFrame(zip(x,y,z), columns=['a', 'b', 'c'])

def gen_params(idx):
    a = betas['a'].iloc[idx]
    b = betas['b'].iloc[idx]
    c = betas['c'].iloc[idx]
    return a,b,c


# TODO - Add noise 
# TODO - for each LC, do 125 "measurements" with diff noise values.
# TODO - Ablation experiment
def run_experiment():
    n = len(betas) # number of LCs considered
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    asc = 0
    desc = 0
    
    for i in tqdm(range(0,n)):
        is_asc = False
        correct = False
        flag = False
        final_slopes = []
        lern = data['learner'].iloc[i]
        dataset = data['openmlid'].iloc[i]
        a,b,c = gen_params(i)
        anch = data['anchor_prediction'].iloc[i]
        
        if (a < 0 and b > 0) or (a > 0 and b < 0):
            is_asc = True
            asc+=1
        else:
            desc+=1
        
#         first_derivative = lambda x: a*b*np.exp(-b*x)
        for j in range(0, len(anch)):
            slopes = []
            
            for iteration in range(0,125):
                noise = np.random.normal(0, 0.5)
                exp3 = lambda x: 1 - (a * np.exp(-b * x) + c + noise) # exp3 from LCDB
                errs = [exp3(x) for x in anch]
                
                points = range(anch[j] - 10, anch[j] + 11)
                error_lr = [exp3(x) for x in points]
                
                model = LinearRegression()
                model.fit(np.array(points).reshape(-1,1), np.array(error_lr).reshape(-1,1))
                
                slopes.append(model.coef_[0])
                
            final_slopes.append(slopes)
        
        for j in range(0, len(anch) - 1):
            if np.mean(final_slopes[j]) > 0 and np.mean(final_slopes[j+1]) > 0:
                if is_asc:
                    true_pos+=1
                else:
                    false_pos+=1
                correct = True
                flag = True
#                 plt.subplots()
#                 plt.scatter(anch, errs)
#                 plt.plot(anch, errs, color='red')
#                 plt.yticks(np.arange(0,1.05, 0.1))
#                 plt.title(f"learner: {lern}, id: {dataset}, is_asc: {is_asc}, found_asc: {correct}")
#                 plt.show()
                break
        if flag == False:
            if is_asc:
                false_neg+=1
            else:
                true_neg+=1
#             plt.subplots()
#             plt.scatter(anch, errs)
#             plt.plot(anch, errs, color='green')
#             plt.yticks(np.arange(0,1.05, 0.1))
#             plt.title(f"learner: {lern}, id: {dataset}, is_asc: {is_asc}, found_asc: {correct}")
#             plt.show()

    print(f"Out of {n} Learning curves:")
    print(f"-----------{asc} Non-monotonic LCs")
    print(f"-----------{desc} Monotonic LCs\n")
    print(f" - {true_pos} Have been classified correctly as non-monotonic ({round((true_pos/asc)*100,2)}%)")
    print(f" - {true_neg} Have been classified correctly as monotonic ({round((true_neg/desc)*100,2)}%)")
    print(f" - {false_pos} Have been classified incorrectly as non-monotonic (Type I Error)")
    print(f" - {false_neg} Have been classified incorrectly as monotonic (Type II Error)")
    return
        
run_experiment()

100%|█████████████████████████████████████████████████████████████████████████████████████| 2404/2404 [38:31<00:00,  1.04it/s]

Out of 2404 Learning curves:
-----------451 Non-monotonic LCs
-----------1953 Monotonic LCs

 - 448 Have been classified correctly as non-monotonic (99.33%)
 - 938 Have been classified correctly as monotonic (48.03%)
 - 1015 Have been classified incorrectly as non-monotonic (Type I Error)
 - 3 Have been classified incorrectly as monotonic (Type II Error)



