In [3]:
# Imports
import numpy as np
import json
import lcdb
from sklearn.linear_model import LinearRegression, LogisticRegression
import sklearn.model_selection
from directencoder import DirectEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import pandas as pd
import scipy
from tqdm import tqdm

In [4]:
# Data Init
db = pd.read_csv('data/database-accuracy.csv')
monotonicity_violations = pd.read_csv('data/monotonicityviolations_logloss.csv')
encoder = DirectEncoder(2)

outer_seed = 0
inner_seed_index = 0
num_seeds = 5
outer_seeds = list(range(0,10))
inner_seeds = list(range(num_seeds * inner_seed_index, num_seeds * (inner_seed_index + 1)))

In [3]:
def calc_slopes_by_train(anchors, openml_id, learner_name, outer_seeds, inner_seeds, plotting=False):
    X, y = lcdb.get_dataset(openml_id)
    slopes = []
    n = len(anchors)
    
    # Code from LCDB paper
    learner_params = {}
    if learner_name == "SVC_linear":
        learner_name = "sklearn.svm.LinearSVC"
    elif learner_name == "SVC_poly":
        learner_name = "sklearn.svm.SVC"
        learner_params = {"kernel": "poly"}
    elif learner_name == "SVC_rbf":
        learner_name = "sklearn.svm.SVC"
        learner_params = {"kernel": "rbf"}
    elif learner_name == "SVC_sigmoid":
        learner_name = "sklearn.svm.SVC"
        learner_params = {"kernel": "sigmoid"}
    
    
    for idx,anchor in enumerate(anchors):
        points = None
        if idx == 0:
            points = range(anchor, anchor+21)
        if idx == n - 1:
            points = range(anchor-20, anchor+1)
        else:
            points = range(anchor-10, anchor+11)
        points = np.array(points).reshape(-1,1)
        err_lin_reg = []
        err_lin_points = []
        for anch in points:
            measurements = []
            for outer_seed in outer_seeds:
                for inner_seed in inner_seeds:
                    try:
                        info = lcdb.get_entry(learner_name, {}, X,y, anch[0], outer_seed, inner_seed)
                        y_pred = encoder.decode_label_vector(info['y_hat_train'])
                        y_true = encoder.decode_label_vector(info['y_train'])
                        
                        err = 1 - metrics.accuracy_score(y_true, y_pred)
                        err_lin_reg.append(err)
                        err_lin_points.append(anch)
                    except Exception as e:
                        print("Invalid " + str(openml_id) + " " + learner_name + " " + e)
#             err_lin_reg.append(np.mean(measurements))
            
        lin_reg = LinearRegression()
        lin_reg.fit(err_lin_points, err_lin_reg)
        
#         a = lin_reg.coef_[0]
#         b=0
#         fst = a + points[0] + b
#         lst = a + points[19] + b

        if plotting == True:
            plt.plot(points, lin_reg.predict(points), color='k')

        slopes.append(lin_reg.coef_[0])
    return slopes

In [4]:
def plot_curve(anchors, errors, color='green'):

    print(anchors)
    print(errors)
    n = min(len(anchors), len(errors))
    plt.scatter(anchors[0:n], errors[0:n], color=color)
    plt.plot(anchors[0:n], errors[0:n], color=color, label='monotone')

def plot_monotonicity(anchors, err, openml_id, learner_name, slopes=None):
    fig,ax = plt.subplots()
    non_monotone_flag = False    
    max_violation_anchor = monotonicity_violations[(monotonicity_violations['openmlid'] == openml_id) & (monotonicity_violations['learner'] == learner_name)]['max_violation_anchor'].iloc[0]
    
    if slopes == None:
        return
    
    errors = [np.mean(x) for x in err]
    plot_curve(anchors, errors)
    
    for idx, anchor in enumerate(anchors):
        if idx >= len(slopes) or idx >= len(errors):
            continue
            
        if slopes[idx] >= 0:
            plt.scatter(anchor, errors[idx], color='red')
#             non_monotone_flag = True
            if (idx < len(anchors)- 1 and slopes[idx+1] >= 0):
                plt.plot([anchor, anchors[idx+1]], [errors[idx], errors[idx+1]], color='red')
                non_monotone_flag = True
            
        if idx > 0 and errors[idx] > errors[idx - 1]:
            plt.plot([anchor, anchors[idx-1]], [errors[idx], errors[idx-1]], color='red')
        if idx < len(anchors) - 1 and errors[idx] < errors[idx + 1]:
            plt.plot([anchor, anchors[idx+1]], [errors[idx], errors[idx+1]], color='red')
#                 non_monotone_flag = True
    
    if(max_violation_anchor > 0):
        plt.scatter(max_violation_anchor, errors[anchors.index(max_violation_anchor)], color='yellow', label='max violation')
    
    plt.title(f'LC of learner {learner_name} on dataset_id {openml_id}')
    red_patch = mpatches.Patch(color='red', label='non-monotone', linewidth=2)
    handles, labels = ax.get_legend_handles_labels()
    handles.append(red_patch)
    plt.legend(handles=handles)
    
    
#     plt.legend()
    if non_monotone_flag == True:
        plt.savefig('plots/non-monotone/' + str(openml_id) + ' ' + learner_name + '.png')
    else:
        plt.savefig('plots/monotone/' + str(openml_id) + ' ' + learner_name + '.png')
    print(slopes)
    return

In [5]:
openml_id = 60
learner_name = 'sklearn.linear_model.LogisticRegression'

curve = lcdb.get_curve(openml_id, learner_name) # Default metric == accuracy
errors = [[1-y for y in x] for x in curve[1]] # Transform accuracy in error rate

slopes = calc_slopes_by_train(curve[0], openml_id, learner_name, outer_seeds, inner_seeds)

KeyboardInterrupt: 

In [None]:
plot_monotonicity(curve[0], errors, openml_id, learner_name, slopes)

In [11]:
pd.set_option('use_inf_as_na',True)
df_total = pd.read_pickle('data/df_total.gz').dropna()
data = df_total[(df_total['curve_model'] == 'exp3')]
# data = data[data['max_anchor_seen'] == 2048]
data = data.loc[data.groupby(['openmlid', 'learner'])['max_anchor_seen'].idxmax()]

q25 = data['MSE_tst_last'].quantile(0.5)
data = data[data['MSE_tst_last'] < q25]

print(data)

x = [x[0] for x in data['beta']]
y = [x[1] for x in data['beta']]
z = [x[2] for x in data['beta']]
# dd = [x[3] for x in data['beta']]

betas = pd.DataFrame(zip(x,y,z), columns=['a', 'b', 'c'])
# betas = betas[(betas['b'] > 1) | (betas['b'] < -1)]

       openmlid                                           learner  \
71           57                                        SVC_linear   
263          57                                          SVC_poly   
455           3                                           SVC_rbf   
455          57                                           SVC_rbf   
647          57                                       SVC_sigmoid   
...         ...                                               ...   
55623      4541  sklearn.linear_model.PassiveAggressiveClassifier   
55815     42810                 sklearn.naive_bayes.MultinomialNB   
56391      1002             sklearn.ensemble.ExtraTreesClassifier   
56391     42810               sklearn.tree.DecisionTreeClassifier   
56583     42810                  sklearn.tree.ExtraTreeClassifier   

       max_anchor_seen                                         prediction  \
71                2048  [0.8433496801948395, 0.8693166769047481, 0.890...   
263              

In [12]:
def gen_params(idx):
    a = betas['a'].iloc[idx]
    b = betas['b'].iloc[idx]
    c = betas['c'].iloc[idx]
#     d = betas['d'].iloc[idx]
    return a,b,c


def run_experiment(ablation=False):
    n = int(len(betas)) # number of LCs considered
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    asc = 0
    desc = 0
    
    for i in tqdm(range(0,n)):
        is_asc = False
        flag = False
        final_slopes = []
        
        lern = data['learner'].iloc[i]
        dataset = data['openmlid'].iloc[i]
        
        a,b,c = gen_params(i)
#         a,b,c,d = gen_params(i)
        exp3 = lambda x: a * np.exp((-b) * x) + c # exp3 from LCDB
#         mmf4 = lambda x: (a*b + c * (x ** d)) / (b + (x ** d))
        anch = data['anchor_prediction'].iloc[i] # array of anchors
        
        if (a < 0 and b > 0) or (a > 0 and b < 0):
            is_asc = True
            asc+=1
        else:
            is_asc = False
            desc+=1
        
        for j in range(0, len(anch)):
            points = range(anch[j] - 10, anch[j] + 11)
            errors_iterations = []
            pp = []
            slopes = []
            for it in range(0, 25):
#                 linreg_errs = []
#                 for p in points:
                noise = np.random.normal(0,0.002)
                linreg_errs = [exp3(p)+noise for p in points]
#                 for it in range(0,125):
#                     pp.append(p)
#                     noise = np.random.normal(0, 0.000001)
#                     err = exp3(p) + noise
#                     err_p.append(err)
#                 errors_iterations.append(exp3(p)+np.random.normal(0,0.000001))
                model = LinearRegression()
                model.fit(np.array(points).reshape(-1,1), np.array(linreg_errs).reshape(-1,1))
                if(model.coef_[0] > 0):
                    slopes.append(1)
                else:
                    slopes.append(-1)
            final_slopes.append(np.mean(slopes))
        
            
            
        for j in range(0, len(anch) - 1):
            if ablation == True:
                if final_slopes[j] > 0:
                    if is_asc:
                        true_pos+=1
                    else:
                        false_pos+=1
                    flag = True
                    break
            else:
                if final_slopes[j] > 0 and final_slopes[j+1] > 0:
                    if is_asc:
                        true_pos+=1
                    else:
                        false_pos+=1
                        
                    flag = True
                    break
                
        if flag == False:
            if is_asc:
                false_neg+=1
            else:
                true_neg+=1


    print(f"Out of {n} Learning curves:")
    print(f"-----------{asc} Non-monotonic LCs")
    print(f"-----------{desc} Monotonic LCs\n")
    print(f" - {true_pos} Have been classified correctly as non-monotonic ({round((true_pos/asc)*100,2)}%)")
    print(f" - {true_neg} Have been classified correctly as monotonic ({round((true_neg/desc)*100,2)}%)")
    print(f" - {false_pos} Have been classified incorrectly as non-monotonic (Type I Error)")
    print(f" - {false_neg} Have been classified incorrectly as monotonic (Type II Error)")
    return final_slopes

# Experiment 2 - Ablation Study

In [None]:
slopes_accuracy = run_experiment(ablation=False) # Run accuracy experiment
slopes_ablation = run_experiment(ablation=True) # Run ablation study 

100%|██████████████████████████████████████████████████████████████████████████████| 3769/3769 [26:01<00:00,  2.41it/s]


Out of 3769 Learning curves:
-----------3179 Non-monotonic LCs
-----------590 Monotonic LCs

 - 3140 Have been classified correctly as non-monotonic (98.77%)
 - 554 Have been classified correctly as monotonic (93.9%)
 - 36 Have been classified incorrectly as non-monotonic (Type I Error)
 - 39 Have been classified incorrectly as monotonic (Type II Error)


 99%|█████████████████████████████████████████████████████████████████████████████▌| 3746/3769 [26:08<00:07,  3.01it/s]

In [17]:
data['MSE_tst_last'].describe()

count     1.885000e+03
mean     3.020320e+217
std                NaN
min       0.000000e+00
25%       1.330105e-06
50%       2.156719e-05
75%       3.057070e-04
max      5.693304e+220
Name: MSE_tst_last, dtype: float64

In [8]:
q25 = data['MSE_trn'].quantile(0.25)
data[data['MSE_trn'] < q25]

for idx in len(data):
    row = data.iloc[idx]
    

Unnamed: 0,openmlid,learner,max_anchor_seen,prediction,beta,fails_init,fails_fit,MSE_trn,MSE_tst,MSE_tst_last,L1_trn,L1_tst,L1_tst_last,n,curve_model,anchor_prediction,score,percentage,percentage_bucket
71,24,SVC_linear,64,"[0.8881939184541461, 0.9127380493945678, 0.933...","[-0.18498168635445061, 0.053821398717900655, 0...",0,0,5.899628e-07,7.844349e-04,1.130185e-03,0.000624,0.026241,0.033618,5,exp3,"[16, 23, 32, 45, 64, 91, 128, 181, 256, 362, 5...","[0.888184, 0.913108, 0.9322480000000002, 0.951...",0.009728,0.05
71,28,SVC_linear,64,"[0.6109346065796966, 0.6786447625631157, 0.740...","[-0.5182087539484821, 0.04305895831083495, 0.8...",0,0,4.208965e-06,2.858538e-03,6.403439e-03,0.001609,0.045723,0.080021,5,exp3,"[16, 23, 32, 45, 64, 91, 128, 181, 256, 362, 5...","[0.6104360000000001, 0.6787279999999999, 0.743...",0.014060,0.05
71,57,SVC_linear,2048,"[0.8433496801948395, 0.8693166769047481, 0.890...","[-0.19730679558849995, 0.059095845730495956, 0...",0,0,1.202539e-05,1.226473e-05,1.226473e-05,0.003086,0.003502,0.003502,15,exp3,"[16, 23, 32, 45, 64, 91, 128, 181, 256, 362, 5...","[0.8418999999999999, 0.8723449999999999, 0.888...",0.670596,0.80
71,390,SVC_linear,64,"[0.1810382484579428, 0.23095369743606212, 0.28...","[-0.44146672878355747, 0.027653688710505483, 0...",0,0,2.516984e-06,8.066324e-02,1.689797e-01,0.001303,0.253921,0.411071,5,exp3,"[16, 23, 32, 45, 64, 91, 128, 181, 256, 362, 5...","[0.18043599999999999, 0.23122399999999999, 0.2...",0.008268,0.05
263,57,SVC_poly,2048,"[0.8904794377226107, 0.8980509131472884, 0.905...","[-0.05865619308689036, 0.040433294301970436, 0...",0,0,6.548127e-06,4.281903e-06,4.281903e-06,0.002367,0.002069,0.002069,15,exp3,"[16, 23, 32, 45, 64, 91, 128, 181, 256, 362, 5...","[0.887335, 0.9016150000000002, 0.9066880000000...",0.670596,0.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53719,41142,sklearn.tree.ExtraTreeClassifier,64,"[0.5338500247518796, 0.5458706737868025, 0.545...","[-196707.78847672566, 1.0381190869250136, 0.54...",0,0,7.421882e-06,1.606092e-03,3.712334e-03,0.002017,0.037224,0.060929,5,exp3,"[16, 23, 32, 45, 64, 91, 128, 181, 256, 362, 5...","[0.5338479999999999, 0.545976, 0.54344, 0.5508...",0.014585,0.05
54855,1499,sklearn.ensemble.RandomForestClassifier,128,"[0.8667875950147348, 0.8685284592536817, 0.870...","[42.73813774966857, -5.818378839915116e-06, -4...",0,0,6.469074e-06,2.218017e-04,2.218017e-04,0.001967,0.014893,0.014893,7,exp3,"[16, 23, 32, 45, 64, 91, 128, 170]","[0.8631440000000002, 0.8673519999999999, 0.875...",0.752941,0.80
55239,1499,sklearn.naive_bayes.BernoulliNB,128,"[0.32236361797457075, 0.32320234751474086, 0.3...","[-0.00733077103272093, 0.028378669457045532, 0...",0,0,1.112467e-05,1.245481e-04,1.245481e-04,0.002301,0.011160,0.011160,7,exp3,"[16, 23, 32, 45, 64, 91, 128, 170]","[0.3263200000000001, 0.31580000000000014, 0.32...",0.752941,0.80
55815,42810,sklearn.naive_bayes.MultinomialNB,2048,"[0.53822470687768, 0.5676194133124589, 0.58390...","[-0.3092072268035676, 0.107097427029717, 0.593...",0,0,2.907140e-06,1.560590e-07,1.560590e-07,0.001223,0.000395,0.000395,15,exp3,"[16, 23, 32, 45, 64, 91, 128, 181, 256, 362, 5...","[0.5371159999999999, 0.5715519999999998, 0.580...",0.632099,0.80
