# Real-world data experiment

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import Ridge,Lasso,LinearRegression
from src.thirdparties.mord.regression_based import OrdinalRidge

In [2]:
np.random.seed(0)

# Loading data

In [3]:
n_averaging_locations = 2
n_bins = 5

In [4]:
df_LRCS = joblib.load('data/batteries/dfs/df_LRCS_bins_'+str(n_bins)+'.pkl')
df_LRCSWMG = joblib.load('data/batteries/dfs/df_LRCSWMG_20221026_21457_17512.pkl')

In [5]:
n_CG_LRCS = len(df_LRCS['Comma gap (µm)'].unique())
n_ML_LRCS = n_bins
dom_CG_LRCS = [75,100,200]
dom_ML_LRCS = np.arange(n_ML_LRCS)

# Learning before abstraction

In [6]:
def eval_metric(model,Xte,yte,Xout,roundpred=True):
    if roundpred:
        preds = np.round(model.predict(Xte.reshape(-1,1)))
    else:
        preds = model.predict(Xte.reshape(-1,1))
    mses = (preds - yte)**2
    print('MSE (with {0} out): {1} ({2})'.format(Xout,np.mean(mses),np.std(mses)))
    print('${0:.2f}\pm{1:.2f}$'.format(np.mean(mses),np.std(mses)))
    
    return np.mean(mses)

## Learning on LRCS data: standard KFold

### Ordinal Regression 

In [7]:
scores = []

kfold = KFold()
for tr,te in kfold.split(df_LRCS):
    Xtr,Xte = np.array(df_LRCS['Comma gap (µm)'].iloc[tr]),np.array(df_LRCS['Comma gap (µm)'].iloc[te])
    ytr,yte = np.array(df_LRCS['binned ML'].iloc[tr]),np.array(df_LRCS['binned ML'].iloc[te])
    model = OrdinalRidge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,'None',False)
    scores.append(score)

print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with None out): 0.07692307692307693 (0.26646935501059654)
$0.08\pm0.27$
MSE (with None out): 0.15384615384615385 (0.36080121229410994)
$0.15\pm0.36$
MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
Overall MSE: 0.046153846153846156 (0.06153846153846154)
$0.05\pm0.06$


### Ridge

In [8]:
scores = []

for tr,te in kfold.split(df_LRCS):
    Xtr,Xte = np.array(df_LRCS['Comma gap (µm)'].iloc[tr]),np.array(df_LRCS['Comma gap (µm)'].iloc[te])
    ytr,yte = np.array(df_LRCS['binned ML'].iloc[tr]),np.array(df_LRCS['binned ML'].iloc[te])
    model = Ridge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,'None',True)
    scores.append(score)

print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with None out): 0.07692307692307693 (0.26646935501059654)
$0.08\pm0.27$
MSE (with None out): 0.15384615384615385 (0.36080121229410994)
$0.15\pm0.36$
MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
Overall MSE: 0.046153846153846156 (0.06153846153846154)
$0.05\pm0.06$


## Learning on LRCS data: one-X out

We define the input data ($X$) and the output data ($y$).

In [9]:
def select_one_X_out(X,y,cond):
    Xte = np.array(X[cond])
    yte = np.array(y[cond])

    Xtr = np.array(X[np.logical_not(cond)])
    ytr = np.array(y[np.logical_not(cond)])
    
    return Xtr,ytr,Xte,yte

### Ordinal Regression 

In [10]:
scores = []

for cg in dom_CG_LRCS:
    cond = df_LRCS['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCS['Comma gap (µm)'],df_LRCS['binned ML'],cond)
    
    model = OrdinalRidge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 1.375 (0.9921567416492215)
$1.38\pm0.99$
MSE (with 100 out): 1.0 (0.0)
$1.00\pm0.00$
MSE (with 200 out): 3.875 (0.5994789404140899)
$3.88\pm0.60$
Overall MSE: 2.0833333333333335 (1.2761160692594629)
$2.08\pm1.28$


### Ridge

In [11]:
scores = []

for cg in dom_CG_LRCS:
    cond = df_LRCS['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCS['Comma gap (µm)'],df_LRCS['binned ML'],cond)
    
    model = Ridge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,True)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 1.375 (0.9921567416492215)
$1.38\pm0.99$
MSE (with 100 out): 1.0 (0.0)
$1.00\pm0.00$
MSE (with 200 out): 4.208333333333333 (0.9991315673568165)
$4.21\pm1.00$
Overall MSE: 2.194444444444444 (1.4322401084996403)
$2.19\pm1.43$


### Lasso

In [21]:
scores = []

for cg in dom_CG_LRCS:
    cond = df_LRCS['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCS['Comma gap (µm)'],df_LRCS['binned ML'],cond)
    
    model = Lasso().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,True)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 1.375 (0.9921567416492215)
$1.38\pm0.99$
MSE (with 100 out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with 200 out): 4.208333333333333 (0.9991315673568165)
$4.21\pm1.00$
Overall MSE: 1.861111111111111 (1.7520931045220114)
$1.86\pm1.75$


# Learning after abstraction

## Learning on LRCS+WMG data: standard KFold

### Ordinal Regression

In [12]:
scores = []

kfold = KFold()
for tr,te in kfold.split(df_LRCS):
    Xtr,Xte = np.array(df_LRCS['Comma gap (µm)'].iloc[tr]),np.array(df_LRCS['Comma gap (µm)'].iloc[te])
    ytr,yte = np.array(df_LRCS['binned ML'].iloc[tr]),np.array(df_LRCS['binned ML'].iloc[te])
    model = OrdinalRidge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,'None',False)
    scores.append(score)

print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with None out): 0.07692307692307693 (0.26646935501059654)
$0.08\pm0.27$
MSE (with None out): 0.15384615384615385 (0.36080121229410994)
$0.15\pm0.36$
MSE (with None out): 0.0 (0.0)
$0.00\pm0.00$
Overall MSE: 0.046153846153846156 (0.06153846153846154)
$0.05\pm0.06$


### Ridge

In [13]:
scores = []

for tr,te in kfold.split(df_LRCSWMG):
    Xtr,Xte = np.array(df_LRCSWMG['Comma gap (µm)'].iloc[tr]),np.array(df_LRCSWMG['Comma gap (µm)'].iloc[te])
    ytr,yte = np.array(df_LRCSWMG['binned ML'].iloc[tr]),np.array(df_LRCSWMG['binned ML'].iloc[te])
    model = Ridge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,'None',False)
    scores.append(score)

print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with None out): 0.13164812394327538 (0.370136572219468)
$0.13\pm0.37$
MSE (with None out): 0.5915565512889581 (1.425455088899457)
$0.59\pm1.43$
MSE (with None out): 0.0001680010795951243 (2.710505431213761e-20)
$0.00\pm0.00$
MSE (with None out): 0.22041611899624744 (0.8720870919903739)
$0.22\pm0.87$
MSE (with None out): 0.23215033858235312 (0.6777670366670484)
$0.23\pm0.68$
Overall MSE: 0.23518782677808586 (0.19656087404634068)
$0.24\pm0.20$


## Learning on LRCS+WMG data: one-X out with WMG providing the missing support

We define the input data ($X$) and the output data ($y$) considering first the possibility that WMG offers support for the out-of-sample data ($CG=100$).

### Ordinal Regression 

In [14]:
scores = []

for cg in dom_CG_LRCS:
    cond0 = df_LRCS['Comma gap (µm)']==cg
    cond1 = list(cond0) + [False]*(len(df_LRCSWMG)-len(cond0))
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond1)
    
    model = OrdinalRidge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 1.375 (0.9921567416492215)
$1.38\pm0.99$
MSE (with 100 out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with 200 out): 0.041666666666666664 (0.19982631347136331)
$0.04\pm0.20$
Overall MSE: 0.47222222222222227 (0.6385868851429249)
$0.47\pm0.64$


### Ridge

In [15]:
scores = []

for cg in dom_CG_LRCS:
    cond0 = df_LRCS['Comma gap (µm)']==cg
    cond1 = list(cond0) + [False]*(len(df_LRCSWMG)-len(cond0))
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond1)
    
    model = Ridge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 0.5466692789717083 (0.6854363128035611)
$0.55\pm0.69$
MSE (with 100 out): 0.00010349416644844821 (0.0)
$0.00\pm0.00$
MSE (with 200 out): 0.11555391895234014 (0.29307740386167275)
$0.12\pm0.29$
Overall MSE: 0.22077556403016563 (0.2352122944981268)
$0.22\pm0.24$


In [19]:
scores = []

for cg in dom_CG_LRCS:
    cond = df_LRCSWMG['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond)
    
    model = Lasso().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 0.5607826658850772 (0.38584886995658146)
$0.56\pm0.39$
MSE (with 100 out): 0.5512486399325527 (1.405299836257161)
$0.55\pm1.41$
MSE (with 200 out): 2.5626978817343646 (0.6969472297309359)
$2.56\pm0.70$
Overall MSE: 1.224909729183998 (0.9459670819645568)
$1.22\pm0.95$


### Lasso

In [20]:
scores = []

for cg in dom_CG_LRCS:
    cond0 = df_LRCS['Comma gap (µm)']==cg
    cond1 = list(cond0) + [False]*(len(df_LRCSWMG)-len(cond0))
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond1)
    
    model = Lasso().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 0.5781093959085035 (0.7008871711757073)
$0.58\pm0.70$
MSE (with 100 out): 0.0004050509189137569 (0.0)
$0.00\pm0.00$
MSE (with 200 out): 0.06668892415658287 (0.24854923900700404)
$0.07\pm0.25$
Overall MSE: 0.21506779032800005 (0.25813148264621427)
$0.22\pm0.26$


## Learning on LRCS+WMG data: one X-out with WMG not providing the missing support

### Ordinal Regression 

In [16]:
scores = []

for cg in dom_CG_LRCS:
    cond = df_LRCSWMG['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond)
    
    model = OrdinalRidge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 1.0789473684210527 (0.480218094497228)
$1.08\pm0.48$
MSE (with 100 out): 0.26903553299492383 (0.994250264053218)
$0.27\pm0.99$
MSE (with 200 out): 0.029411764705882353 (0.16895772489817729)
$0.03\pm0.17$
Overall MSE: 0.45913155537395295 (0.44906096034554)
$0.46\pm0.45$


### Ridge

In [17]:
scores = []

for cg in dom_CG_LRCS:
    cond = df_LRCSWMG['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond)
    
    model = Ridge().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 0.522634683585434 (0.3773472391605363)
$0.52\pm0.38$
MSE (with 100 out): 0.5683329594068245 (1.420643441232057)
$0.57\pm1.42$
MSE (with 200 out): 7.9793165139722415 (1.1118429267436363)
$7.98\pm1.11$
Overall MSE: 3.0234280523215005 (3.504391998273545)
$3.02\pm3.50$


### Lasso

In [19]:
scores = []

for cg in dom_CG_LRCS:
    cond = df_LRCSWMG['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond)
    
    model = Lasso().fit(Xtr.reshape(-1,1),ytr)
    score = eval_metric(model,Xte,yte,cg,False)
    scores.append(score)
    
print('Overall MSE: {0} ({1})'.format(np.mean(scores),np.std(scores)))
print('${0:.2f}\pm{1:.2f}$'.format(np.mean(scores),np.std(scores)))

MSE (with 75 out): 0.5607826658850772 (0.38584886995658146)
$0.56\pm0.39$
MSE (with 100 out): 0.5512486399325527 (1.405299836257161)
$0.55\pm1.41$
MSE (with 200 out): 2.5626978817343646 (0.6969472297309359)
$2.56\pm0.70$
Overall MSE: 1.224909729183998 (0.9459670819645568)
$1.22\pm0.95$
