Import libraries

In [1]:
import os

import pandas as pd

import numpy as np

from scipy import stats
import scipy.stats as sc
from scipy.cluster.hierarchy import linkage, leaves_list

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from learner.mlearner import learn_with_interactions, learn_without_interactions, sample_random, stepwise_feature_selection
from learner.model import genModelTermsfromString, Model, genModelfromCoeff


from import_data import load_data

In [2]:
!conda list

# packages in environment at /home/llesoil/anaconda3/envs/x264:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main  
_tflow_select             2.3.0                       mkl  
absl-py                   0.9.0                    py37_0  
asn1crypto                1.3.0                    py37_0  
astor                     0.8.0                    py37_0  
attrs                     19.3.0                     py_0  
backcall                  0.1.0                    py37_0  
blas                      1.0                         mkl  
bleach                    3.1.4                      py_0  
blinker                   1.4                      py37_0  
c-ares                    1.15.0            h7b6447c_1001  
ca-certificates           2020.6.20            hecda079_0    conda-forge
cachetools                3.1.1                      py_0  
certifi                   2020.6.20        py37hc8dfbb8_0 

Import data

In [3]:
listVideo = load_data(drop_default=True)

nbVideos = len(listVideo)

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true-y_pred)/y_true))

predDimension="etime"

   Unnamed: 0  configurationID no_8x8dct no_asm no_cabac no_deblock  \
0           0                1      True  False    False       True   
1           1             1001     False  False     True       True   
2           2             1002      True  False    False      False   
3           3             1003     False  False    False       True   
4           4             1004     False  False     True      False   

  no_fast_pskip no_mbtree no_mixed_refs no_weightb  rc_lookahead  ref  frames  \
0          True     False          True       True          20.0  9.0    1374   
1          True      True          True       True          20.0  1.0    1374   
2          True     False          True       True          60.0  5.0    1374   
3          True      True         False      False          20.0  9.0    1374   
4         False      True          True      False          20.0  9.0    1374   

     cpu       fps     kbs  etime       size  
0  703.2  1315.615  225.03  1.052  1289

# L2s implementation

### Step 1: Extraction Process of Performance Models

Select a good model for predicting the performance of the source video

Original files:
- https://github.com/cmu-mars/model-learner/blob/tutorial/learner/mlearner.py for the stepwise selection
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html for the interactions

In [4]:
# @PooyanJamshidi:
# We just change slightly some functions from the original repository,
# mainly because we don't want to add a constant in the model
# + steps 2 and 3 were implemented in matlab but we did not find them in python

def stepwise_selection(X, y,
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    
    ndim = X.shape[1]
    features = [i for i in range(ndim)]
    included = list(initial_list)
    
    while True:
        changed=False
        
        # forward step (removed a constant)
        excluded = list(set(features)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, pd.DataFrame(X[included+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add {:30} with p-value {:.5}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, pd.DataFrame(X[included])).fit()
        pvalues = model.pvalues
        worst_pval = pvalues.max()
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.5}'.format(worst_feature, worst_pval))
        if not changed:
            if verbose:
                print("Construction of the model completed!")
            break
    return included

In [5]:
# to sample the source and the target using the same seed
random_state = np.random.randint(0,1000)

# a list of features to keep as explicative variables
keep_features = ['no_8x8dct','no_asm', 'no_cabac','no_deblock','no_fast_pskip', 
                 'no_mbtree','no_mixed_refs','no_weightb','rc_lookahead','ref']

# ordinal data to convert into dummies
to_dummy_features = ['rc_lookahead','ref']


# percentage of configuration used for test
pct_test = 0.7

# the source video
source = listVideo[1]


# transform some variables into dummies, to fit the orginal paper
# since we don't want to introduce a meaningless constant in the model, 
# we have to keep all columns

dummies = pd.get_dummies(source[keep_features], 
                   drop_first = False,
                   columns=to_dummy_features)

X_src = pd.DataFrame(np.array(dummies, dtype=int))


# add interactions
poly = PolynomialFeatures(degree=2, interaction_only = True, include_bias = True)
X_interact = pd.DataFrame(np.array(poly.fit_transform(X_src),int))

# performance variable, to predict
y_src = np.array(source[predDimension], dtype=float)

# split train test
X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_interact, 
                                                                    y_src, 
                                                                    test_size=pct_test, 
                                                                    random_state=random_state)

# the index of the selected features
selected_features = stepwise_selection(X_interact, y_src)

Add                              0 with p-value 0.0


  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Add                             12 with p-value 3.6016e-115
Add                              7 with p-value 2.6613e-135
Add                             82 with p-value 6.4634e-110
Add                              6 with p-value 2.4987e-76
Add                             13 with p-value 2.8128e-87
Add                             14 with p-value 0.0
Add                              3 with p-value 8.6971e-72
Add                             84 with p-value 1.7216e-79
Add                              1 with p-value 3.3535e-85
Add                             83 with p-value 3.7657e-74
Add                             75 with p-value 7.1947e-58
Add                             70 with p-value 6.7114e-69
Add                              9 with p-value 1.3804e-79
Add                             72 with p-value 3.0565e-115
Add                             42 with p-value 3.7652e-66
Add                             10 with p-value 6.438e-24
Add                             11 with p-value 0.0
Add     

### Step 2: Active Sampling

#### A - ] Exploitation : use the source's prediction model

##### (i) Sort the coefficients of the previous constructed model

##### (ii) Choose the coefficient with the highest value

##### (iii) Select the configurations with this feature activated



I assumed it was recursive, with a decreasing influence in the selection for a decreasing importance in the regression.

In [6]:
ratio_exploitation = 0.3
config_tot = 200

reg = LinearRegression()

reg.fit(X_interact[selected_features], y_src)

sorted_coefs = pd.Series(np.abs(reg.coef_), selected_features).sort_values(ascending=False).index

nb_config_exploitation = np.round(ratio_exploitation*config_tot)

nb_config_selected = 0

assert X_interact.shape[0] >= nb_config_exploitation ; " Too many configurations to select ! "

def select_exploitation(df, sc, config_selected):
    
    #number of config left to choose
    nb_config = int(nb_config_exploitation - len(config_selected))
    
    if nb_config == 0:
        print("Done!\n")
        return config_selected
    
    # if we don't have any important coefficient left to help us choose configs
    # we take the nb_config first configurations
    if len(sc) == 0:
        print("Selecting " + str(nb_config) + " configurations from the rest of the dataset!")
        for conf in df.index[0:nb_config]:
            config_selected.append(conf)
        return config_selected
    
    # otherwise we just use the best coef to choose configs
    else:
        
        # we choose the best features coef (biggest absolute value)
        most_important_coef = sc[0]
        
        print("Feature : " + str(most_important_coef))
        
        # configs with this feature activated
        imp_index = np.where(df[most_important_coef]==1)[0]

        # number of configs with this feature activated
        nb_imp_index = len(imp_index)

        # if we have more values to choose 
        # than the number of configurations with the best feature activated
        # we add all the configuration to the selected set
        # and we select the rest of the configuration based on other coefficients
        if nb_imp_index <= nb_config:
            for conf in df.iloc[imp_index].index:
                config_selected.append(conf)
            if nb_imp_index > 0:
                print("Added "+str(nb_imp_index)+ " values, "+str(nb_config-nb_imp_index)+" left to choose \n")
            # then we apply recursively this method to the rest of the dataframe
            return select_exploitation(df.iloc[np.where(df[most_important_coef]==0)[0]], 
                                          sc[1:len(sc)],
                                          config_selected)
        
        # otherwise we have enough values with this features activated
        # to select all the remaining configurations
        # so we apply the method to the dataframe containing all the feature activated
        # and we select the configuration by using the followings features
        else:
            return select_exploitation(df.iloc[imp_index], 
                                 sc[1:len(sc)], 
                                 config_selected)

exploitation_conf = select_exploitation(X_interact, sorted_coefs, [])

print("Selected : " + str(exploitation_conf))

Feature : 6
Feature : 72
Feature : 73
Feature : 74
Feature : 12
Feature : 13
Feature : 14
Feature : 11
Feature : 10
Feature : 9
Feature : 7
Added 32 values, 28 left to choose 

Feature : 82
Feature : 83
Feature : 84
Feature : 0
Feature : 3
Added 16 values, 12 left to choose 

Feature : 75
Feature : 70
Feature : 1
Added 8 values, 4 left to choose 

Feature : 42
Feature : 50
Feature : 16
Feature : 93
Feature : 77
Feature : 27
Feature : 19
Feature : 43
Feature : 45
Feature : 79
Feature : 4
Added 4 values, 0 left to choose 

Done!

Selected : [1, 13, 76, 116, 125, 128, 163, 205, 241, 280, 367, 378, 412, 431, 455, 466, 519, 577, 618, 682, 750, 765, 790, 804, 817, 835, 875, 900, 942, 1005, 1009, 1100, 21, 137, 195, 287, 405, 407, 514, 616, 641, 778, 832, 848, 874, 897, 973, 1110, 5, 620, 633, 949, 1018, 1075, 1098, 1150, 12, 625, 747, 1030]


#### B-] Exploration : Select specific configurations, similar between the source and the target

In [7]:
ratio_exploration = 1-ratio_exploitation
nb_exploration = int(config_tot*ratio_exploitation)

# I choose to select the group in one step:
# if you select config per config, you may choose a local optimal

def select_exploration(df, exploitation_conf, id_target, number_group = 100):
    
    target = listVideo[id_target]
    
    # all the config left for exploration
    # total minus those chosen for exploitation
    explor_conf = np.setdiff1d(df.index, exploitation_conf)
    
    # initialization : we take the first nb_exploration config
    best_explor = explor_conf[0:nb_exploration]
    
    # we group it with the exploitation configurations
    conf = np.concatenate((exploitation_conf, best_explor), axis=0)
    # for the moment, it's our best entropy
    best_entropy  = sc.entropy(target.iloc[conf][predDimension], source.iloc[conf][predDimension])
    
    # then we incrementally select the configurations to diminish the entropy 
    group_counter = 0
    
    while group_counter < number_group:
        
        group_counter +=1
        
        # current group to 'challenge' the best result
        np.random.shuffle(explor_conf)
        current_explor = explor_conf[0:nb_exploration]
        
        # we group it with the exploitation configurations
        conf = np.concatenate((exploitation_conf, current_explor), axis=0)
        
        # we compute the Kullback Leibler divergence between the source and the target
        current_entropy = sc.entropy(target.iloc[conf][predDimension], source.iloc[conf][predDimension])
        
        # we finally take the group giving the lowest entropy
        # if this group is better than the best group, we replace it by the new one
        if current_entropy > best_entropy:
            print("Entropy gained : "+str(current_entropy-best_entropy))
            best_entropy = current_entropy
            best_explor = current_explor
    
    return best_explor

print("\nConfigurations kept for exploration : \n" + 
      str(select_exploration(X_interact, exploitation_conf, 0, 1000)))

Entropy gained : 0.000547608540106108
Entropy gained : 0.0002969061712895637
Entropy gained : 0.00014167539197548762
Entropy gained : 0.0004862862461085379
Entropy gained : 0.0002133764349618152
Entropy gained : 3.961097664649787e-05

Configurations kept for exploration : 
[ 522  970  261  260  880  352 1115 1029  208  842   87   52 1151  289
   98  161 1021   26  319  686  828 1044  575  669  239  442  703  584
  583  769  546  450  443  906  349  696  800  707  283 1102  211  240
  602 1004  570  713  408   35  353 1127 1117  492  253  104  679  909
   88  475  318  659]


### Step 3 : Transfer the knowledge

In [8]:
def l2s_transfer(i, j, ratio_exploitation = 0.3, l2s_tr_ratio = 0.5, pct_test = 0.7):
    
    # to sample the source and the target using the same seed
    random_state = np.random.randint(0,1000)

    # a list of features to keep as explicative variables
    keep_features = ['no_8x8dct','no_asm', 'no_cabac','no_deblock','no_fast_pskip', 
                     'no_mbtree','no_mixed_refs','no_weightb','rc_lookahead','ref']

    # ordinal data to convert into dummies
    to_dummy_features = ['rc_lookahead','ref']

    # the source video
    source = listVideo[i]
    
    # the number of config used in the training
    config_tot = int(l2s_tr_ratio*(1-pct_test)*source.shape[1])

    # transform some variables into dummies, to fit the orginal paper
    # since we don't want to introduce a meaningless constant in the model, 
    # we have to keep all columns

    dummies = pd.get_dummies(source[keep_features], 
                       drop_first = False,
                       columns=to_dummy_features)

    X_src = pd.DataFrame(np.array(dummies, dtype=int))


    # add interactions
    poly = PolynomialFeatures(degree=2, interaction_only = True, include_bias = True)
    X_interact = pd.DataFrame(np.array(poly.fit_transform(X_src),int))

    # performance variable, to predict
    y_src = np.array(source[predDimension], dtype=float)

    # split train test (-> we only use X_src_train to sample l2s)
    X_src_train, X_src_test, y_src_train, y_src_test = train_test_split(X_interact, 
                                                                        y_src, 
                                                                        test_size=pct_test, 
                                                                        random_state=random_state)

    # we train the model with the training data
    print("\n############### I- Stepwise regression #################\n")
    
    selected_features = stepwise_selection(X_src_train, y_src_train)
    
    print("\n############### II- Sampling #################\n")
    
    reg = LinearRegression()

    reg.fit(X_src_train[selected_features], y_src_train)

    sorted_coefs = pd.Series(np.abs(reg.coef_), selected_features).sort_values(ascending=False).index

    nb_config_exploitation = np.round(ratio_exploitation*config_tot)
    
    print("A- EXPLOITATION\n")
    
    exploitation_conf = select_exploitation(X_src_train, sorted_coefs, [])
    
    print("\nB- EXPLORATION\n")
    
    exploration_conf = select_exploration(X_src_train, exploitation_conf, j, 1000)
    
    sampled_conf=np.concatenate((exploitation_conf,exploration_conf), axis=0)
    
    print(sampled_conf)
    
    print("\n############### III- Transfer #################\n")
    
    # we split the source and the target
    
    target = listVideo[j]
    
    _, X_src_te, _, y_src_te = train_test_split(source[keep_features], 
                                                                    source[predDimension], 
                                                                    test_size=pct_test, 
                                                                    random_state=random_state)
    
        
    _, X_tgt_te, _, y_tgt_te = train_test_split(target[keep_features], 
                                                                    target[predDimension],  
                                                                    test_size=pct_test, 
                                                                    random_state=random_state)
    X_src_tr = source[keep_features].iloc[sampled_conf]
    y_src_tr = source[predDimension].iloc[sampled_conf]
    
    X_tgt_tr = target[keep_features].iloc[sampled_conf]
    y_tgt_tr = target[predDimension].iloc[sampled_conf]
    
    lf = LinearRegression()
    lf.fit(X_src_tr, y_src_tr)
    y_src_pred_te = np.array(lf.predict(X_src_te)).reshape(-1,1)
    
    # The shift function, to transfer the prediction from the source to the target
    shift = LinearRegression()
    shift.fit(np.array(y_src_tr).reshape(-1,1), y_tgt_tr)
    y_tgt_pred_te = shift.predict(y_src_pred_te)
    
    # We return the mean average percentage error 
    # between the real values of y_test from target 
    # and the predictions shifted 
    return min(mape(y_tgt_te, y_tgt_pred_te),1)
    
    
l2s_transfer(1,2)


############### I- Stepwise regression #################

Add                              0 with p-value 5.1783e-205
Add                             12 with p-value 1.1809e-35


  return self.params / self.bse
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Add                              7 with p-value 1.1977e-44
Add                             82 with p-value 7.6015e-29
Add                              6 with p-value 1.9188e-25
Add                             13 with p-value 8.3363e-32
Add                             14 with p-value 2.7087e-198
Add                             84 with p-value 1.1647e-20
Add                              3 with p-value 7.8187e-29
Add                              1 with p-value 9.6967e-23
Add                             83 with p-value 5.5034e-21
Add                             75 with p-value 5.4457e-18
Add                             70 with p-value 2.1972e-22
Add                              9 with p-value 1.1661e-18
Add                             72 with p-value 5.6538e-35
Add                             42 with p-value 1.1999e-19
Add                             11 with p-value 1.4369e-07
Add                             10 with p-value 4.9131e-253
Add                             73 with p-value 1.7055

0.14900471315052735