In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, LeavePOut
from sklearn.neighbors import KNeighborsClassifier

import sys
sys.path.insert(0,'../')
import common.datautils as datautils
import common.featuresutils as featuresutils

In [None]:
# Load the CE results archive via the project helper and preview the first rows.
# NOTE(review): the result is used below like a pandas DataFrame with
# "Benchmark", "Flags", "Energy" and "Time" columns -- confirm against datautils.
ce_data = datautils.load_ce_results('CE.results.zip')
ce_data.head()

In [None]:
# Distinct values of the "Benchmark" column. This module-level `benchmarks`
# array is read as a global by most of the functions defined further down.
benchmarks = ce_data["Benchmark"].unique()
print(benchmarks)

In [None]:
# Load the feature matrix X and labels y for the benchmarks listed above.
# NOTE(review): y is later compared against benchmark names in leave_p_out, so
# with_names=True presumably makes y hold benchmark names -- confirm in featuresutils.
X, y = featuresutils.load_features(benchmarks, with_dwarf=False, with_names=True)
# Standardise every feature column; the scaler is fitted on the full data set.
X = StandardScaler().fit_transform(X)

In [None]:
# Load the cross-apply results: one row per (ReferenceBenchmark, ApplyToBenchmark)
# pair is what the lookups further down rely on.
# NOTE(review): successful_only=False presumably keeps failed runs in the frame
# as well -- confirm against datautils.load_csv_results.
cross_apply_data = datautils.load_csv_results(
    'cross_apply.results.csv',
    ['ReferenceBenchmark','ApplyToBenchmark', 'Success'], 
    benchmark_cols=['ReferenceBenchmark', 'ApplyToBenchmark'],
    successful_only=False
)
cross_apply_data.head()

In [None]:
# Spot check of the helper used by calculate_percent_of_best, for one benchmark.
# NOTE(review): presumably returns the 'Energy' and 'Time' values of the
# configuration that is best with respect to 'Energy' -- confirm in datautils.
datautils.best_configuration_data('Energy', 'botsspar', ce_data, ['Energy', 'Time'])

In [None]:
def percentage_difference(best_value, measured_value):
    """Return how far ``measured_value`` is from ``best_value``, in percent.

    The absolute gap between the two values is expressed relative to
    ``best_value``, scaled to a percentage and rounded to a whole number with
    ``np.round``. Scalars as well as array-like / Series inputs are accepted
    because only element-wise NumPy operations are used.
    """
    absolute_gap = np.abs(best_value - measured_value)
    relative_gap = absolute_gap / best_value
    return np.round(relative_gap * 100)

def calculate_percent_of_best():
    """Annotate the global ``cross_apply_data`` with distance-from-best columns.

    For every entry of the global ``benchmarks`` the reference values are
    fetched with ``datautils.best_configuration_data`` (element 0 is used for
    energy, element 1 for time). The rows of ``cross_apply_data`` whose
    "ApplyToBenchmark" equals that benchmark then receive
    'EnergyPercentOfBest' and 'TimePercentOfBest', computed with
    ``percentage_difference``. The frame is modified in place; nothing is
    returned.
    """
    # (source column, position in the best-configuration result, output column)
    metric_specs = (
        ("Energy", 0, 'EnergyPercentOfBest'),
        ("Time", 1, 'TimePercentOfBest'),
    )

    for name in benchmarks:
        best = datautils.best_configuration_data('Energy', name, ce_data, ['Energy', 'Time'])

        # Snapshot of the matching rows, taken before any column is written.
        target_rows = cross_apply_data[cross_apply_data["ApplyToBenchmark"] == name]

        for source_column, position, output_column in metric_specs:
            cross_apply_data.loc[target_rows.index, output_column] = percentage_difference(
                best[position], target_rows[source_column]
            )
        

# Adds 'EnergyPercentOfBest' / 'TimePercentOfBest' to cross_apply_data in place
# (the function mutates the module-level frame), then previews the result.
calculate_percent_of_best()
        
cross_apply_data.head()

In [None]:
# Show every cross-apply row that used 'botsspar' as the reference benchmark.
cross_apply_data[cross_apply_data['ReferenceBenchmark'] == 'botsspar']

In [None]:
def plot_percentage_difference(data):
    """Plot a benchmark-by-benchmark grid of squares sized by energy gap.

    The x position is the index of the reference benchmark and the y position
    the index of the benchmark it was applied to, both taken from the global
    ``benchmarks``. The marker area is the pair's "EnergyPercentOfBest" value
    (first matching row of ``data``), capped at 100.

    Diagonal cells (a benchmark applied to itself) are always drawn with size
    0, so ``data`` is not consulted for them and does not need to contain
    self-apply rows. Off-diagonal pairs that are missing from ``data`` or whose
    value is NaN are left empty instead of aborting the whole figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns "ReferenceBenchmark", "ApplyToBenchmark" and
        "EnergyPercentOfBest".
    """
    plt.figure(figsize=(14,14))

    points = np.arange(0, len(benchmarks))

    xs, ys, sizes = [], [], []
    for i, reference_benchmark in enumerate(benchmarks):
        for j, apply_to_benchmark in enumerate(benchmarks):
            if i == j:
                # Size-0 marker: invisible, but keeps the point in the
                # collection exactly as before.
                marker_size = 0
            else:
                pair_values = data.loc[
                    (data["ReferenceBenchmark"] == reference_benchmark)
                    & (data["ApplyToBenchmark"] == apply_to_benchmark),
                    "EnergyPercentOfBest",
                ]
                if pair_values.empty or pd.isna(pair_values.iloc[0]):
                    # No usable measurement for this pair: draw nothing.
                    continue
                # Cap the marker area so a single outlier cannot swamp the grid.
                marker_size = min(pair_values.iloc[0], 100)

            xs.append(i)
            ys.append(j)
            sizes.append(marker_size)

    # One scatter call for the whole grid instead of one call per cell.
    plt.scatter(xs, ys, marker='s', c='k', s=sizes)

    plt.title("Percentage difference ")
    plt.xlabel("Reference benchmark")
    plt.ylabel("Applied to benchmark")
    plt.xticks(points, benchmarks, rotation='vertical')
    plt.yticks(points, benchmarks)
    
# Render the grid for the cross-apply results (needs the 'EnergyPercentOfBest'
# column added by calculate_percent_of_best above).
plot_percentage_difference(cross_apply_data)

### Leave-P-Out Cross-Validation

In [None]:
def add_value(dict, key, value):
    """Append ``value`` to the list stored under ``key`` in ``dict``.

    A new empty list is created first when ``key`` is not present yet. The
    mapping is modified in place and nothing is returned.
    """
    # NOTE: the parameter name shadows the built-in ``dict``; it is kept as is
    # so that existing keyword callers keep working.
    dict.setdefault(key, []).append(value)
    

def leave_p_out(p, data):
    """Average energy per benchmark under leave-p-out 1-nearest-neighbour selection.

    For every leave-p-out split of the global feature matrix ``X`` / labels
    ``y``, a 1-NN classifier is fitted on the training part and predicts a
    reference benchmark for each held-out sample. The "Energy" recorded in
    ``data`` for the pair (predicted reference, held-out benchmark) is
    collected per held-out benchmark and averaged over all splits.

    Parameters
    ----------
    p : int
        Number of samples held out per split (passed to ``LeavePOut``).
    data : pandas.DataFrame
        Cross-apply results with the columns "ReferenceBenchmark",
        "ApplyToBenchmark" and "Energy". When a pair occurs several times the
        first row wins.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(len(benchmarks), 1)`` with the mean energy
        for each entry of the global ``benchmarks``, in that order.

    Raises
    ------
    IndexError
        If ``data`` has no row for a (predicted reference, held-out) pair.
    """
    # Build the (reference, apply-to) -> energy table once. Filtering the whole
    # frame for every prediction inside the split loop is loop-invariant work
    # and dominates the runtime for larger p.
    first_per_pair = data.drop_duplicates(subset=["ReferenceBenchmark", "ApplyToBenchmark"])
    energy_by_pair = dict(zip(
        zip(first_per_pair["ReferenceBenchmark"], first_per_pair["ApplyToBenchmark"]),
        first_per_pair["Energy"],
    ))

    benchmarks_energy = {}

    for train_index, test_index in LeavePOut(p).split(X):
        nn = KNeighborsClassifier(n_neighbors=1)
        nn.fit(X[train_index], y[train_index])
        predictions = nn.predict(X[test_index])

        for apply_to_benchmark, predicted_benchmark in zip(y[test_index], predictions):
            pair = (predicted_benchmark, apply_to_benchmark)
            if pair not in energy_by_pair:
                raise IndexError(
                    "no cross-apply result for reference {!r} applied to {!r}".format(
                        predicted_benchmark, apply_to_benchmark
                    )
                )
            benchmarks_energy.setdefault(apply_to_benchmark, []).append(energy_by_pair[pair])

    averages = [np.mean(benchmarks_energy[benchmark]) for benchmark in benchmarks]

    return np.array(averages).reshape(-1, 1)
            
def compare_p(data):
    """Tabulate the ``leave_p_out`` averages for p = 1, 2 and 3.

    Returns a DataFrame indexed by the global ``benchmarks`` with the columns
    'LeaveOneOut', 'LeaveTwoOut' and 'LeaveThreeOut'; each column is the
    result of ``leave_p_out`` for the corresponding p.
    """
    # (column label, p) pairs; p = 4 / 'LeaveFourOut' is currently disabled.
    schemes = (
        ('LeaveOneOut', 1),
        ('LeaveTwoOut', 2),
        ('LeaveThreeOut', 3),
    )

    per_scheme = [leave_p_out(p, data) for _, p in schemes]
    table = np.concatenate(per_scheme, axis=1)
    labels = [label for label, _ in schemes]

    return pd.DataFrame(table, index=benchmarks, columns=labels)


def cross_o3_relative_data(leave_out_data, average_data, benchmarks,
                           columns=("LeaveOneOut", "LeaveTwoOut", "LeaveThreeOut")):
    """Express leave-p-out energies relative to each benchmark's -O3 energy.

    Parameters
    ----------
    leave_out_data : pandas.DataFrame
        Indexed by benchmark, with one energy column per entry of ``columns``.
    average_data : pandas.DataFrame
        Needs the columns "Benchmark", "Flags" and "Energy". The first row
        with ``Flags == "-O3"`` for a benchmark supplies its baseline.
    benchmarks : iterable
        Benchmarks (index labels of ``leave_out_data``) to normalise.
    columns : iterable of str, optional
        Columns of ``leave_out_data`` to divide by the baseline. Defaults to
        the three columns produced by ``compare_p``.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``leave_out_data`` with the selected cells divided by
        the benchmark's -O3 energy. The input frame is not modified.

    Raises
    ------
    IndexError
        If a benchmark has no "-O3" row in ``average_data``.
    """
    o3_data = average_data.loc[average_data["Flags"] == "-O3"]
    # First -O3 row per benchmark, looked up by name. The mask is derived from
    # o3_data itself rather than from the longer average_data frame.
    o3_energy = o3_data.drop_duplicates(subset="Benchmark").set_index("Benchmark")["Energy"]

    relative_data = leave_out_data.copy(deep=True)
    columns = list(columns)

    for benchmark in benchmarks:
        if benchmark not in o3_energy.index:
            raise IndexError(
                "no -O3 row for benchmark {!r} in average_data".format(benchmark)
            )
        baseline = o3_energy.loc[benchmark]

        for column in columns:
            relative_data.loc[benchmark, column] /= baseline

    return relative_data


def ce_o3_relative_data(average_data, benchmarks):
    """Express "Energy" and "Time" relative to each benchmark's -O3 run.

    Parameters
    ----------
    average_data : pandas.DataFrame
        Needs the columns "Benchmark", "Flags", "Energy" and "Time". The first
        row with ``Flags == "-O3"`` for a benchmark supplies its baseline.
    benchmarks : iterable
        Benchmarks to normalise. Rows of other benchmarks are copied through
        unchanged.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``average_data`` in which, for every listed benchmark,
        "Energy" and "Time" are divided by the benchmark's -O3 energy and
        time. The input frame is not modified.

    Raises
    ------
    IndexError
        If a benchmark has no "-O3" row in ``average_data``.
    """
    o3_data = average_data.loc[average_data["Flags"] == "-O3"]

    relative_data = average_data.copy(deep=True)

    for benchmark in benchmarks:
        # Mask o3_data with its own column rather than with a mask built from
        # the longer frame.
        o3 = o3_data.loc[o3_data["Benchmark"] == benchmark]
        if o3.empty:
            raise IndexError(
                "no -O3 row for benchmark {!r} in average_data".format(benchmark)
            )
        baseline = o3.iloc[0]

        # "Benchmark" is never modified, so one mask serves both columns.
        rows = relative_data["Benchmark"] == benchmark
        relative_data.loc[rows, "Energy"] /= baseline["Energy"]
        relative_data.loc[rows, "Time"] /= baseline["Time"]

    return relative_data

In [None]:
# Energy/Time of every CE result expressed relative to the benchmark's -O3 run.
ce_relative_data = ce_o3_relative_data(ce_data, benchmarks)
ce_relative_data.head()

In [None]:
# Run the leave-1/2/3-out comparison on the cross-apply results and express
# the resulting mean energies relative to each benchmark's -O3 energy.
cross_relative_data = cross_o3_relative_data(compare_p(cross_apply_data), ce_data, benchmarks)
cross_relative_data.head()

In [None]:
def double_plot_relative_to_o3(cross_relative_data, ce_relative_data, excluded=('lbm',)):
    """Compare best-known and 1-NN-selected energy, both relative to -O3.

    For every entry of the global ``benchmarks`` that is not excluded, two
    points are drawn: the minimum relative "Energy" found in
    ``ce_relative_data`` ("Best Known") and the 'LeaveThreeOut' value from
    ``cross_relative_data``. A horizontal line at 1 marks the -O3 baseline.

    Parameters
    ----------
    cross_relative_data : pandas.DataFrame
        Indexed by benchmark, with a 'LeaveThreeOut' column.
    ce_relative_data : pandas.DataFrame
        Needs the columns "Benchmark" and "Energy".
    excluded : collection of str, optional
        Benchmarks to leave out of the figure (pass a tuple/list/set, not a
        bare string). Defaults to ``('lbm',)``: the original notebook dropped
        'lbm' with the remark "Something wrong here". Names that do not occur
        in ``benchmarks`` are ignored.
    """
    # Filter instead of list.remove(), which raises when the name is absent.
    plotted = [benchmark for benchmark in benchmarks if benchmark not in excluded]

    best_known = [
        ce_relative_data.loc[ce_relative_data["Benchmark"] == benchmark, 'Energy'].min()
        for benchmark in plotted
    ]
    # Only the leave-three-out series is plotted; the 'LeaveOneOut' and
    # 'LeaveTwoOut' columns can be added the same way if needed.
    ltho_energy = [cross_relative_data.loc[benchmark, 'LeaveThreeOut'] for benchmark in plotted]

    plt.figure(figsize=(12, 6))
    plt.scatter(plotted, best_known, marker='<', label='Best Known')
    plt.scatter(plotted, ltho_energy, marker='v', label='1NN (Leave Three Out)')

    plt.axhline(1, label='O3')

    plt.title('Energy relative to -O3: best known vs. 1NN (Leave Three Out)')
    plt.ylabel('Energy (relative to -O3)')
    plt.xticks(rotation=90)
    plt.legend()
    plt.grid()
    
    
# Draw the comparison figure from the two -O3-relative frames computed above.
double_plot_relative_to_o3(cross_relative_data, ce_relative_data)