In [302]:
import pandas as pd
import numpy as np
from pathlib import Path
import scipy.interpolate as sci

In [303]:
from sklearn.metrics import mean_squared_error

def mse(value_true, value_prediction):
    return mean_squared_error(value_true, value_prediction)

def rmse(value_true, value_prediction):
    return mean_squared_error(value_true, value_prediction, squared=False)

# Data Functions

In [304]:
# read dataset from a path
def read_dataset_single(path):
    #returns a panda object with just the 3 columns Node, y, x
    df = pd.read_csv(path, sep='\t')
    df = df[['Node Name', 'Total-time', 'Data Size']]
    return df

# read dataset from a list of paths
def read_dataset_all(name_list):
    # returns a panda object with all datasets combined
    df_result = pd.DataFrame(['Node Name', 'Total-time', 'Data Size'])
    
    for path in name_list:
        df_specific_result = read_dataset_single(path)
        df_result = pd.concat([df_result, df_specific_result])
        
    return df_result


# build a cleaned dataset with a minimum number of nodes, determined by the threshold
def clean_data(df, threshold):
    
    nodename_counts = df['Node Name'].value_counts()
    valid_nodenames = nodename_counts[nodename_counts >= threshold].index.tolist()
    
    return df[df['Node Name'].isin(valid_nodenames)]


# returns array with all the unique nodenames
def give_nodenames(df):
    return np.unique(df['Node Name'].values)


# returns just the x and y values for a Node as np.arrays
# they are returned as sorted (by x) and unique
# sort values (for easier fitting and interpolation)
def give_x_and_y_per_node(df_general_result, name):

    df = df_general_result[df_general_result['Node Name'] == name]
    x, y = df['Data Size'].values, df['Total-time'].values
    
    unique_x, unique_indices = np.unique(x, return_index=True)
    unique_y = y[unique_indices]

    sorted_indices = np.argsort(unique_x)
    
    return unique_x[sorted_indices], unique_y[sorted_indices]

# Plotting Function

In [305]:
import matplotlib.pyplot as plt

def plot_data_and_fitted_functions(x_values, y_values, function_to_plot, titel_name):
    
    fig, ax = plt.subplots()

    # plot x,y
    ax.scatter(x_values, y_values, label="Data Points", color='b', marker='o')

    # create estimations for plotting the function
    x_range = np.linspace(min(x_values), max(x_values), 100)
    y_range = function_to_plot(x_range)

    ax.plot(x_range, y_range, label="function for estimation", color='r')

    ax.set_xlabel("X Values")
    ax.set_ylabel("Y Values")
    ax.set_title(titel_name)

    ax.legend()

    plt.show()

# Necessery Functions for Polyfit (poly1..4 and log1..4)

In [306]:
#for estimation of Polynominal
def polynominal_estimation(x, coefficients):
    return np.polyval(coefficients, x)

# Curvefitting is optimized for log and exp functions
# Polyfit is optimizied for polynominals
# def log_polynominal_estimation(x, coefficients):
#     return np.log(np.polyval(coefficients, x))

# # for curve fitting
# #--------polynominal----------
def poly_1_curve(x, a, b):
    return np.polyval([a,b], x)

def poly_2_curve(x, a, b, c):
    return np.polyval([a,b,c], x)

def poly_3_curve(x, a, b, c, d):
    return np.polyval([a,b,c,d], x)

def poly_4_curve(x, a, b, c, d, e):
    return np.polyval([a,b,c,d,e], x)

#----------log----------
def log_1_curve(x, a, b):
    return np.log(np.polyval([a,b], x))

def log_2_curve(x, a, b, c):
    return np.log(np.polyval([a,b,c], x))

def log_3_curve(x, a, b, c, d):
    return np.log(np.polyval([a,b,c,d], x))

def log_4_curve(x, a, b, c, d, e):
    return np.log(np.polyval([a,b,c,d,e], x))

# Fitting Functions

In [307]:
from scipy.optimize import curve_fit


#----------Fitting (Standart)----------
def polynomial_fit(x, y, degree):
    return np.polyfit(x, y, degree)

# # basically same function, but for better understanding here
# def log_polynominal_fitting(x, y_exp, degree):
#     return np.polyfit(x, y_exp, degree)

# deprecated Polynominal.fit is the new one
#

#----------Curve Fitting----------
# for large datasets, u should give a rough guess to speed up the fitting
# here we usually use the results from the previous interpolations
def curve_fitting_coefficients(x_values, y_values, function, guess=''):
    coeff, _ = curve_fit(function, x_values, y_values, p0=guess)
    return coeff

In [308]:
def make_log(x_values, y_values, group_of_methods, Node, Dataset):
    
    list_of_functions = [log_1_curve, log_2_curve, log_3_curve, log_4_curve]
    
    n = len(group_of_methods['log']) +  len(group_of_methods['log_curve'])
    
    return_df = pd.DataFrame(columns=['Reference Genom', 'Node Name', 'prediction method', 'mse', 'rmse'])
    ref_genom = np.repeat( a=Dataset, repeats=n)
    n_name = np.repeat( a=Node, repeats=n)

    method = np.empty(n, dtype = str)
    mses = np.empty(n)
    rmses = np.empty(n)

    
    m = len(group_of_methods['log'])
    coeffs = np.empty(m)
    
    y_log = np.log(y_values)
    
    for i in group_of_methods['log']:
        
        coeff_ = np.polyfit(x=x_values, y=y_log, deg=i)
        y_pred = np.polyval(coeff_, x_values)
        method[i-1] = f"{'Log_Polynominal '} {i}"
        mses[i-1] = np.exp(mse(y_log, y_pred))
        rmses[i-1] = np.exp(rmse(y_log, y_pred))
        coeffs[i-1] = coeff_

        
        
    for i in group_of_methods['log_curve']:

        coeff = curve_fitting_coefficients(x_values, y_log, list_of_functions[i-1], coeffs[i-1])
        y_pred = np.polyval(coeff, x_values)
        method[m+i-1] = f"{'Log_curve '} {i}"
        mses[m+i-1] = np.exp(mse(y_log, y_pred))
        rmses[m+i-1] = np.exp(rmse(y_log, y_pred))

    return_df['Reference Genom'] = ref_genom
    return_df['Node Name'] = n_name
    return_df['prediction method'] = method
    return_df['mse'] = mses
    return_df['rmse'] = rmses
    
    return return_df

In [309]:
def make_poly(x_values, y_values, group_of_methods, Node, Dataset):
    
    list_of_functions = [poly_1_curve, poly_2_curve, poly_3_curve, poly_4_curve]
    
    n = len(group_of_methods['poly']) +  len(group_of_methods['poly_curve'])
    
    return_df = pd.DataFrame(columns=['Reference Genom', 'Node Name', 'prediction method', 'mse', 'rmse'])
    ref_genom = np.repeat( a=Dataset, repeats=n)
    n_name = np.repeat( a=Node, repeats=n)

    method = np.empty(n, dtype = str)
    mses = np.empty(n)
    rmses = np.empty(n)

    m = len(group_of_methods['poly'])
    coeffs = np.empty(m)
    
    for i in group_of_methods['poly']:
        
        coeff_ = np.polyfit(x=x_values, y=y_values, deg=i)
        y_pred = np.polyval(coeff_, x_values)
        method[i-1] = f"{'Polynominal '} {i}"
        mses[i-1] = mse(y_values, y_pred)
        rmses[i-1] = rmse(y_values, y_pred)
        coeffs[i-1] = coeff_

        
        
    for i in group_of_methods['poly_curve']:

        coeff = curve_fitting_coefficients(x_values, y_values, list_of_functions[i-1], coeffs[i-1])
        y_pred = np.polyval(coeff, x_values)
        method[m+i-1] = f"{'Polynominal_curve '} {i}"
        mses[m+i-1] = mse(y_values, y_pred)
        rmses[m+i-1] = rmse(y_values, y_pred)

    return_df['Reference Genom'] = ref_genom
    return_df['Node Name'] = n_name
    return_df['prediction method'] = method
    return_df['mse'] = mses
    return_df['rmse'] = rmses
    
    return return_df

# Spline Functions

In [310]:
import scipy.interpolate as sci
# more complicated than before, but the old stuff was deprecated and mostly legacy code
# here we can also give us the spline functions and how they look
# I implemented the fitting and the interpolation

# for Spline prediction we need some help values

# gives me t_knots as a regular intervall between start and endpoint with k+1 regularity
# if there are too few data points it just takes them in an uniform distance.
# def get_t_Knots_regular(x_values, k_degree):
    
#     r = (2 * k_degree + 1) if (len(x_values) > 2 * (k_degree + 1)) else len(x_values)
    
#     return np.linspace(x_values[0], x_values[-1], r)



# import UnivariateSpline
# scipy.interpolate import make_smoothing_spline
# for smoothing
def get_s_smoothing_approximation_1(y_values):
    return len(y_values) - np.sqrt(2 * len(y_values))

def get_s_smoothing_approximation_2(y_values):
    return len(y_values) * np.var(y_values)


#----------Spline with smoothing----------
# create spline object with smoothing build in (less knots)
# def spline_fitting_with_smoothing_1(x_values, y_values, degree, smooth):
#     return sci.UnivariateSpline(x_values, y_values, k=degree, s=smooth)

# needs weights or smoothing condition
# sci.splrep()

# need knots t
# sci.make_lsq_spline()
    
#----------Fitting----------

# make_smoothing_spline
# needs x,y
# make_smoothing_spline(x, y, w=None, lam=None)
# returns BSpline object

# relevant results for a spline
# Knots, coefficient, degree

# Smoothing Coniditions explanation
https://en.wikipedia.org/wiki/Smoothing_spline

In [311]:
def make_sspline(x_values, y_values, group_of_methods, Node, Dataset):
    # make_smoothing_spline
    # needs x,y
    # make_smoothing_spline(x, y, w=None, lam=None)
    # returns BSpline object
    
    
    return_df = pd.DataFrame(columns=['Reference Genom', 'Node Name', 'prediction method', 'mse', 'rmse'])
    ref_genom = np.repeat( a=Dataset, repeats=1)
    n_name = np.repeat( a=Node, repeats=1)

    method = np.empty(1, dtype = str)
    mses = np.empty(1)
    rmses = np.empty(1)
    
    bspline = sci.make_smoothing_spline(x=x_values, y=y_values)
    y_pred = bspline(x=x_values)
    method[0] = f"{'Smoothing Spline'}"
    mses[0] = mse(y_values, y_pred)
    rmses[0] = rmse(y_values, y_pred)
    
    return_df['Reference Genom'] = ref_genom
    return_df['Node Name'] = n_name
    return_df['prediction method'] = method
    return_df['mse'] = mses
    return_df['rmse'] = rmses
    
    return return_df

In [312]:
def make_uspline(x_values, y_values, group_of_methods, Node, Dataset):
    # univariantspline
    # needs x,y, w or s, k
    # UnivariateSpline(x, y, w=None, bbox=[None, None], k=3, s=None, ext=0, check_finite=False)
    # is an univariantspline object
    
    return_df = pd.DataFrame(columns=['Reference Genom', 'Node Name', 'prediction method', 'mse', 'rmse'])
    
    n = len(group_of_methods['univariant_spline_1']) +  len(group_of_methods['univariant_spline_2'])
    ref_genom = np.repeat( a=Dataset, repeats=n)
    n_name = np.repeat( a=Node, repeats=n)

    method = np.empty(n, dtype = str)
    mses = np.empty(n)
    rmses = np.empty(n)
    
    s1 = get_s_smoothing_approximation_1(y_values)
    s2 = get_s_smoothing_approximation_2(y_values)
    m = len(group_of_methods['univariant_spline_1'])
    
    for i in group_of_methods['univariant_spline_1']:
        uspline = sci.UnivariateSpline(x=x_values, y=y_values, s=s2, k=i)
        method[i-1] = f"{'Univariatn Spline smoothing approximation 1 with degree '} {i} "
        y_pred = uspline(x=x_values)
        mses[i-1] = mse(y_values, y_pred)
        rmses[i-1] = rmse(y_values, y_pred)
    
    
    for i in group_of_methods['univariant_spline_2']:
        uspline = sci.UnivariateSpline(x=x_values, y=y_values, s=s1, k=i)
        method[m+i-1] = f"{'Univariatn Spline smoothing approximation 2 with degree '} {i} "
        y_pred = uspline(x=x_values)
        mses[m+i-1] = mse(y_values, y_pred)
        rmses[m+i-1] = rmse(y_values, y_pred)
    

    
    return_df['Reference Genom'] = ref_genom
    return_df['Node Name'] = n_name
    return_df['prediction method'] = method
    return_df['mse'] = mses
    return_df['rmse'] = rmses
    
    return return_df

In [313]:
def make_splinerep(x_values, y_values, group_of_methods, Node, Dataset):
    # splrep
    # needs x,y, w or s, k
    # splrep(x, y, w=None, xb=None, xe=None, k=3, task=0, s=None, t=None, full_output=0, per=0, quiet=1)
    # returns parameters for a bsplineobject

    return_df = pd.DataFrame(columns=['Reference Genom', 'Node Name', 'prediction method', 'mse', 'rmse'])
    
    n = len(group_of_methods['spline_approximation_1']) +  len(group_of_methods['spline_approximation_2'])
    ref_genom = np.repeat( a=Dataset, repeats=n)
    n_name = np.repeat( a=Node, repeats=n)

    method = np.empty(n, dtype = str)
    mses = np.empty(n)
    rmses = np.empty(n)
    
    s1 = get_s_smoothing_approximation_1(y_values)
    s2 = get_s_smoothing_approximation_2(y_values)
    m = len(group_of_methods['spline_approximation_1'])
    
    for i in group_of_methods['spline_approximation_1']:
        tck =  sci.splrep(x=x_values, y=y_values, s=s1, k=i)
        method[i-1] = f"{'Generell Spline approximation smoothing s1 with degree '} {i} "
        y_pred = sci.splev(x=x_values, tck=tck)
        mses[i-1] = mse(y_values, y_pred)
        rmses[i-1] = rmse(y_values, y_pred)
    
    
    for i in group_of_methods['spline_approximation_2']:
        tck =  sci.splrep(x=x_values, y=y_values, s=s2, k=i)
        method[m+i-1] = f"{'Generell Spline approximation smoothing s2 with degree '} {i} "
        y_pred = sci.splev(x=x_values, tck=tck)
        mses[m+i-1] = mse(y_values, y_pred)
        rmses[m+i-1] = rmse(y_values, y_pred)
    

    return_df['Reference Genom'] = ref_genom
    return_df['Node Name'] = n_name
    return_df['prediction method'] = method
    return_df['mse'] = mses
    return_df['rmse'] = rmses
    
    return return_df

In [314]:
# returns a dict with the min_mse, best method, coefficients of the method and function of the method
def comparing_fitting_methods(x_values, y_values, group_of_methods, Node, Dataset):

    return_df = pd.DataFrame(columns=['Reference Genom', 'Node Name', 'prediction method', 'mse', 'rmse'])

    poly = False
    log = False
    poly_curve = False
    log_curve = False
    s_spline = False
    u_spline_1 = False
    u_spline_2 = False
    splrep_1 = False
    splrep_2 = False


    for key in group_of_methods.keys():
        poly |= key == 'poly'
        log |= key == 'log'
        poly_curve |= key == 'poly_curve'
        log_curve |= key == 'log_curve'
        s_spline |= key == 'smoothing_spline'
        u_spline_1 |= key == 'univariant_spline_1'
        u_spline_2 |= key == 'univariant_spline_2'
        splrep_1 |= key == 'spline_approximation_1'
        splrep_2 |= key == 'spline_approximation_2'



    if poly or poly_curve:
        tmp_df = make_poly(x_values, y_values, group_of_methods, Node, Dataset)
        return_df = pd.concat(return_df, tmp_df)        
        
    if log or log_curve:
        tmp_df = make_log(x_values, y_values, group_of_methods, Node, Dataset)
        return_df = pd.concat(return_df, tmp_df)   
        
    if s_spline:
        tmp_df = make_sspline(x_values, y_values, group_of_methods, Node, Dataset)
        return_df = pd.concat(return_df, tmp_df)     
        
    if u_spline_1 or u_spline_2:
        tmp_df = make_uspline(x_values, y_values, group_of_methods, Node, Dataset)
        return_df = pd.concat(return_df, tmp_df)   
        
    if splrep_1 or splrep_2:
        tmp_df = make_splinerep(x_values, y_values, group_of_methods, Node, Dataset)
        return_df = pd.concat(return_df, tmp_df)

    return return_df

In [315]:
def build_solution_overview_by_genome(path_df, group_of_methods, Dataset):
    file = pd.ExcelFile(path_df)
    df = pd.read_excel(file,'General-Results')
    # df = read_dataset_all(list_of_Paths)
    # df = clean_data(df, threshold)
    nodenames = give_nodenames(df)
    return_df = pd.DataFrame(columns=['Reference Genom', 'Node Name', 'prediction method', 'mse', 'rmse'])

    # result_df = pd.DataFrame(index=nodenames)
    for name in nodenames:
        x_sorted, y_sorted = give_x_and_y_per_node(df, name)
        tmp_df = comparing_fitting_methods(x_sorted, y_sorted, group_of_methods, name, Dataset)
        return_df = pd.concat(return_df, tmp_df) 
        
    name_csv = Path(Dataset + '.csv')
    name_xlsx = Path(Dataset + '.xlsx')
    return_df.to_csv('test.csv')    
 
    with pd.ExcelWriter('test.xlsx') as writer:

        return_df.to_excel(writer)
    
    print(return_df)
    

# Necessary Constants

In [316]:


# path_finals = [Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda/general_results_viral500m.tsv"),
#                Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda/general_results_archea1.4g.tsv"),
#                Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda/general_results_bacteria30g.tsv"),
#                Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda/general_results_bacteria58g.tsv"),
#                Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda/general_results_bacteria88g.tsv"),
#                Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda/general_results_bacteria125g.tsv")]

# name_list = ["viral500m", "archea1.4g", "bacteria30g", "bacteria58", "bacteria88", "bacteria125"]
data_paths = [Path("../general_results_fonda/selected_data/results_archea1.4g_8nodes.xlsx"),
              Path("../general_results_fonda/selected_data/results_archea1.4g_24nodes.xlsx"),
              Path("../general_results_fonda/selected_data/results_bacteria30g_8nodes.xlsx"),
              Path("../general_results_fonda/selected_data/results_bacteria30g_24nodes.xlsx"),
              Path("../general_results_fonda/selected_data/results_bacteria58g_8nodes.xlsx"),
              Path("../general_results_fonda/selected_data/results_bacteria58g_24nodes.xlsx"),
              Path("../general_results_fonda/selected_data/results_bacteria88g_8nodes.xlsx"),
              Path("../general_results_fonda/selected_data/results_bacteria88g_24nodes.xlsx")
]

names = ['archea1.4g_8nodes', 'archea1.4g_24nodes',
         'bacteria30g_8nodes', 'bacteria30g_24nodes',
         'bacteria58g_8nodes', 'bacteria58g_24nodes',
         'bacteria88g_8nodes', 'bacteria88g_24nodes'
]

group_of_methods = {
        # define here what you want
        'poly' : [1,2,3], #polynominal fitting, with degrees
        'poly_curve' : [1,2,3], #polynominal curve fitting, with degrees
        'log' : [1,2,3], #logarithmic fitting, with degrees
        'log_curve': [1,2,3], #logarithmic curvefitting, with degrees
        'smoothing_spline' : [1], #smoothing spline, finds a smooth spline with a degree
        'univariant_spline_1': [1,2,3], #makes an univariant spline (same length of knots), with degree
        'univariant_spline_2': [1,2,3], #makes an univariant spline (same length of knots), with degree
        'spline_approximation_1': [1,2,3], #makes a spline approximation
        'spline_approximation_2': [1,2,3] #makes a spline approximation
    }





threshold=10 #what is the minimum number of datapoints to consider per Node

for p, name in zip(data_paths, names):
    build_solution_overview_by_genome(p, group_of_methods, name)

ValueError: could not convert string to float: 'Polynominal  1'