In [105]:
import pandas as pd
import numpy as np



In [106]:
# Test Test Test



# Data Functions

In [107]:
# read dataset from a path
def read_dataset_single(path):
    """Load one tab-separated result table and keep only the fitting columns.

    Parameters
    ----------
    path : anything ``pandas.read_csv`` accepts (path string, ``Path``, file object).

    Returns
    -------
    pandas.DataFrame
        The columns 'Node Name', 'Total-time' and 'Data Size', in that order.
        Elsewhere in this file 'Data Size' is used as x and 'Total-time' as y.

    Raises
    ------
    KeyError
        If one of the three columns is missing from the file.
    """
    wanted_columns = ['Node Name', 'Total-time', 'Data Size']
    table = pd.read_csv(path, sep='\t')
    return table[wanted_columns]

# read dataset from a list of paths
def read_dataset_all(name_list):
    """Read every file in `name_list` and stack the results into one DataFrame.

    Parameters
    ----------
    name_list : iterable of paths
        Each entry is passed to ``read_dataset_single``.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with the columns 'Node Name', 'Total-time' and
        'Data Size'. Each file keeps its own row index (the index is not
        reset). An empty `name_list` yields an empty frame with those columns.
    """
    column_names = ['Node Name', 'Total-time', 'Data Size']

    # Read everything first and concatenate once. Seeding the result with
    # pd.DataFrame(column_names) would create three junk rows holding the
    # column names as *values*; their NaNs also force 'Node Name' to float.
    frames = [read_dataset_single(path) for path in name_list]
    if not frames:
        return pd.DataFrame(columns=column_names)

    return pd.concat(frames)


# build a cleaned dataset with a minimum number of nodes, determined by the threshold
def clean_data(df, threshold):
    """Keep only the rows of nodes that occur at least `threshold` times.

    Parameters
    ----------
    df : pandas.DataFrame with a 'Node Name' column.
    threshold : minimum number of rows a node needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose 'Node Name' has `threshold` or more rows.
    """
    rows_per_node = df['Node Name'].value_counts()
    frequent_nodes = rows_per_node.index[rows_per_node >= threshold]
    keep_row = df['Node Name'].isin(frequent_nodes)
    return df[keep_row]


# returns array with all the unique nodenames
def give_nodenames(df):
    """Return the distinct values of the 'Node Name' column as a sorted NumPy array."""
    node_column = df['Node Name'].values
    distinct_nodes = np.unique(node_column)
    return distinct_nodes


# returns just the x and y values for a Node as np.arrays
# they are returned as sorted (by x) and unique
# sort values (for easier fitting and interpolation)
def give_x_and_y_per_node(df_general_result, name):
    """Extract the (x, y) samples of one node, sorted by x with duplicate x removed.

    Parameters
    ----------
    df_general_result : pandas.DataFrame
        Needs the columns 'Node Name', 'Data Size' (x) and 'Total-time' (y).
    name : value compared against the 'Node Name' column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The unique 'Data Size' values in ascending order and the matching
        'Total-time' values. When an x value occurs several times, the y of
        its first occurrence is used. Both arrays are empty when no row
        matches `name`.
    """
    node_rows = df_general_result[df_general_result['Node Name'] == name]
    x = node_rows['Data Size'].values
    y = node_rows['Total-time'].values

    # np.unique already returns its values sorted ascending, so no further
    # argsort pass is needed; return_index gives the first occurrence of each x.
    unique_x, first_positions = np.unique(x, return_index=True)

    return unique_x, y[first_positions]



# Plotting Function

In [108]:
import matplotlib.pyplot as plt

def plot_data_and_fitted_functions(x_values, y_values, function_to_plot, titel_name):
    """Show a scatter plot of the data with the fitted function drawn on top.

    Parameters
    ----------
    x_values, y_values : sequences of the measured points.
    function_to_plot : callable taking an array of x positions and returning
        the matching y values.
    titel_name : text used as the plot title.
    """
    figure, axes = plt.subplots()

    # measured points
    axes.scatter(x_values, y_values, label="Data Points", color='b', marker='o')

    # evaluate the function on 100 evenly spaced points across the observed x-range
    curve_x = np.linspace(min(x_values), max(x_values), 100)
    curve_y = function_to_plot(curve_x)
    axes.plot(curve_x, curve_y, label="function for estimation", color='r')

    axes.set(xlabel="X Values", ylabel="Y Values", title=titel_name)
    axes.legend()

    plt.show()

# Necessary functions for interpolation and curve fitting (poly1..4 and log1..4)

In [109]:
#for estimation
def polynominal_estimation(x, coefficients):
    """Evaluate the polynomial with `coefficients` (highest power first) at `x`."""
    estimated_values = np.polyval(coefficients, x)
    return estimated_values

def log_polynominal_estimation(x, coefficients):
    """Return the natural log of the polynomial (`coefficients`, highest power first) at `x`.

    Where the polynomial is not positive, ``np.log`` yields NaN or -inf and
    emits a RuntimeWarning.
    """
    polynomial_values = np.polyval(coefficients, x)
    return np.log(polynomial_values)


# for curve fitting
#--------polynominal----------
def poly_1_curve(x, a, b):
    """Degree-1 model for curve fitting: a*x + b."""
    coefficients = [a, b]
    return np.polyval(coefficients, x)

def poly_2_curve(x, a, b, c):
    """Degree-2 model for curve fitting: a*x**2 + b*x + c."""
    coefficients = [a, b, c]
    return np.polyval(coefficients, x)

def poly_3_curve(x, a, b, c, d):
    """Degree-3 model for curve fitting: a*x**3 + b*x**2 + c*x + d."""
    coefficients = [a, b, c, d]
    return np.polyval(coefficients, x)

def poly_4_curve(x, a, b, c, d, e):
    """Degree-4 model for curve fitting: a*x**4 + b*x**3 + c*x**2 + d*x + e."""
    coefficients = [a, b, c, d, e]
    return np.polyval(coefficients, x)

#----------log----------
def log_1_curve(x, a, b):
    """Log model for curve fitting: log(a*x + b)."""
    inner_polynomial = np.polyval([a, b], x)
    return np.log(inner_polynomial)

def log_2_curve(x, a, b, c):
    """Log model for curve fitting: log(a*x**2 + b*x + c)."""
    inner_polynomial = np.polyval([a, b, c], x)
    return np.log(inner_polynomial)

def log_3_curve(x, a, b, c, d):
    """Log model for curve fitting: log(a*x**3 + b*x**2 + c*x + d)."""
    inner_polynomial = np.polyval([a, b, c, d], x)
    return np.log(inner_polynomial)

def log_4_curve(x, a, b, c, d, e):
    """Log model for curve fitting: log(a*x**4 + b*x**3 + c*x**2 + d*x + e)."""
    inner_polynomial = np.polyval([a, b, c, d, e], x)
    return np.log(inner_polynomial)

# Fitting Functions

In [110]:
from scipy.optimize import curve_fit


#----------Fitting (Standard)----------
def polynomial_fit(x, y, degree):
    """Least-squares polynomial fit of `y` over `x`.

    Returns the ``np.polyfit`` coefficients for a polynomial of the given
    `degree`, highest power first.
    """
    coefficients = np.polyfit(x, y, degree)
    return coefficients

# basically same function, but for better understanding here
def log_polynominal_fitting(x, y_exp, degree):
    """Least-squares polynomial fit of `y_exp` over `x`.

    Performs the same ``np.polyfit`` call as ``polynomial_fit``; it exists as a
    separately named entry point for the log-based fits.
    NOTE(review): presumably `y_exp` is the already log-transformed target -
    confirm against the caller.
    """
    coefficients = np.polyfit(x, y_exp, degree)
    return coefficients

# note: np.polyfit is the legacy API; np.polynomial.Polynomial.fit is the recommended replacement
#

#----------Curve Fitting----------
# for large datasets, you should pass a rough initial guess to speed up the fitting
# here we usually use the coefficients from the previous polynomial fits
def curve_fitting_coefficients(x_values, y_values, function, guess):
    """Fit `function` to the data with ``scipy.optimize.curve_fit``.

    Parameters
    ----------
    x_values, y_values : data to fit.
    function : model callable ``f(x, *params)``.
    guess : initial parameter values, forwarded as ``p0``.

    Returns
    -------
    The optimal parameters; the covariance estimate is discarded.
    """
    fit_result = curve_fit(function, x_values, y_values, p0=guess)
    return fit_result[0]

# Spline Functions

In [111]:
import scipy.interpolate as sci
# more complicated than before, but the old stuff was deprecated and mostly legacy code
# here we can also give us the spline functions and how they look
# I implemented the fitting and the interpolation

# for Spline prediction we need some help values

# returns t_knots at regular intervals between the start and end point with k+1 regularity
# if there are too few data points, it just takes them at a uniform distance.
# def get_t_Knots_regular(x_values, k_degree):
    
#     r = (2 * k_degree + 1) if (len(x_values) > 2 * (k_degree + 1)) else len(x_values)
    
#     return np.linspace(x_values[0], x_values[-1], r)



# import UnivariateSpline
# scipy.interpolate import make_smoothing_spline
# for smoothing
def get_s_smoothing_approximation_1(y_values):
    """Smoothing estimate m - sqrt(2*m), where m is the number of samples."""
    sample_count = len(y_values)
    return sample_count - np.sqrt(2 * sample_count)

def get_s_smoothing_approximation_2(y_values):
    """Smoothing estimate m * var(y), where m is the number of samples."""
    sample_count = len(y_values)
    spread = np.var(y_values)
    return sample_count * spread


#----------Spline with smoothing----------
# create spline object with smoothing built in (fewer knots)
# def spline_fitting_with_smoothing_1(x_values, y_values, degree, smooth):
#     return sci.UnivariateSpline(x_values, y_values, k=degree, s=smooth)

# sci.splrep()
# sci.make_lsq_spline()
    
#----------Fitting----------

# #----------Smooth Spline without k----------
# def smooth_spline_fitting(x_values, y_values):
#     return sci.make_smoothing_spline(x_values, y_values)

# Smoothing conditions explanation
https://en.wikipedia.org/wiki/Smoothing_spline

In [112]:
# def build_solution_overview_by_genom(list_of_Paths, list_of_names, threshold):
    

In [113]:
"""calculating and comparing mean square error"""
from sklearn.metrics import mean_squared_error

def calculates_fitting_and_coefficients_polyfit(comparing_dict, index, x_values, y_values, function, estimate_func, degree):
    """Run one polyfit-style fit and record it in slot `index` of `comparing_dict`.

    Parameters
    ----------
    comparing_dict : dict with the indexable entries 'method', 'coefficients',
        'function' and 'mse'; it is modified in place.
    index : slot to write to.
    x_values, y_values : data to fit.
    function : fitting callable ``f(x, y, degree)`` returning coefficients.
    estimate_func : callable ``g(x, coefficients)`` returning predictions.
    degree : polynomial degree; also appended to the stored method name.
    """
    method_label = function.__name__ + str(degree)
    comparing_dict['method'][index] = method_label

    fitted_coefficients = function(x_values, y_values, degree)
    comparing_dict['coefficients'][index] = fitted_coefficients
    comparing_dict['function'][index] = estimate_func

    # score the fit on the same points it was fitted on
    predictions = estimate_func(x_values, fitted_coefficients)
    comparing_dict['mse'][index] = mean_squared_error(y_values, predictions)

def calculates_fitting_and_coefficients_curve(comparing_dict, index, x_values, y_values, function, guess):
    """Run one curve fit and record it in slot `index` of `comparing_dict`.

    Parameters
    ----------
    comparing_dict : dict with the indexable entries 'method', 'coefficients',
        'function' and 'mse'; it is modified in place.
    index : slot to write to.
    x_values, y_values : data to fit.
    function : model callable ``f(x, *params)``; it is fitted and also stored
        as the estimation function.
    guess : initial parameters handed to ``curve_fitting_coefficients``.
    """
    comparing_dict['method'][index] = function.__name__

    fitted_parameters = curve_fitting_coefficients(x_values, y_values, function, guess)
    comparing_dict['coefficients'][index] = fitted_parameters
    comparing_dict['function'][index] = function

    # score the fit on the same points it was fitted on
    predictions = function(x_values, *fitted_parameters)
    comparing_dict['mse'][index] = mean_squared_error(y_values, predictions)

    
def calculates_fitting_and_coefficients_spline(comparing_dict, index, x_values, y_values, function):
    """Build one spline and record it in slot `index` of `comparing_dict`.

    Parameters
    ----------
    comparing_dict : dict with the indexable entries 'method', 'function' and
        'mse'; it is modified in place ('coefficients' is left untouched).
    index : slot to write to.
    x_values, y_values : data to fit.
    function : factory ``f(x, y)`` returning a callable that maps x to
        predicted y.
    """
    comparing_dict['method'][index] = function.__name__

    fitted_spline = function(x_values, y_values)
    comparing_dict['function'][index] = fitted_spline

    # score the spline on the same points it was built from
    predictions = fitted_spline(x_values)
    comparing_dict['mse'][index] = mean_squared_error(y_values, predictions)
    


# returns a dict with the min_mse, best method, coefficients of the method and function of the method
def comparing_fitting_methods(x_values, y_values, group_of_methods):
    """Fit every requested method to one data series and pick the lowest-MSE fit.

    Parameters
    ----------
    x_values, y_values : data of one node.
    group_of_methods : dict mapping a group name to a list of degrees.
        Supported groups:
          'polynominal'       - np.polyfit on (x, y)
          'log_polynominal'   - np.polyfit on (x, log(y))
          'polynominal_curve' - curve_fit of poly_<d>_curve on (x, y)
          'log_curve'         - curve_fit of log_<d>_curve on (x, log(y))
        Curve groups support the degrees 1..4. Any other key (including
        'spline', which is not implemented here) is ignored and reserves no
        result slot.

    Returns
    -------
    dict
        'best_solution': {'min_mse', 'method', 'coefficients', 'best_function'}
        for the entry with the smallest MSE, and 'all_solutions': the parallel
        containers {'mse', 'method', 'coefficients', 'function'} of every fit.
        Slots are filled from the last index backwards in the group order
        listed above.

    Raises
    ------
    ValueError
        If a curve group asks for a degree outside 1..4, or if no supported
        method is requested at all.

    Notes
    -----
    Log-polynomial fits are labelled 'log_polynominal_fitting<degree>' so they
    can be told apart from the plain 'polynomial_fit<degree>' entries.
    NOTE(review): the log groups are scored against log(y) while the plain
    groups are scored against y, so their MSE values are on different scales;
    in addition the 'log_polynominal' estimate applies np.log to a polynomial
    that was itself fitted to log(y). Confirm that both are intended.
    """
    poly_curve_functions = {1: poly_1_curve, 2: poly_2_curve, 3: poly_3_curve, 4: poly_4_curve}
    log_curve_functions = {1: log_1_curve, 2: log_2_curve, 3: log_3_curve, 4: log_4_curve}

    poly_degrees = list(group_of_methods.get('polynominal', ()))
    poly_log_degrees = list(group_of_methods.get('log_polynominal', ()))
    poly_curve_degrees = list(group_of_methods.get('polynominal_curve', ()))
    log_curve_degrees = list(group_of_methods.get('log_curve', ()))

    # Reject curve degrees without a model function before doing any work.
    for group_name, degrees, available in (
            ('polynominal_curve', poly_curve_degrees, poly_curve_functions),
            ('log_curve', log_curve_degrees, log_curve_functions)):
        unsupported = [degree for degree in degrees if degree not in available]
        if unsupported:
            raise ValueError(
                "unsupported degree(s) %r for group %r; supported: %r"
                % (unsupported, group_name, sorted(available)))

    total = (len(poly_degrees) + len(poly_log_degrees)
             + len(poly_curve_degrees) + len(log_curve_degrees))
    if total == 0:
        raise ValueError("group_of_methods does not request any supported fitting method")

    if poly_log_degrees or log_curve_degrees:
        y_log = np.log(y_values)

    # Every slot is written below; inf (instead of uninitialised memory) makes
    # sure a slot that was somehow skipped can never win the argmin.
    comparing_dict = {'mse': np.full(total, np.inf),
                      'method': [None] * total,
                      'coefficients': [None] * total,
                      'function': [None] * total}

    # Slots are handed out one per fit, from the back, instead of being derived
    # from the degree value - so arbitrary degree lists work.
    slot = total - 1

    # degree -> polyfit coefficients, reused as initial guess for the curve fits
    poly_guesses = {}
    log_guesses = {}

    for degree in poly_degrees:
        calculates_fitting_and_coefficients_polyfit(
            comparing_dict, slot, x_values, y_values, polynomial_fit, polynominal_estimation, degree)
        poly_guesses[degree] = comparing_dict['coefficients'][slot]
        slot -= 1

    for degree in poly_log_degrees:
        calculates_fitting_and_coefficients_polyfit(
            comparing_dict, slot, x_values, y_log, log_polynominal_fitting, log_polynominal_estimation, degree)
        log_guesses[degree] = comparing_dict['coefficients'][slot]
        slot -= 1

    for degree in poly_curve_degrees:
        guess = poly_guesses.get(degree)
        if guess is None:
            # the matching polyfit was not requested: compute the guess directly
            guess = polynomial_fit(x_values, y_values, degree)
        calculates_fitting_and_coefficients_curve(
            comparing_dict, slot, x_values, y_values, poly_curve_functions[degree], guess)
        slot -= 1

    for degree in log_curve_degrees:
        guess = log_guesses.get(degree)
        if guess is None:
            guess = log_polynominal_fitting(x_values, y_log, degree)
        calculates_fitting_and_coefficients_curve(
            comparing_dict, slot, x_values, y_log, log_curve_functions[degree], guess)
        slot -= 1

    min_mse_index = np.argmin(comparing_dict['mse'])
    return {'best_solution': {
        'min_mse': comparing_dict['mse'][min_mse_index],
        'method': comparing_dict['method'][min_mse_index],
        'coefficients': comparing_dict['coefficients'][min_mse_index],
        'best_function': comparing_dict['function'][min_mse_index]
        }, 'all_solutions': comparing_dict}

In [114]:
def build_solution_overview_by_node(list_of_Paths, list_of_names, threshold, group_of_methods,
                                    output_path='test.tsv'):
    """Fit all requested methods per node and write an overview table.

    Parameters
    ----------
    list_of_Paths : iterable of result files, read with ``read_dataset_all``.
    list_of_names : currently unused; kept so existing calls keep working.
    threshold : minimum number of rows a node needs (see ``clean_data``).
    group_of_methods : method selection passed to ``comparing_fitting_methods``.
    output_path : where the tab-separated overview is written
        (default 'test.tsv', the previously hard-coded name).

    Returns
    -------
    pandas.DataFrame
        One row per node with the columns 'best_solution' and 'all_solutions'.
        The frame is also printed and written to `output_path`.
    """
    df = read_dataset_all(list_of_Paths)
    df = clean_data(df, threshold)

    result_dict = {}
    for name in give_nodenames(df):
        x_sorted, y_sorted = give_x_and_y_per_node(df, name)
        result_dict[name] = comparing_fitting_methods(x_sorted, y_sorted, group_of_methods)

    # dict of per-node dicts -> nodes as columns; transpose to get one row per node
    result_df = pd.DataFrame(result_dict).T
    result_df.to_csv(output_path, sep='\t')
    print(result_df)

    return result_df
    

# Necessary Constants

In [115]:
from pathlib import Path

# Directory that holds the per-dataset result tables.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
DATA_DIR = Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda")

# One result table per dataset, in the same order as name_list.
path_finals = [DATA_DIR / file_name for file_name in (
    "general_results_viral500m.tsv",
    "general_results_archea1.4g.tsv",
    "general_results_bacteria30g.tsv",
    "general_results_bacteria58g.tsv",
    "general_results_bacteria88g.tsv",
    "general_results_bacteria125g.tsv",
)]

# Dataset labels; passed to build_solution_overview_by_node, which does not use them yet.
name_list = ["viral500m", "archea1.4g", "bacteria30g", "bacteria58", "bacteria88", "bacteria125"]

group_of_methods = {
        # define here what you want
        'polynominal' : [1,2,3,4],
        'log_polynominal' : [1,2,3,4],
        'polynominal_curve' : [1,2,3,4],
        'log_curve': [1,2,3,4]
        # 'spline' : [1,2,3]
    }

# minimum number of rows a node needs in order to be fitted
threshold=10

build_solution_overview_by_node(path_finals, name_list, threshold, group_of_methods)

  return np.log(np.polyval([a,b], x))
  return np.log(np.polyval([a,b], x))
  return np.log(np.polyval([a,b,c], x))
  return np.log(np.polyval([a,b,c], x))


                                           best_solution  \
201.0  {'min_mse': 0.04031752036593989, 'method': 'lo...   
202.0  {'min_mse': 0.0008636330198986344, 'method': '...   
206.0  {'min_mse': 4.7016109951172305e-29, 'method': ...   
208.0  {'min_mse': 0.054421588983530444, 'method': 'l...   
213.0  {'min_mse': 0.21640400324743372, 'method': 'lo...   
215.0  {'min_mse': 0.5860881926468772, 'method': 'log...   
216.0  {'min_mse': 0.14000714545318485, 'method': 'lo...   
217.0  {'min_mse': 0.2986202716655273, 'method': 'log...   
218.0  {'min_mse': 0.12332328256740681, 'method': 'lo...   
219.0  {'min_mse': 0.25496007139113885, 'method': 'lo...   
220.0  {'min_mse': 0.22703592842375206, 'method': 'lo...   
221.0  {'min_mse': 0.2900945995815267, 'method': 'log...   
222.0  {'min_mse': 0.25435081802295956, 'method': 'lo...   
223.0  {'min_mse': 0.4299461436329097, 'method': 'log...   
224.0  {'min_mse': 0.32993557447306354, 'method': 'lo...   
225.0  {'min_mse': 0.3188396461397473, '

  return np.log(np.polyval([a,b], x))
  return np.log(np.polyval([a,b,c], x))
  return np.log(np.polyval([a,b,c,d], x))
