In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error


# Data Functions

In [None]:
# read dataset from a path
def read_dataset_single(path):
    """Load one tab-separated result file and keep only the modelling columns.

    Parameters
    ----------
    path : str, path-like or file-like
        Source of the TSV data; handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The columns 'Node Name', 'Total-time' and 'Data Size', in that order.

    Raises
    ------
    KeyError
        If any of the three columns is missing from the file.
    """
    wanted_columns = ['Node Name', 'Total-time', 'Data Size']
    full_table = pd.read_csv(path, sep='\t')
    return full_table[wanted_columns]

# read dataset from a list of paths
def read_dataset_all(name_list):
    """Read several result files and stack them into one DataFrame.

    Parameters
    ----------
    name_list : iterable of str or path-like
        Paths of the TSV files to load (each is passed to
        ``read_dataset_single``).

    Returns
    -------
    pandas.DataFrame
        All rows of all files with the columns 'Node Name', 'Total-time' and
        'Data Size'. The index is renumbered 0..n-1 so that rows coming from
        different files do not share index labels. An empty ``name_list``
        yields an empty frame that still carries the three columns.
    """
    columns = ['Node Name', 'Total-time', 'Data Size']

    # Collect first and concatenate once: concatenating inside the loop copies
    # the accumulated frame on every iteration.
    frames = [read_dataset_single(path) for path in name_list]

    if not frames:
        # pd.concat raises on an empty list, so return the empty schema directly.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)


# build a cleaned dataset with a minimum number of nodes, determined by the threshold
def clean_data(df, threshold):
    """Keep only the rows of nodes that occur at least ``threshold`` times.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Node Name' column.
    threshold : int
        Minimum number of rows a node name needs in ``df`` to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'Node Name' appears ``threshold`` or more
        times; row order and index are preserved.
    """
    nodename_counts = df['Node Name'].value_counts()
    valid_nodenames = nodename_counts[nodename_counts >= threshold].index

    # The filter has to use the same 'Node Name' column the counts came from.
    return df[df['Node Name'].isin(valid_nodenames)]


# returns array with all the unique nodenames
def give_nodenames(df):
    """Return the distinct values of the 'Node Name' column.

    The result is whatever ``np.unique`` produces for that column: a sorted
    array with every name listed once.
    """
    node_column = df['Node Name']
    return np.unique(node_column.values)


# returns just the x and y values for a Node as np.arrays
# they are returned as sorted (by x) and unique
# sort values (for easier fitting and interpolation)
def give_x_and_y_per_node(df_general_result, name):
    """Extract the (x, y) samples of one node, de-duplicated and sorted by x.

    Parameters
    ----------
    df_general_result : pandas.DataFrame
        Frame with the columns 'Node Name', 'Data Size' and 'Total-time'.
    name : object
        Value of 'Node Name' to select.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` where ``x`` holds the distinct 'Data Size' values in
        ascending order and ``y`` the 'Total-time' of the first row in which
        each of those sizes occurred.
    """
    node_mask = df_general_result['Node Name'] == name
    node_rows = df_general_result[node_mask]

    sizes = node_rows['Data Size'].values
    times = node_rows['Total-time'].values

    # return_index yields, per distinct size, the position of its first occurrence
    distinct_sizes, first_seen = np.unique(sizes, return_index=True)
    times_for_sizes = times[first_seen]

    order = np.argsort(distinct_sizes)
    return distinct_sizes[order], times_for_sizes[order]



# Plotting Function

In [None]:
import matplotlib.pyplot as plt

def plot_data_and_fitted_functions(x_values, y_values, function_to_plot, titel_name):
    """Show the data points together with a fitted function in one figure.

    Parameters
    ----------
    x_values, y_values : sequence of numbers
        Coordinates of the measured points, drawn as blue circles.
    function_to_plot : callable
        Called once with an array of 100 evenly spaced x positions between
        ``min(x_values)`` and ``max(x_values)``; its result is drawn as a red
        line.
    titel_name : str
        Title of the figure.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    figure, axes = plt.subplots()

    # measured points
    axes.scatter(x_values, y_values, label="Data Points", color='b', marker='o')

    # evaluate the function on an even grid spanning the data's x-range
    lower, upper = min(x_values), max(x_values)
    grid = np.linspace(lower, upper, 100)
    estimates = function_to_plot(grid)
    axes.plot(grid, estimates, label="function for estimation", color='r')

    axes.set_xlabel("X Values")
    axes.set_ylabel("Y Values")
    axes.set_title(titel_name)
    axes.legend()

    plt.show()

# Necessary Functions for interpolation and curve fitting (poly1..4 and log1..4)

In [None]:
#for estimation
def polynominal_estimation(x, coefficients):
    """Evaluate a polynomial at ``x``.

    ``coefficients`` are ordered from the highest power down to the constant
    term, as expected by ``np.polyval``. ``x`` may be a scalar or an array.
    """
    estimate = np.polyval(coefficients, x)
    return estimate

def log_estimation(x, coefficients):
    """Evaluate ``log(p(x))`` where ``p`` is the polynomial given by ``coefficients``.

    ``coefficients`` are ordered from the highest power down to the constant
    term. The natural logarithm is taken of the polynomial's value.
    """
    polynomial_value = np.polyval(coefficients, x)
    return np.log(polynomial_value)


# for curve fitting
#--------polynominal----------
def poly_1_curve(x, a, b):
    """Degree-1 polynomial ``a*x + b`` with one argument per coefficient."""
    coefficients = [a, b]
    return np.polyval(coefficients, x)

def poly_2_curve(x, a, b, c):
    """Degree-2 polynomial ``a*x**2 + b*x + c`` with one argument per coefficient."""
    coefficients = [a, b, c]
    return np.polyval(coefficients, x)

def poly_3_curve(x, a, b, c, d):
    """Degree-3 polynomial ``a*x**3 + b*x**2 + c*x + d`` with one argument per coefficient."""
    coefficients = [a, b, c, d]
    return np.polyval(coefficients, x)

def poly_4_curve(x, a, b, c, d, e):
    """Degree-4 polynomial ``a*x**4 + ... + e`` with one argument per coefficient."""
    coefficients = [a, b, c, d, e]
    return np.polyval(coefficients, x)

#----------log----------
def log_1_curve(x, a, b):
    """Natural log of the degree-1 polynomial ``a*x + b``."""
    inner = np.polyval([a, b], x)
    return np.log(inner)

def log_2_curve(x, a, b, c):
    """Natural log of the degree-2 polynomial ``a*x**2 + b*x + c``."""
    inner = np.polyval([a, b, c], x)
    return np.log(inner)

def log_3_curve(x, a, b, c, d):
    """Natural log of the degree-3 polynomial ``a*x**3 + b*x**2 + c*x + d``."""
    inner = np.polyval([a, b, c, d], x)
    return np.log(inner)

def log_4_curve(x, a, b, c, d, e):
    """Natural log of the degree-4 polynomial ``a*x**4 + ... + e``."""
    inner = np.polyval([a, b, c, d, e], x)
    return np.log(inner)

# Fitting Functions

In [None]:
from scipy.optimize import curve_fit


#----------Fitting (Standard)----------
def polynomial_fit(x, y, degree):
    """Least-squares fit of a polynomial of the given degree to ``(x, y)``.

    Returns the coefficients from ``np.polyfit``, highest power first.
    """
    coefficients = np.polyfit(x, y, deg=degree)
    return coefficients

# basically same function, but for better understanding here
def log_polynominal_fitting(x, y_exp, degree):
    """Least-squares polynomial fit of ``y_exp`` against ``x``.

    Computationally identical to a plain ``np.polyfit`` call; kept as its own
    function for readability at the call site.
    NOTE(review): ``y_exp`` is presumably the exponentiated target, so that the
    fitted polynomial can be used inside a log model -- confirm with callers.

    Returns the coefficients, highest power first.
    """
    coefficients = np.polyfit(x, y_exp, deg=degree)
    return coefficients


#----------Curve Fitting----------
# for large datasets, you should pass a rough initial guess to speed up the fitting
# here we usually use the coefficients from the previous polynomial fits as that guess
def curve_fitting_coefficients(x_values, y_values, function, guess):
    """Fit ``function`` to the data with ``scipy.optimize.curve_fit``.

    Parameters
    ----------
    x_values, y_values : array-like
        Data to fit.
    function : callable
        Model ``f(x, *params)``.
    guess : sequence of float
        Starting values for the parameters (passed as ``p0``).

    Returns
    -------
    numpy.ndarray
        The optimised parameters; the covariance estimate is discarded.
    """
    fit_result = curve_fit(function, x_values, y_values, p0=guess)
    # curve_fit returns (parameters, covariance); only the parameters are needed.
    return fit_result[0]

# Spline Functions

In [None]:
import scipy.interpolate as sci 
# more involved than the fitting above, but the older approach is deprecated and mostly legacy code
# the functions below return spline objects, so the splines themselves can be evaluated and plotted
# both the (smoothed) fitting and the interpolation are implemented


#----------Interpolation----------
# import UnivariateSpline
# from scipy.interpolate import InterpolatedUnivariateSpline
# scipy.interpolate import make_smoothing_spline
# for smoothing
def get_smoothing_approximisation(values):
    """Return a rule-of-thumb smoothing factor ``s`` for a smoothing spline.

    The original definition had no body, which made the whole cell fail with
    an IndentationError. This implementation follows the guidance quoted in
    the notebook's "Explanation for Smoothing": with ``m`` data points a good
    ``s`` lies in ``(m - sqrt(2*m), m + sqrt(2*m))`` and ``s = m`` is a
    reasonable choice.
    NOTE(review): the intended behaviour is an assumption -- confirm.

    Parameters
    ----------
    values : sized collection
        The data points that will be smoothed.

    Returns
    -------
    int
        ``len(values)``, the centre of the recommended range.
    """
    return len(values)



#----------Spline interpolation (Standard)----------
# creates a spline function object of degree k that passes through all data points
def spline_interpolation(x_values, y_values, degree):
    """Build an interpolating spline of the given degree through the data.

    Thin wrapper around ``scipy.interpolate.make_interp_spline``; the returned
    object is callable and evaluates the spline at the positions passed to it.
    """
    interpolating_spline = sci.make_interp_spline(x_values, y_values, k=degree)
    return interpolating_spline

#----------Spline with smoothing----------
# create a spline object with smoothing built in (fewer knots)
def spline_fitting_with_smoothing(x_values, y_values, degree):
    """Fit a smoothing spline of the given degree to the data.

    Parameters
    ----------
    x_values : array-like
        Sample positions; ``UnivariateSpline`` requires them to be increasing.
    y_values : array-like
        Sample values, same length as ``x_values``.
    degree : int
        Degree ``k`` of the spline (SciPy accepts 1 to 5).

    Returns
    -------
    scipy.interpolate.UnivariateSpline
        A fitted, callable spline. The smoothing factor is left at SciPy's
        default; no explicit ``s`` is passed.
    """
    # Return a fitted instance -- returning the bare class would ignore all
    # three arguments.
    return sci.UnivariateSpline(x_values, y_values, k=degree)


    

#----------Fitting----------

#----------Smooth Spline without k----------
def smooth_spline_fitting(x_values, y_values):
    """Fit a smoothing spline without choosing a degree.

    Delegates to ``scipy.interpolate.make_smoothing_spline`` with its default
    settings and returns the resulting callable spline object.
    """
    smoothing_spline = sci.make_smoothing_spline(x_values, y_values)
    return smoothing_spline

# Explanation for Smoothing
If the weights represent the inverse of the standard-deviation of y, then a good s value should be found in the range (m-sqrt(2*m),m+sqrt(2*m)) where m is the number of datapoints in x, y, and w. This means s = len(w) should be a good value if 1/w[i] is an estimate of the standard deviation of y[i].

from https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.UnivariateSpline.html

# Necessary Constants

In [None]:
from pathlib import Path

# Directory that holds every general_results_*.tsv file. Change this single
# line to run the notebook on another machine.
RESULTS_DIR = Path("/Users/manuez42/Desktop/fonda/A2-job-granularity/MG-HIBF/general_results_fonda")

# One result file per dataset, smallest to largest.
path_finals = [
    RESULTS_DIR / f"general_results_{dataset}.tsv"
    for dataset in (
        "viral500m",
        "archea1.4g",
        "bacteria30g",
        "bacteria58g",
        "bacteria88g",
        "bacteria125g",
    )
]

# Short labels for the datasets, listed in the same order as path_finals.
# NOTE(review): the last three labels omit the trailing "g" that the file
# names carry -- left unchanged here; confirm whether that is intentional.
name_list = ["viral500m", "archea1.4g", "bacteria30g", "bacteria58", "bacteria88", "bacteria125"]