# Modeling pipeline using concatenation of multiple Time-Series

In this notebook, we build a modeling pipeline on the concatenation of several Time-Series generated by the simulator.

## Modules import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import yaml
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import sys
sys.path.append("..")
from uvsw_part import simulation
import copy
from math import e

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

import plotly.graph_objects as go

## Paths to the List files

In [2]:
# Folder holding the List files; `generation` reads one row of a List file
# per Time-Series.
_PARAMS_DIR = "../data/params"
LIST_PATH_1 = "{}/List1.txt".format(_PARAMS_DIR)
LIST_PATH_2 = "{}/List2.txt".format(_PARAMS_DIR)

## Generation of a Time-Series with the simulator with a set of parameters 

Here, we define the function that runs the simulator with the chosen parameters and compares the result with the reference model.
The visualizations are made with Matplotlib.

In [3]:
def generation(h,tension,u,clo,eps,LIST_PATH,TS_INDEX):
    """
    Run the simulator for one Time-Series and compare it with the reference model.

    Row ``TS_INDEX`` of the List file provides the basic parameters of the
    Time-Series (final time ``tf[s]`` and reference graph number ``nc``). The
    tested parameters are written into the simulator configuration, the
    simulation is run, both signals are plotted with Matplotlib and the
    R2 / MSE scores between reference and simulation are printed.

    Parameters
    ----------
    h         : float  : Simulator parameter, written to cfg["cable"]["h"]
    tension   : float  : Simulator parameter, written to cfg["cable"]["tension"]
    u         : float  : Simulator parameter, written to cfg["wakeosc"]["u"]
    clo       : float  : Simulator parameter, written to cfg["wakeosc"]["cl0"]
    eps       : float  : Simulator parameter, written to cfg["wakeosc"]["eps"]
    LIST_PATH : string : Path to the List file; must end with "List1.txt" or "List2.txt"
    TS_INDEX  : int    : Row of the List file to use (starts at 0)

    Returns
    -------
    ref : DataFrame read from the reference CSV file
    dfy : First value returned by simulation.run_cable_wakeosc (the simulation)

    Raises
    ------
    ValueError : if LIST_PATH does not end with a known List file name
    """
    # Folder of the reference CSV files and printed label for each known List file.
    known_lists = {
        "List1.txt": ("../data/ref/list1", "Liste_1"),
        "List2.txt": ("../data/ref/list2", "Liste_2"),
    }

    # Keep only the last portion of the path (the List file name).
    list_number = LIST_PATH[-9:]
    if list_number not in known_lists:
        # Fail before any I/O instead of printing "error" and crashing later
        # on an undefined reference DataFrame.
        raise ValueError(
            "Unknown List file {!r}: expected a path ending with one of {}".format(
                LIST_PATH, sorted(known_lists)))
    ref_dir, list_label = known_lists[list_number]

    # Load the List file; sep=r"\s+" is the non-deprecated equivalent of
    # delim_whitespace=True.
    data_list = pd.read_csv(LIST_PATH, sep=r"\s+")
    # Basic parameters of the requested Time-Series (index starts at 0).
    set_params = data_list.iloc[TS_INDEX,:]

    # Load the reference Time-Series matching the List file used.
    ref = pd.read_csv("{}/graph{}.csv".format(ref_dir, set_params["nc"]))
    print(list_label)

    # The context manager closes the configuration file once it is parsed.
    with open('../data/config/example.in.yaml', 'r') as config_file:
        cfg = yaml.safe_load(config_file)

    # Parameters to test.
    cfg["cable"]["h"] = float(h)
    cfg["simulation"]["tf"] = float(set_params["tf[s]"])
    cfg["cable"]["tension"] = float(tension)
    cfg["wakeosc"]["u"] = float(u)
    cfg["wakeosc"]["cl0"] = float(clo)
    cfg["wakeosc"]["eps"] = float(eps)

    # Choose the time steps so that the simulation covers tf with as many
    # steps as the reference model has points.
    step = cfg["simulation"]["tf"] / len(ref)
    cfg["simulation"]["dt"] = step
    cfg["simulation"]["dr"] = step

    print("h value: ", cfg["cable"]["h"], " u value: ", cfg["wakeosc"]["u"]," tension value: ",cfg["cable"]["tension"],
         "clo value ",cfg["wakeosc"]["cl0"])
    print("tf value ", cfg["simulation"]["tf"])
    dfy, _ = simulation.run_cable_wakeosc(cfg)

    # Simulation column compared with the reference 'y/d' signal.
    # NOTE(review): 0.025 is presumably the cable diameter used to
    # normalise the displacement - confirm.
    sim_signal = dfy['s=0.250']/0.025

    plt.figure(figsize = (20,5))
    plt.plot(ref['time'], ref['y/d'], label = "Reference model signal")
    plt.plot(dfy.index, sim_signal, label = "Simulation signal")
    plt.xlabel('time (s)')
    plt.ylabel('Signal')

    plt.legend()
    plt.show()

    # The last simulation sample is dropped before scoring.
    # NOTE(review): assumes the simulation returns len(ref) + 1 samples - confirm.
    sim_scored = sim_signal.values[:-1]
    print("R2 : ",r2_score(ref['y/d'],sim_scored))
    print("MSE : ",mean_squared_error(ref['y/d'],sim_scored))

    return ref,dfy


## Calculation of the simulations to be concatenated 

In [None]:
# One simulator run per Time-Series to concatenate; each call returns the
# reference DataFrame and the simulation result.
ref_TS_1_List_2, sim_TS_1_List_2 = generation(
    h=100, tension=32100, u=2.15, clo=0.6, eps=0.3,
    LIST_PATH=LIST_PATH_2, TS_INDEX=0,
)
ref_TS_2_List_2, sim_TS_2_List_2 = generation(
    h=100, tension=32200, u=2.15, clo=3.0, eps=0.3,
    LIST_PATH=LIST_PATH_2, TS_INDEX=1,
)
# NOTE(review): for List 1 the variable named TS_1 uses TS_INDEX=5 and the one
# named TS_6 uses TS_INDEX=0, the opposite of the List 2 naming - confirm
# whether the names or the indices are swapped.
ref_TS_1_List_1, sim_TS_1_List_1 = generation(
    h=309.49288554663354, tension=11605.8732786244, u=0.9535838147886668,
    clo=0.6, eps=0.3,
    LIST_PATH=LIST_PATH_1, TS_INDEX=5,
)
ref_TS_6_List_1, sim_TS_6_List_1 = generation(
    h=385.75324043420676, tension=20922.45861417, u=0.9198665553364257,
    clo=0.01, eps=0.3,
    LIST_PATH=LIST_PATH_1, TS_INDEX=0,
)

## Loading of all simulations and references for modeling with concatenation 

In [None]:
# References and simulations are kept in the same order so that entry i of
# one list corresponds to entry i of the other.
list_reference = [
    ref_TS_1_List_2,
    ref_TS_2_List_2,
    ref_TS_1_List_1,
    ref_TS_6_List_1,
]

list_simulation = [
    sim_TS_1_List_2,
    sim_TS_2_List_2,
    sim_TS_1_List_1,
    sim_TS_6_List_1,
]

## Data processing

In [None]:
def preprocessing(list_dfy,list_ref,test_size = 0.2,random_state = 143):
    """
    Build the train / test sets from the positive part of each signal.

    For every (simulation, reference) pair, the values >= 0 of the simulation
    and the values >= 0 of the reference are extracted, the longer vector is
    truncated to the length of the shorter one, and the results of all pairs
    are concatenated before the train / test split.

    Arguments : list_dfy     : List of all simulations to concatenate
                list_ref     : List of all references to concatenate
                               (same length and order as list_dfy)
                test_size    : Share of the data kept for the test set (default 0.2)
                random_state : Seed of the shuffled split (default 143)

    Returns : X_train (simulation values, shape (n, 1))
              X_test  (simulation values, shape (m, 1))
              y_train (reference values)
              y_test  (reference values)

    Raises : ValueError if list_dfy and list_ref do not have the same length
    """
    if len(list_dfy) != len(list_ref):
        raise ValueError(
            "list_dfy and list_ref must have the same length, got {} and {}".format(
                len(list_dfy), len(list_ref)))

    list_sim_pos = []
    list_ref_pos = []
    for dfy, ref in zip(list_dfy, list_ref): # For each simulation
        # The first simulation sample is skipped and the signal is scaled the
        # same way as in the comparison with the reference model.
        sim_values = np.asarray(dfy['s=0.250'].iloc[1:]/0.025, dtype = float)
        ref_values = np.asarray(ref.set_index("time").sort_index()['y/d'], dtype = float)

        # Keep the values above or equal to 0 (NaN compares False and is dropped).
        # NOTE(review): the positive samples of the two signals are selected
        # independently and then paired by position, so a pair is not
        # necessarily made of simultaneous samples - confirm this is intended.
        sim_pos = sim_values[sim_values >= 0]
        ref_pos = ref_values[ref_values >= 0]

        # Truncate the longest vector to the number of points of the shortest one.
        n_common = min(len(sim_pos), len(ref_pos))
        list_sim_pos.append(sim_pos[:n_common])
        list_ref_pos.append(ref_pos[:n_common])

    # Concatenation of all simulations on one side and references on the other
    sim_pos_final = np.concatenate(list_sim_pos)
    ref_pos_final = np.concatenate(list_ref_pos)

    # Creation of the train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        sim_pos_final, ref_pos_final,
        test_size = test_size, random_state = random_state, shuffle = True)

    # scikit-learn estimators expect 2-D feature arrays: one column, one row per sample.
    X_train = X_train.reshape(-1, 1)
    X_test = X_test.reshape(-1, 1)

    return X_train,X_test,y_train,y_test

## Model training

In [None]:
def train(X_train, X_test, y_train, y_test, random_state = 143):
    """
    Train a RandomForest model and compute its prediction on the test set.

    Arguments : X_train      : Training features
                X_test       : Test features the prediction is computed on
                y_train      : Training targets
                y_test       : Test targets (not used here, kept so that the
                               output of the split can be passed unchanged)
                random_state : Seed of the Random Forest, so that re-running
                               the notebook gives the same prediction
                               (pass None for a non-deterministic forest)

    Returns : y_pred (Random Forest model prediction on X_test)
    """
    reg = RandomForestRegressor(random_state = random_state)
    reg.fit(X_train,y_train)
    y_pred = reg.predict(X_test)

    return y_pred


## Model evaluation

In [None]:
def evaluate_model(y_pred,y_test):
    """
    Evaluation of the model prediction

    Arguments : y_pred : Model prediction
                y_test : Reference values the prediction is compared with

    Returns : mse  : mean squared error
              rmse : root mean squared error
              mea  : mean absolute error
              r2   : r2 score
    """
    mse = mean_squared_error(y_test,y_pred)
    # The `squared=False` argument of mean_squared_error is deprecated and
    # removed in recent scikit-learn releases. The square root of the MSE is
    # the same value for the single-output targets used here.
    rmse = np.sqrt(mse)
    mea = mean_absolute_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)

    return mse,rmse,mea,r2

## Model visualization with matplotlib

In [None]:

def plt_model(y_pred,y_test):
    """
    Plot the reference and the prediction on one Matplotlib figure, with the
    evaluation metrics written below it.

    y_pred  :  numpy.ndarray  : Model prediction on the simulator test data
    y_test  :  numpy.ndarray  : Output of the reference model that the prediction must approach
    """
    plt.figure(figsize = (20,8))
    curves = (
        (y_test, "Reference model signal"),
        (y_pred, "Prediction signal"),
    )
    for signal, legend_label in curves:
        plt.plot(signal, label = legend_label)

    # One text box per metric, stacked downwards from the bottom of the figure.
    metric_names = ("MSE", "RMSE", "MEA", "R2_SCORE")
    metric_values = evaluate_model(y_pred,y_test)
    text_heights = (0.00, -0.05, -0.10, -0.15)
    for metric_name, metric_value, height in zip(metric_names, metric_values, text_heights):
        plt.figtext(
            0.5, height, "%s = %s " % (metric_name, metric_value),
            ha="center", fontsize=18,
            bbox={"facecolor":"orange", "alpha":0.5, "pad":5},
        )

    plt.title(
        " Comparaison entre le modèle de réference et la prédiction du modèle d'apprentissage Random Forest",
        fontsize = 18,
    )
    plt.xlabel("Timesteps")
    plt.ylabel("Signal")
    plt.legend()
    plt.show()


## Model visualization with Plotly

In [None]:
def plotly_model(y_pred,y_test):
    """
    Plot the reference and the prediction on one interactive Plotly figure.

    y_pred  :  numpy.ndarray  : Model prediction on simulator test data
    y_test  :  numpy.ndarray  : Output of the reference model that the prediction must approach
    """
    fig = go.Figure()

    # Same drawing order as the Matplotlib version: reference first, then prediction.
    traces = (
        ("Reference model signal", y_test),
        ("Prediction signal", y_pred),
    )
    for trace_name, signal in traces:
        fig.add_trace(go.Scatter(y = signal, name = trace_name))

    layout_options = {
        "title": " Comparison between the reference model and the prediction of the Random Forest learning model",
        "xaxis_title": "Timesteps",
        "yaxis_title": "Signal",
        "legend_title": "Signaux",
    }
    fig.update_layout(**layout_options)

    fig.show()
    # Optional HTML export:
    #fig.write_html("../outputs/visualisation/modelisation_concatenee/Concat_4_TS.html")

## Concatenated modeling pipeline

In [None]:

# Split the concatenated signals, fit the model, then draw both visualizations.
X_train, X_test, y_train, y_test = preprocessing(list_simulation, list_reference)
y_pred = train(X_train, X_test, y_train, y_test)

for render in (plt_model, plotly_model):
    render(y_pred, y_test)
