In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, scale

# Function involved in pipeline
def time_series(values1, time1, values2, time2, scaling=True, tscaling=True):
    '''
    Builds two DataFrames from simulation output and optionally rescales them.

    Each dataset becomes a DataFrame holding its value columns plus a 'Time'
    column. With ``scaling`` the value columns are standardized using
    statistics pooled over both datasets, so the two frames stay directly
    comparable. With ``tscaling`` the 'Time' columns are min-max normalized
    with a single scaler shared by both frames.

    :param values1: Dictionary of orbital values for the first dataset
    :param time1: An array of time for the first dataset
    :param values2: Dictionary of orbital values for the second dataset
    :param time2: An array of time for the second dataset
    :param scaling: Boolean value of orbital value standardization scaling
    :param tscaling: Boolean value of time array normalization scaling
    :return: Tuple of two DataFrames (df1, df2) with scaled orbital values and time
    '''

    # Build fresh frames so the caller's inputs are never modified.
    df1 = pd.DataFrame(values1)
    df2 = pd.DataFrame(values2)

    # Add the time values to the DataFrames.
    df1['Time'] = time1
    df2['Time'] = time2

    if scaling:
        # Stack both frames so every column is standardized around one common
        # mean and variance.
        n_first = len(df1)
        combined = pd.concat([df1, df2], ignore_index=True)

        for column in combined.columns:
            # Time is never standardized; it is only normalized further down.
            if column == 'Time':
                continue

            # Standardization (zero mean, unit variance) over the pooled data.
            scaled = np.asarray(scale(combined[column], axis=0))

            # Write the halves back by position. Going through
            # DataFrame.update() instead would align on index labels and
            # assign into the existing dtype, which silently skips rows when
            # a frame has a non-default index and fails to upcast cleanly for
            # integer columns.
            if column in df1.columns:
                df1[column] = scaled[:n_first]
            if column in df2.columns:
                df2[column] = scaled[n_first:]

    if tscaling:
        # The scaler is fitted on the frame that reaches the larger time value
        # and then re-used for the other frame.
        # NOTE(review): only one frame is used for fitting, so times in the
        # other frame that lie below the fitted minimum are mapped outside
        # [0, 1] - confirm that both datasets always start at the same time.
        if df1['Time'].max() >= df2['Time'].max():
            fit_df, other_df = df1, df2
        else:
            fit_df, other_df = df2, df1

        scaler = MinMaxScaler()
        fit_df['Time'] = scaler.fit_transform(fit_df[['Time']])
        other_df['Time'] = scaler.transform(other_df[['Time']])

    return df1, df2  # Return both DataFrames

# Density of time points may change in a simulation - could be tricky to 
# make the time-series truly generic with adaptive timesteps in simulations.
def interpolate_data(ref_df, ai_df):
    '''
    Resamples the dataset with fewer rows onto the time grid of the other one.

    Both inputs are sorted by 'Time' first; neither input frame is modified.
    If the AI data has at most as many rows as the reference data, every
    reference time is matched with the first AI row at or after it and any
    remaining gaps are filled by linear interpolation. Otherwise every AI time
    is matched with the nearest reference row and remaining gaps are filled by
    a polynomial interpolation of order 5.

    :param ref_df: Dataframe with reference solution values and a 'Time' column.
    :param ai_df: Dataframe with AI data values and a 'Time' column.
    :return: Tuple (reference DataFrame, AI DataFrame), both sorted by 'Time'
             with a fresh integer index and the same number of rows.
    '''

    # Sort copies instead of re-indexing in place: the caller's frames must
    # keep their 'Time' column, otherwise a second call on them would fail.
    ref_sorted = ref_df.sort_values(by='Time').reset_index(drop=True)
    ai_sorted = ai_df.sort_values(by='Time').reset_index(drop=True)

    ref_cols = [col for col in ref_sorted.columns if col != 'Time']
    ai_cols = [col for col in ai_sorted.columns if col != 'Time']

    if len(ai_sorted) <= len(ref_sorted):
        # Only the reference time grid is needed on the left side. Keeping the
        # reference value columns out of the merge means the AI columns keep
        # their own names, so they cannot be mislabelled afterwards.
        merged = pd.merge_asof(ref_sorted[['Time']], ai_sorted,
                               on='Time', direction='forward')

        # Reference times after the last AI time have no forward match and
        # come back as NaN; fill those gaps.
        for col in ai_cols:
            merged[col] = merged[col].interpolate()

        ai_interpolated = merged[ai_cols + ['Time']].copy()
        return ref_sorted[['Time'] + ref_cols], ai_interpolated

    # The AI data is denser: bring the reference values onto the AI time grid
    # using the nearest reference sample for every AI time.
    merged = pd.merge_asof(ai_sorted[['Time']], ref_sorted,
                           on='Time', direction='nearest')

    # NOTE(review): the polynomial method works on the row index of the merged
    # frame, not on the 'Time' values - confirm this is intended.
    for col in ref_cols:
        merged[col] = merged[col].interpolate(method='polynomial', order=5)

    ref_interpolated = merged[['Time'] + ref_cols].copy()
    return ref_interpolated, ai_sorted[['Time'] + ai_cols]
    

def compute_score(ref_df, ai_df, tot_rmse=False):
    '''
    Calculates the Root Mean Squared Error (RMSE) between two DataFrames,
    excluding the 'Time' column.

    The frames are subtracted column by column for every non-'Time' column of
    the reference frame; pandas aligns the rows on the index.

    :param ref_df: Reference dataframe from solution of problem.
    :param ai_df: Dataframe from AI's solution of problem.
    :param tot_rmse: Boolean to calculate total RMSE across all columns
    :return: a single floating-point number (total RMSE) or a Series of RMSE values for each column.
    '''

    # Select only the columns with orbital data (exclude 'Time').
    data_columns = [col for col in ref_df.columns if col != 'Time']

    # Mean squared error of every selected column.
    squared_errors = (ref_df[data_columns] - ai_df[data_columns]) ** 2
    mse_per_column = squared_errors.mean()

    if tot_rmse:
        # The total RMSE is the root of the mean of the per-column MSEs (the
        # quadratic mean of the per-column RMSEs). Taking the root of the
        # mean of the RMSEs would apply the square root twice.
        return np.sqrt(mse_per_column.mean())

    # RMSE of each column as a Series.
    return np.sqrt(mse_per_column)