# Data sorting algorithm

### 1) Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### 2) Interpolator function:

The dataset consists of uneven time increments and thus the important columns such as S8,S4,S2,S1,V,Sp require interpolated values for a constant time increment.

In [2]:
def interpolator(dataset, time_increment):
    # Ensure dataset is a NumPy array and storing current and cycle columns
    dataset = np.array(dataset)
    current_value = dataset[0, 7]
    cycle_number_value = dataset[0, 8]

    # Calculate the minimum and maximum time values in the array
    min_time = dataset[:, 6].min()  
    max_time = dataset[:, 6].max()

    # Create a new array with the desired time increments
    new_time_values = np.arange(min_time, max_time, time_increment)

    # Forming an array for interpolated values
    interpolated_values = np.zeros((len(new_time_values), 6))

    # Interpolate columns
    for col_index in range(6):  # Adjust the range if the number of columns to interpolate differs
        interpolated_values[:, col_index] = np.interp(new_time_values, dataset[:, 6], dataset[:, col_index])

    # Combine interpolated values with constant values and new time values
    interpolated_data = np.column_stack((interpolated_values, new_time_values, np.full(len(new_time_values), current_value),
                                         np.full(len(new_time_values), cycle_number_value)))

    # Convert the result to a pandas DataFrame
    column_names = ['S8_cur', 'S4_cur','S2_cur','S1_cur','V_cur', 'Sp_cur', 't_cur','I', 'cycle number']  # Adjust column names as needed
    interpolated_df = pd.DataFrame(interpolated_data, columns=column_names)

    return interpolated_df

### 3) Functions for Nernst potentials and BV partial currents

In [3]:
# Defined in the code
EH0 = 2.35
EL0 = 2.195
iH0 = 10
iL0 = 5
R = 8.3145
T = 298
F = 96490
fh = 0.7296
fl = 0.06654
ar = 0.96

def high_Nernst(EH0,R,T,F,fh,S8,S4):
  EH = EH0 + (((R*T)/(4*F))*np.log(fh*(S8/(S4**2))))
  return EH

def low_Nernst(EL0,R,T,F,fl,S4,S2,S):
  EL = EL0 + (((R*T)/(4*F))*np.log(fl*(S4/((S**2)*S2))))
  return EL

def high_BV(iH0,ar,F,R,T,V,EH):
  iH = 2*iH0*ar*np.sinh((4*F*(V-EH))/(2*R*T))
  return iH

def low_BV(iL0,ar,F,R,T,V,EL):
  iL = 2*iL0*ar*np.sinh((4*F*(V-EL))/(2*R*T))
  return iL

### 4) Scaler function for X and y

In [4]:
def scaler_func(X,y,current):
    
    # Scaling dataset for all the cycles combined (specific current)
    scale_columns_cur_X = ['S8_cur', 'S4_cur', 'S2_cur', 'S1_cur', 'V_cur', 'Sp_cur','EH','EL','iH','iL']
    scale_columns_cur_y = ['S8_cur', 'S4_cur', 'S2_cur', 'S1_cur', 'Sp_cur', 'V_cur']

    means = X[scale_columns_cur_X].mean()
    stds = X[scale_columns_cur_X].std()

    # Create a new DataFrame to store these values
    scales = pd.DataFrame([means, stds], index=['mean', 'std'])

    for i in scale_columns_cur_X:
        X[i] = (X[i] - scales.loc['mean', i]) / scales.loc['std', i]


    for j in scale_columns_cur_y:
        y[j] = (y[j] - scales.loc['mean', j])/scales.loc['std', j]

    scales['I'] = current

    return scales,X,y

### 5) Data formatter for each current and multiple cycles

In [5]:
def Data_formatter_cycles(current, data):

    X_cycles = pd.DataFrame(columns=['S8_cur', 'S4_cur', 'S2_cur', 'S1_cur', 'V_cur', 'Sp_cur', 'I', 'EH', 'EL', 'iH', 'iL'])
    y_cycles = pd.DataFrame(columns=['S8_cur', 'S4_cur', 'S2_cur', 'S1_cur', 'Sp_cur' , 'V_cur'])

    # Extract unique cycle numbers to know how many times data formatting is required for particular cycle number
    unique_cycles = data['cycle'].unique()
    cycles = len(unique_cycles)

    # Defining scaling parameters
    scales = pd.DataFrame(columns=['S8_cur', 'S4_cur', 'S2_cur', 'S1_cur', 'V_cur', 'Sp_cur', 'I', 'EH', 'EL', 'iH', 'iL'])

    # Defining parameter
    scaler = StandardScaler()

    for i in range(cycles):

        # Define the input dataset for the particular cycle
        X_array = data[data['I'] == current]
        X_array = X_array[X_array['cycle'] == i]
        X_array = X_array.iloc[1:,1:] # Removing first row as it has erraneous values


        # Linearly interpolating timescale for even data
        time_increment = 0.05
        X_array_interpolated = interpolator(X_array, time_increment)

        # Apply the functions to the desired columns
        X_array_interpolated['EH'] = X_array_interpolated.apply(lambda row: high_Nernst(EH0, R, T, F, fh, row[0], row[1]), axis=1)
        X_array_interpolated['EL'] = X_array_interpolated.apply(lambda row: low_Nernst(EL0, R, T, F, fl, row[1], row[2], row[3]), axis=1)
        X_array_interpolated['iH'] = X_array_interpolated.apply(lambda row: high_BV(iH0, ar, F, R, T, row[4], row['EH']), axis=1)
        X_array_interpolated['iL'] = X_array_interpolated.apply(lambda row: low_BV(iL0, ar, F, R, T, row[4], row['EL']), axis=1)

        # Defining input array for NN
        filter_columns_cur = ['S8_cur', 'S4_cur', 'S2_cur', 'S1_cur', 'V_cur', 'Sp_cur','I','EH','EL','iH','iL']
        X_array_interpolated = X_array_interpolated[filter_columns_cur]

        # Defining output array
        filter_columns_nxt = ['S8_cur', 'S4_cur', 'S2_cur', 'S1_cur','Sp_cur', 'V_cur']
        y_interpolated = X_array_interpolated[filter_columns_nxt]

        # Setting current input and next output
        y_interpolated = y_interpolated.iloc[1:,:]
        X_array_interpolated = X_array_interpolated.iloc[:-1,:]
        
        # Removing first value as it is erraneous
        y_interpolated = y_interpolated.iloc[1:,:]
        X_array_interpolated = X_array_interpolated.iloc[1:,:]

        # Append interpolated data to X
        X_cycles = pd.concat([X_cycles, X_array_interpolated[X_cycles.columns]], ignore_index=True)

        # Append subset of interpolated data to y
        y_cycles = pd.concat([y_cycles, y_interpolated[y_cycles.columns]], ignore_index=True)


    # scales,X,y = scaler_func(X_cycles,y_cycles,current)

    return X_cycles, y_cycles




### 6) Saving data into an excel file to check

In [6]:
data = pd.read_excel('C:/Users/ADITYA/OneDrive - Imperial College London/Year 4/FYP/Final-year-project/VScode//Data/Dataset.xlsx')
current = 1.6

X_cycles, y_cycles = Data_formatter_cycles(current, data)

print(y_cycles)

# Join the DataFrames side by side
combined_df_unscaled = pd.concat([X_cycles, y_cycles], axis=1)

# Save the combined DataFrame as an Excel file for unscaled data
combined_df_unscaled.to_excel('C:/Users/ADITYA/OneDrive - Imperial College London/Year 4/FYP/Final-year-project/VScode//Data/Dataset_unscaled.xlsx', index=False)


scales,X,y = scaler_func(X_cycles,y_cycles,current)


# Join the DataFrames side by side
combined_df_scaled = pd.concat([X, y], axis=1)

# Save the combined DataFrame as an Excel file
combined_df_scaled.to_excel('C:/Users/ADITYA/OneDrive - Imperial College London/Year 4/FYP/Final-year-project/VScode//Data/Dataset_scaled.xlsx', index=False)

scales.to_excel('C:/Users/ADITYA/OneDrive - Imperial College London/Year 4/FYP/Final-year-project/VScode/Data/Scales.xlsx', index=False)

  X_array_interpolated['EH'] = X_array_interpolated.apply(lambda row: high_Nernst(EH0, R, T, F, fh, row[0], row[1]), axis=1)
  X_array_interpolated['EL'] = X_array_interpolated.apply(lambda row: low_Nernst(EL0, R, T, F, fl, row[1], row[2], row[3]), axis=1)
  X_array_interpolated['iH'] = X_array_interpolated.apply(lambda row: high_BV(iH0, ar, F, R, T, row[4], row['EH']), axis=1)
  X_array_interpolated['iL'] = X_array_interpolated.apply(lambda row: low_BV(iL0, ar, F, R, T, row[4], row['EL']), axis=1)
  X_cycles = pd.concat([X_cycles, X_array_interpolated[X_cycles.columns]], ignore_index=True)
  y_cycles = pd.concat([y_cycles, y_interpolated[y_cycles.columns]], ignore_index=True)
  X_array_interpolated['EH'] = X_array_interpolated.apply(lambda row: high_Nernst(EH0, R, T, F, fh, row[0], row[1]), axis=1)
  X_array_interpolated['EL'] = X_array_interpolated.apply(lambda row: low_Nernst(EL0, R, T, F, fl, row[1], row[2], row[3]), axis=1)
  X_array_interpolated['iH'] = X_array_interpolated.apply

          S8_cur    S4_cur    S2_cur        S1_cur    Sp_cur     V_cur
0       2.688960  0.002940  0.002697  3.522778e-09  0.000003  2.428108
1       2.688880  0.003020  0.002697  3.668722e-09  0.000003  2.427763
2       2.688800  0.003100  0.002697  3.815523e-09  0.000003  2.427427
3       2.688720  0.003180  0.002697  3.964209e-09  0.000003  2.427100
4       2.688640  0.003260  0.002697  4.114778e-09  0.000003  2.426781
...          ...       ...       ...           ...       ...       ...
144008  0.000246  1.406538  0.645255  1.941675e-04  0.642366  2.290260
144009  0.000246  1.406511  0.645268  1.941655e-04  0.642380  2.290259
144010  0.000246  1.406485  0.645282  1.941636e-04  0.642393  2.290259
144011  0.000246  1.406458  0.645295  1.941616e-04  0.642406  2.290259
144012  0.000246  1.406432  0.645308  1.941597e-04  0.642419  2.290259

[144013 rows x 6 columns]
