In [42]:
# Experiment 3: increase the proportion of enzymes in the reaction system.
# Layout: 5 enzyme-addition levels x 6 initial substrate concentrations = 30 runs,
# with each row of measured values below corresponding to one enzyme level.
experiment_3 = dict(
    # Each enzyme level is repeated once per initial substrate concentration.
    Enzyme_Addition=[level for level in (10, 15, 30, 45, 60) for _ in range(6)],
    # The same 1..6 concentration sweep is repeated for every enzyme level.
    Initial_Substrate_Conc=list(range(1, 7)) * 5,
    Final_Substrate_Conc=[
        0.6, 1.4, 2.6, 2.0, 0.9, 0.8,
        0.8, 1.7, 2.7, 3.2, 3.6, 3.4,
        0.5, 1.3, 2.5, 3.0, 3.4, 2.8,
        0.3, 1.0, 2.2, 2.6, 3.0, 2.7,
        0.5, 1.2, 2.2, 2.6, 3.0, 2.3,
    ],
    Final_Product_Conc=[
        0.319, 0.456, 0.557, 0.425, 0.196, 0.196,
        0.120, 0.188, 0.432, 0.395, 0.566, 0.576,
        0.387, 0.554, 0.792, 0.853, 0.991, 1.161,
        0.544, 0.819, 0.885, 1.241, 1.161, 1.303,
        0.275, 0.485, 0.995, 1.201, 1.322, 1.072,
    ],
    Conversion_Rate=[
        42.42, 32.33, 12.45, 50.5, 82.15, 86.93,
        21.34, 16.10, 9.03, 18.89, 28.54, 43.08,
        48.74, 35.33, 16.01, 24.57, 31.96, 53.34,
        69.27, 50.18, 27.64, 35.55, 40.46, 55.03,
        53.81, 41.58, 25.54, 35.47, 40.75, 61.97,
    ],
    Productivity=[
        31.86, 22.78, 18.55, 10.63, 3.93, 3.27,
        11.98, 9.40, 14.39, 9.88, 11.33, 9.61,
        38.73, 27.70, 26.39, 21.33, 19.81, 13.75,
        54.4, 40.93, 29.49, 31.03, 23.23, 21.71,
        27.52, 24.27, 33.17, 30.03, 26.43, 17.87,
    ],
    # Methanol concentration is held constant across all 30 runs.
    MeOH_Conc=[20 for _ in range(30)],
)

In [43]:
import pandas as pd

# Tabular view of experiment 3: one row per run, one column per dict key.
df_exp = pd.DataFrame(data=experiment_3)

In [44]:
import numpy as np
from sklearn.model_selection import KFold, ShuffleSplit, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

def get_rating(x):
    """Map a percentage onto an integer rating from 1 to 10.

    Each full 10 percentage points raises the rating by one, starting at 1
    for values in [0, 10). Results outside the 1..10 range are clamped, so
    negative inputs rate 1 and inputs of 90 or more rate 10.

    Parameters
    ----------
    x : number
        Percentage value, nominally in the range 0-100.

    Returns
    -------
    int
        Rating between 1 and 10 inclusive.
    """
    # int() truncates toward zero; the guards below take care of out-of-range buckets.
    bucket = 1 + int((x / 100) * 10)
    if bucket < 1:
        return 1
    if bucket > 10:
        return 10
    return bucket

def GenerateRandomData(number_of_times, source_df=None, seed=None):
    """Generate synthetic samples that mimic the per-column statistics of a data frame.

    Every column is sampled independently from a normal distribution whose
    mean and standard deviation are taken from that column of the source
    frame. Because the draws are independent and unbounded, relationships
    between columns are not reproduced and sampled values can fall outside
    the range of the source data (including below zero).

    Parameters
    ----------
    number_of_times : int
        Number of synthetic values to draw per column.
    source_df : pandas.DataFrame, optional
        Frame supplying the column means / standard deviations. Defaults to
        the notebook-level ``df_exp`` so existing callers keep working.
    seed : int, optional
        When given, draws come from ``np.random.default_rng(seed)`` and are
        reproducible. When omitted, NumPy's global random state is used, as
        before.

    Returns
    -------
    dict
        Maps each column name to a 1-D NumPy array of ``number_of_times`` draws.
    """
    if source_df is None:
        source_df = df_exp

    # Both the np.random module and a Generator expose the same normal(loc, scale, size) call.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    col_means = source_df.mean()
    col_stds = source_df.std()

    return {
        col: sampler.normal(loc=col_means[col], scale=col_stds[col], size=number_of_times)
        for col in source_df.columns
    }

def CleanAndCalculateDF(new_df):
    """Recompute conversion/productivity from the concentrations and attach a 1-10 rating.

    Steps:
      1. Drop the supplied 'Conversion_Rate' and 'Productivity' columns.
      2. Recalculate both from the concentration columns, expressed as
         percentages so they are on the 0-100 scale that ``get_rating``
         expects.
      3. Discard rows where the calculation is undefined (division by a zero
         initial concentration yields +/-inf or NaN) or any value is missing.
      4. Drop 'MeOH_Conc', which is constant throughout the experiment.
      5. Add 'conversion' (equal-weight average of the two percentages) and
         'Rating' (``get_rating`` applied to 'conversion').

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain 'Initial_Substrate_Conc', 'Final_Substrate_Conc',
        'Final_Product_Conc', 'Conversion_Rate', 'Productivity' and
        'MeOH_Conc'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining input columns plus
        'Calc_Conversion_Rate', 'Calc_Productivity', 'conversion' and 'Rating'.
    """
    initial = new_df['Initial_Substrate_Conc']

    cleaned = (
        new_df
        .drop(columns=['Conversion_Rate', 'Productivity'])
        .assign(
            # Percent of the initial substrate that was consumed.
            Calc_Conversion_Rate=(initial - new_df['Final_Substrate_Conc']) / initial * 100,
            # Product formed as a percent of the initial substrate.
            Calc_Productivity=new_df['Final_Product_Conc'] / initial * 100,
        )
        # Treat undefined ratios as missing so dropna() removes them.
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
        .drop(columns=['MeOH_Conc'])
    )

    # The rating is driven by an equal-weight blend of conversion and productivity.
    cleaned['conversion'] = (cleaned['Calc_Conversion_Rate'] * 0.5) + (cleaned['Calc_Productivity'] * 0.5)
    cleaned['Rating'] = cleaned['conversion'].apply(get_rating)

    return cleaned

def GetToModeling(df, random_state=42):
    """Fit five regressors to predict 'Rating' and report their scores.

    The frame is split 80/20 into train and test sets. Each model is fitted
    on the training set, cross-validated on the training set with ten
    shuffled 80/20 splits, and scored on both the training and test sets.

    The values stored under the 'Train Accuracy', 'Test Accuracy' and
    'Cross-validation' keys are the estimators' ``score()`` results (R^2 for
    scikit-learn regressors), not classification accuracy. The key names are
    kept as-is because downstream output depends on them.

    NOTE(review): every column other than 'Rating' is used as a feature. If
    the frame still carries the columns 'Rating' was derived from, the models
    are effectively learning that derivation - confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a 'Rating' target column.
    random_state : int or None, default 42
        Seed for the train/test split, the cross-validation splitter and the
        randomized estimators, so a given frame always produces the same
        scores and every model is cross-validated on identical folds.

    Returns
    -------
    list of dict
        One dict per model, in the order Linear Regression, RF, KNN, DT, SVR,
        with keys 'Model', 'Train Accuracy', 'Test Accuracy' and
        'Cross-validation' (mean of the ten CV scores).
    """
    # Splitting into train and test sets
    features = df.loc[:, df.columns != 'Rating']
    target = df['Rating']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state
    )

    # Seeded splitter: the same ten folds are reused for every model below.
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=random_state)

    models = {
        'Linear Regression': LinearRegression(),
        # warm_start is intentionally left at its default: each forest is fitted
        # from scratch, and a warm-started model would silently skip retraining
        # if it were ever fitted a second time.
        'RF': RandomForestRegressor(
            random_state=random_state, criterion='absolute_error', max_depth=100
        ),
        'KNN': KNeighborsRegressor(),
        'DT': DecisionTreeRegressor(random_state=random_state),
        'SVR': svm.SVR(),
    }

    result = []
    for label, model in models.items():
        model.fit(X_train, y_train)
        # cross_val_score works on clones, so the fitted model above is untouched.
        cv_scores = cross_val_score(model, X_train, y_train, cv=cv)
        result.append({
            'Model': label,
            'Train Accuracy': model.score(X_train, y_train),
            'Test Accuracy': model.score(X_test, y_test),
            'Cross-validation': cv_scores.mean(),
        })

    return result





In [46]:
import pandas as pd

def generate_dataframe(num_of_samples=1000):
    """Run one full simulate -> clean -> model pass and return the score table.

    Parameters
    ----------
    num_of_samples : int, default 1000
        Number of synthetic rows to generate before cleaning. Rows with
        undefined calculated values are removed during cleaning, so the
        modelled frame may be smaller.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns produced by ``GetToModeling``.
    """
    synthetic_df = pd.DataFrame(GenerateRandomData(num_of_samples))
    rated_df = CleanAndCalculateDF(synthetic_df)
    df_Accuracy = pd.DataFrame(GetToModeling(rated_df))
    return df_Accuracy

# Number of times to run the method
n = 100

# Collect every run's score table and concatenate once at the end.
# Concatenating inside the loop re-copies all earlier rows on each iteration.
run_frames = []
for _ in range(n):
    df_iterate = generate_dataframe()
    run_frames.append(df_iterate)

# pd.concat raises on an empty list, so fall back to an empty frame when n == 0.
combined_df = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame()

# Define the file path where you want to save the Excel file
excel_file_path = 'output_data.xlsx'

# Save the combined DataFrame to an Excel sheet
combined_df.to_excel(excel_file_path, index=False)
