# Cosmetics Generative Formulation
## Microbiological evolution prediction

In [None]:
import os
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Work from the project folder so that the relative './Models/...' paths
# written later in this notebook resolve against it.
project_dir = '/home/data/RTV/DEMOR424/BIOVIA/BPP2022/public/cpgretail/Cosmetics Generative Formulation/'
os.chdir(project_dir)

In [None]:
# WARNING: DO NOT REMOVE OR MODIFY.
runningInJupyter = True

from plp_jupyter_data_loader import loadPipelinePilotData
plp_df, plp_params, plp_globals = loadPipelinePilotData()
import pandas as pd
if plp_df.shape[0] > 0:
    print("plp_df.dtypes:\n" + plp_df.dtypes.to_string())

### 1. Retrieve training dataset

In [None]:
# Columns of plp_df that are neither a feature nor a target and are therefore
# excluded from the model inputs.
to_drop = ['formula_id']

# One regression target per micro-organism; every target column of plp_df is
# named '<organism> cfu g'.
organisms = (
    'Staphylococcus aureus',
    'Pseudomonas aeruginosa',
    'Escherichia coli',
    'Candida albicans',
    'Aspergillus niger',
    'Enterococcus faecalis',
)
targets = ['{} cfu g'.format(organism) for organism in organisms]

In [None]:
# Feature matrix: every column of plp_df except the target columns and the
# identifier column(s) listed in to_drop.
excluded_columns = [*targets, *to_drop]
X = plp_df.drop(columns=excluded_columns)

### 2. Retrieve targets and train models
Using grid search for hyperparameter tuning.

In [None]:
# Train one gradient-boosting regressor per target column.
#
# For every target the loop:
#   1. splits the data into a training part and a 25% hold-out part,
#   2. tunes the regressor with a 5-fold cross-validated grid search,
#   3. writes the winning hyperparameters to
#      ./Models/Metrics/<target>_best_params.json,
#   4. scores the tuned model on the hold-out part (MSE, MAE, R2) and stores
#      the scores in dic_metrics[<target>],
#   5. pickles the tuned model to ./Models/model_gradient_boosting_<target>.sav.

# Make sure the output folders exist before any (long) training starts, so a
# missing directory cannot fail the run after a full grid search; this also
# creates ./Models, the parent folder used for the pickled models.
os.makedirs('./Models/Metrics', exist_ok=True)

# Hyperparameter grid explored for each target (2 x 2 x 2 = 8 candidates).
# It is identical for all targets, so it is built once outside the loop.
param_grid = {
    'n_estimators': [200, 500],
    'learning_rate': [0.01, 0.1],
    'max_depth': [5, 7],
}

dic_metrics = {}
for tgt in targets:
    y = plp_df[tgt]

    # Hold out 25% of the rows for the final evaluation; the fixed seed keeps
    # the split reproducible from run to run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2025
    )

    # Base estimator; GridSearchCV fits copies of it for each candidate.
    model = GradientBoostingRegressor(random_state=2025)

    # Candidates are ranked by negated MSE (higher is better) averaged over
    # 5 cross-validation folds of the training part.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
    )

    print("Starting grid search...")
    grid_search.fit(X_train, y_train)

    # Save the best hyperparameters (only the parameters, not the CV score).
    with open('./Models/Metrics/{}_best_params.json'.format(tgt), 'w') as f:
        json.dump(grid_search.best_params_, f)

    # best_estimator_ is the winning candidate as refitted by GridSearchCV
    # (default refit=True); evaluate it on the untouched hold-out part.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    dic_metrics[tgt] = {
        "MSE": mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred),
    }

    # Save the model to disk; the context manager guarantees the file is
    # flushed and closed even if pickling raises.
    filename = './Models/model_gradient_boosting_{}.sav'.format(tgt)
    with open(filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print('Model saved to disk.')

In [None]:
# Persist the hold-out metrics of all targets as a single JSON document
# (one entry per target, each holding its MSE, MAE and R2).
metrics_path = './Models/Metrics/microbiology_metrics.json'
with open(metrics_path, 'w') as metrics_file:
    json.dump(dic_metrics, fp=metrics_file)

In [None]:
# Replace plp_df with the metrics table: one column per target, one row per
# metric (MSE, MAE, R2).
# NOTE(review): presumably Pipeline Pilot reads plp_df back as the cell's
# output dataset - confirm against the PLP Jupyter integration.
plp_df = pd.DataFrame(data=dic_metrics)