# Cosmetics Generative Formulation
## Physicochemical properties prediction

In [None]:
import os
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Run from the project folder: the model and metrics files written further
# down use paths relative to the current working directory.
project_dir = '/home/data/RTV/DEMOR424/BIOVIA/BPP2022/public/cpgretail/Cosmetics Generative Formulation/'
os.chdir(project_dir)

In [None]:
# WARNING: DO NOT REMOVE OR MODIFY.
# Flag set here; no other visible cell reads it.
# NOTE(review): presumably consumed by the Pipeline Pilot integration — confirm.
runningInJupyter = True

# Load the inputs supplied by the loader: a tabular object bound to plp_df
# plus two further objects (plp_params, plp_globals) whose contents are not
# visible from this notebook.
from plp_jupyter_data_loader import loadPipelinePilotData
plp_df, plp_params, plp_globals = loadPipelinePilotData()
import pandas as pd
# Print the column dtypes only when at least one row was loaded.
if plp_df.shape[0] > 0:
    print("plp_df.dtypes:\n" + plp_df.dtypes.to_string())

### 1. Retrieve training dataset

In [None]:
# Columns to model. Only 'homogeneity' is active; the other candidate names
# are kept here for reference and are currently disabled:
#   'Play Time (s)', 'Stickiness', 'Slippery Finish', 'Spreadability',
#   'Viscosity (Pa.s)', 'pH', 'Color', 'Odor'
targets = ['homogeneity']

# Columns removed from the input table in addition to the targets.
to_drop = ['formula_id', 'homogeneity_proba', 'Stability']

In [None]:
# Feature matrix: every column of plp_df except the targets and the
# explicitly excluded columns, as a plain array.
X = plp_df.drop(columns=[*targets, *to_drop]).values

### 2. Retrieve targets and train models
Using grid search for hyperparameter tuning.

In [None]:
# Train, evaluate and persist one gradient-boosting model per target.
#
# For every name in `targets` this cell:
#   1. builds the label vector from `plp_df` and splits X / y 75 % / 25 %,
#   2. runs a 5-fold GridSearchCV over `param_grid`,
#   3. writes the best hyperparameters to ./Models/Metrics/<tgt>_best_params.json,
#   4. scores the best estimator on the held-out test split,
#   5. writes those scores to ./Models/Metrics/<tgt>_metrics.json,
#   6. pickles the best estimator to ./Models/model_gradient_boosting_<tgt>.sav.
#
# `dic_metrics` maps each target name to its metrics dictionary.

# Targets treated as classification problems; any other target is regressed.
classification_targets = {'homogeneity'}

# Hyperparameter grid, currently pinned to a single combination.
# Candidate values for a wider search:
#   'n_estimators': [100, 200, 500, 1000]
#   'learning_rate': [0.01, 0.1, 0.2]
#   'max_depth': [3, 5, 7, 9]
#   'min_samples_split': [2, 5, 10]
#   'min_samples_leaf': [1, 2, 4]
param_grid = {
    'n_estimators': [1000],
    'learning_rate': [0.1],
    'max_depth': [5],
}

# Make sure the output folders exist before the expensive grid search, so a
# missing directory cannot discard a finished fit.
os.makedirs('./Models/Metrics', exist_ok=True)

dic_metrics = {}
for tgt in targets:
    is_classification = tgt in classification_targets

    # Class labels are cast to int. Continuous targets keep their fractional
    # part: casting them to int would truncate the values being regressed.
    if is_classification:
        y = plp_df[tgt].astype(int).values
    else:
        y = plp_df[tgt].astype(float).values

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=2025
    )

    # Pick the estimator and the cross-validation score that match the task
    if is_classification:
        model = GradientBoostingClassifier(random_state=2025)
        scoring = 'accuracy'
    else:
        model = GradientBoostingRegressor(random_state=2025)
        scoring = 'neg_mean_squared_error'

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring=scoring,
        verbose=1
    )

    # Fit the model to the training data
    print("Starting grid search...")
    grid_search.fit(X_train, y_train)

    # Save the best parameters found by the search
    with open('./Models/Metrics/{}_best_params.json'.format(tgt), 'w') as f:
        json.dump(grid_search.best_params_, f)

    # Evaluate the best model on the test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    if is_classification:
        dic_metrics[tgt] = {
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred),
            "Accuracy": accuracy_score(y_test, y_pred)
        }
    else:
        dic_metrics[tgt] = {
            "MSE": mean_squared_error(y_test, y_pred),
            "MAE": mean_absolute_error(y_test, y_pred),
            "R2": r2_score(y_test, y_pred)
        }

    # Save this target's metrics only. The {target: metrics} layout is kept,
    # but metrics of previously processed targets no longer leak into the file.
    with open('./Models/Metrics/{}_metrics.json'.format(tgt), 'w') as f:
        json.dump({tgt: dic_metrics[tgt]}, f)

    # Save the model to disk; the context manager closes the file handle
    filename = './Models/model_gradient_boosting_{}.sav'.format(tgt)
    with open(filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print('Model saved to disk.')

In [None]:
# Rebind plp_df to the metrics table: one column per target, one row per metric.
plp_df = pd.DataFrame(data=dic_metrics)