# optimization_plus.py
# Hyperparameter optimization helpers (GridSearchCV and Optuna) with JSON logging of results.

import json
import numpy as np
from math import sqrt
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
import optuna

def _json_default(value):
    """Fallback serializer for ``json.dumps``: convert NumPy-style values to builtins."""
    # NumPy scalars and arrays both expose ``tolist()``, which yields plain
    # Python numbers / lists. Duck-typing avoids a hard NumPy dependency here.
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def log_parameters(params, metrics, file_path="model_log.json"):
    """
    Appends model parameters and evaluation metrics to a JSON log file.

    The log file holds a JSON list; each call appends one
    ``{"params": ..., "metrics": ...}`` entry and rewrites the file.

    Parameters:
        params (dict): Model parameters/hyperparameters.
        metrics (dict): Evaluation metrics.
        file_path (str): Path to the log JSON file.

    Raises:
        json.JSONDecodeError: If the existing file contains non-empty, invalid JSON.
        TypeError: If ``params`` or ``metrics`` hold values that cannot be
            serialized. The existing log file is left untouched in that case.
    """
    log_entry = {"params": params, "metrics": metrics}

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        raw = ""

    # A missing or empty file (e.g. freshly created with ``touch``) starts a new log.
    log_data = json.loads(raw) if raw.strip() else []
    if not isinstance(log_data, list):
        # Keep whatever was stored before instead of failing on ``.append``.
        log_data = [log_data]

    log_data.append(log_entry)

    # Serialize BEFORE opening for writing: opening with "w" truncates the file,
    # so a serialization error afterwards would destroy the existing log.
    payload = json.dumps(log_data, indent=4, default=_json_default)

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"Logged parameters and metrics to {file_path}")

def optimize_model_gridsearch(model, param_grid, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1, verbose=1):
    """
    Optimizes a model's hyperparameters using GridSearchCV.

    Parameters:
        model: A scikit-learn-like estimator.
        param_grid (dict): Dictionary with parameter names as keys and lists of values to try.
        X_train: Training features.
        y_train: Training target.
        scoring (str): Scoring metric (default: negative RMSE).
        cv (int): Number of folds for cross-validation.
        n_jobs (int): Number of parallel jobs forwarded to GridSearchCV
            (default: -1, the value previously hard-coded here).
        verbose (int): Verbosity level forwarded to GridSearchCV
            (default: 1, the value previously hard-coded here).

    Returns:
        best_params (dict): Best hyperparameters found.
        best_score (float): Best cross-validation score.
        best_estimator: The estimator refit with the best parameters.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    grid_search.fit(X_train, y_train)
    return (
        grid_search.best_params_,
        grid_search.best_score_,
        grid_search.best_estimator_,
    )

def optimize_model_optuna(model_class, param_grid, X_train, y_train, cv=5, n_trials=50, random_state=42):
    """
    Optimizes hyperparameters using Optuna.

    Each entry of ``param_grid`` is turned into an Optuna search dimension:

    * all values are integers      -> ``suggest_int`` over ``[min, max]``
    * all values are real numbers  -> ``suggest_float`` over ``[min, max]``
    * anything else (strings, booleans, ``None``, mixed types)
                                   -> ``suggest_categorical`` over the listed values

    Note that numeric dimensions are sampled from the whole ``[min, max]``
    range, not only from the listed values.

    Parameters:
        model_class: A scikit-learn model class (not an instance).
        param_grid (dict): Dictionary with parameter names as keys and lists of possible values.
        X_train: Training features (DataFrame; NumPy arrays and lists are also accepted).
        y_train: Training target (Series; NumPy arrays and lists are also accepted).
        cv (int): Number of folds for cross-validation.
        n_trials (int): Number of trials for the Optuna study.
        random_state (int): Seed for reproducibility. Used for the fold shuffling,
            the Optuna sampler and, when ``model_class`` accepts a
            ``random_state`` argument, the model itself.

    Returns:
        best_params (dict): Best hyperparameters found.
        best_score (float): Best (mean) cross-validation score (negative RMSE).
        best_model: A model instance trained with the best parameters.

    Raises:
        ValueError: If a parameter in ``param_grid`` has no candidate values.
    """
    import inspect  # local import: only needed to probe the model's signature

    def _is_int(value):
        # bool is a subclass of int but must stay categorical (True/False, not 0/1).
        return isinstance(value, (int, np.integer)) and not isinstance(value, bool)

    def _is_real(value):
        return (
            isinstance(value, (int, float, np.integer, np.floating))
            and not isinstance(value, bool)
        )

    def _suggest(trial, name, values):
        # Inspect ALL values, not just the first one: a list such as
        # [10, 20, None] must be categorical, otherwise min()/max() would fail.
        if all(_is_int(v) for v in values):
            return trial.suggest_int(name, int(min(values)), int(max(values)))
        if all(_is_real(v) for v in values):
            return trial.suggest_float(name, float(min(values)), float(max(values)))
        return trial.suggest_categorical(name, values)

    def _take_rows(data, indices):
        # Positional row selection that works for pandas objects and array-likes.
        if hasattr(data, "iloc"):
            return data.iloc[indices]
        if isinstance(data, (list, tuple)):
            data = np.asarray(data)
        return data[indices]

    # Validate the search space up front so a bad grid fails before any trial runs.
    search_space = {}
    for name, values in param_grid.items():
        candidates = list(values)
        if not candidates:
            raise ValueError(f"param_grid[{name!r}] must contain at least one value")
        search_space[name] = candidates

    # Seed the model too when it supports it; otherwise stochastic models make
    # the study non-reproducible even though the sampler and folds are seeded.
    try:
        accepts_seed = "random_state" in inspect.signature(model_class).parameters
    except (TypeError, ValueError):
        accepts_seed = False

    def _build_model(params):
        kwargs = dict(params)
        if accepts_seed:
            # setdefault: a random_state tuned via param_grid takes precedence.
            kwargs.setdefault("random_state", random_state)
        return model_class(**kwargs)

    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)

    def objective(trial):
        params = {
            name: _suggest(trial, name, values)
            for name, values in search_space.items()
        }

        scores = []
        for train_index, test_index in kf.split(X_train):
            model = _build_model(params)  # fresh estimator per fold
            model.fit(_take_rows(X_train, train_index), _take_rows(y_train, train_index))
            y_pred = model.predict(_take_rows(X_train, test_index))
            rmse = np.sqrt(mean_squared_error(_take_rows(y_train, test_index), y_pred))
            scores.append(-rmse)  # Negative RMSE (the study maximizes the objective)
        return float(np.mean(scores))

    study = optuna.create_study(
        direction="maximize",
        study_name="hyperparameter_optimization",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    best_score = study.best_value
    # Refit on the full training data with the winning configuration.
    best_model = _build_model(best_params)
    best_model.fit(X_train, y_train)
    return best_params, best_score, best_model

# Smoke test: executed only when this file is run as a script.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor

    # Expects "processed_data.csv" in the working directory, with a
    # "resale_price" column used as the regression target.
    dataset = pd.read_csv("processed_data.csv")
    target = dataset["resale_price"]
    features = dataset.drop(columns=["resale_price"])

    # Candidate hyperparameter values for RandomForestRegressor.
    rf_search_space = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10],
    }

    # --- Exhaustive grid search ---
    grid_params, grid_score, grid_model = optimize_model_gridsearch(
        model=RandomForestRegressor(random_state=42),
        param_grid=rf_search_space,
        X_train=features,
        y_train=target,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    print("GridSearchCV Best Parameters:", grid_params)
    print("GridSearchCV Best Score (negative RMSE):", grid_score)
    log_parameters(
        grid_params,
        {"best_score": grid_score},
        file_path="gridsearch_log.json",
    )

    # --- Optuna search ---
    optuna_params, optuna_score, optuna_model = optimize_model_optuna(
        model_class=RandomForestRegressor,
        param_grid=rf_search_space,
        X_train=features,
        y_train=target,
        cv=5,
        n_trials=20,
        random_state=42,
    )
    print("Optuna Best Parameters:", optuna_params)
    print("Optuna Best Score (negative RMSE):", optuna_score)
    log_parameters(
        optuna_params,
        {"best_score": optuna_score},
        file_path="optuna_log.json",
    )
