PIPELINE


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import optuna
import joblib

# Load the processed yield dataset. By position, every column except the last
# is used as a feature and the last column is used as the regression target.
data = pd.read_csv(r"C:\Users\Avijit\Desktop\AgriAssure\AGRIYIELDPRO\PROCESSED_DATASET\yield5.csv")

# Fail early with a clear message instead of an obscure error deep inside
# scikit-learn when the file does not have both features and a target.
if data.shape[1] < 2:
    raise ValueError(
        "Dataset must contain at least one feature column and one target column."
    )

# --- Data Preparation ---
# Separate features (x) and target (y)
x = data.iloc[:, :-1]
y = data.iloc[:, -1:]
# Flatten the single-column target frame into a 1-D array, which is the shape
# scikit-learn regressors expect. `to_numpy().ravel()` avoids calling
# `np.reshape` on a DataFrame with the deprecated `newshape` keyword.
y1 = y.to_numpy().ravel()

# Split the data into training (80%) and testing (20%) sets; the fixed seed
# keeps the split reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y1, train_size=0.8, random_state=32)

# --- Define the Hyperparameter Optimization Objective Function ---
def objective(trial):
    """Score one Optuna trial of HistGradientBoostingRegressor hyperparameters.

    Samples a hyperparameter combination from the search space below, builds a
    scaler + regressor pipeline with it, and evaluates that pipeline with
    5-fold cross-validation on the module-level ``xtrain`` / ``ytrain``.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        The negated mean cross-validated R² score. The value is negated so
        that a study created with ``direction='minimize'`` maximizes R².
    """
    # `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform`; it samples from the same log-uniform distribution.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 20),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-3, 10.0, log=True),
    }

    # Create the pipeline with the suggested parameters.
    # A StandardScaler is included as a good practice, even though
    # tree-based models are not sensitive to feature scaling.
    pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('regressor', HistGradientBoostingRegressor(**params)),
    ])

    # Use K-Fold cross-validation to get a robust score for the trial. The
    # fixed seed gives every trial the same folds, so scores are comparable.
    cv_folds = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(
        pipeline, xtrain, ytrain, cv=cv_folds, scoring='r2', n_jobs=-1
    )

    # The study minimizes the objective, so return the negative mean R².
    return -float(np.mean(scores))

# --- Run the Optuna Study ---
# The objective returns the negated R², so the study minimizes it.
print("Starting hyperparameter optimization with Optuna...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, show_progress_bar=True)
print("Optimization complete.")

# --- Get the Best Hyperparameters ---
best_params = study.best_params
# Undo the sign flip applied in the objective to recover the actual R².
best_score = -study.best_value
print("\n--- Best Results ---")
print(f"Best R-squared score found: {best_score:.4f}")
print("Best hyperparameters found:")
for key, value in best_params.items():
    # Only float parameters get fixed-point formatting; integer parameters
    # such as max_iter are printed as-is rather than as e.g. "250.0000".
    formatted = f"{value:.4f}" if isinstance(value, float) else str(value)
    print(f"  {key}: {formatted}")

# --- Train the Final Pipeline with the Best Parameters ---
# Refit on the full training split; the cross-validated fits made during the
# search are not kept.
print("\nTraining the final model with the best parameters...")
final_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', HistGradientBoostingRegressor(**best_params))
])
final_pipeline.fit(xtrain, ytrain)
print("Final model training complete.")

# --- Evaluate the Final Model on the Test Set ---
# Pipeline.score delegates to the regressor's score, i.e. R² here.
train_score = final_pipeline.score(xtrain, ytrain)
test_score = final_pipeline.score(xtest, ytest)
print(f"Final R-squared on training data: {train_score:.4f}")
print(f"Final R-squared on testing data: {test_score:.4f}")

# --- Save the Entire Optimized Pipeline ---
# Saving the entire pipeline ensures both the scaler and the regressor are saved together.
model_path = r"C:\Users\Avijit\Desktop\AgriAssure\AGRIYIELDPRO\MODEL\histgradientboostingV3.pkl"
joblib.dump(final_pipeline, model_path)
# Report the path that was actually written, not a stale hard-coded name.
print(f"\nOptimized pipeline saved to {model_path}")

