In [6]:
import pandas as pd
import joblib 
from sklearn.linear_model import LinearRegression 



# --- 1. Load Processed Training Data ---

try:
    x_train = joblib.load('../models/x_train.pkl')
    x_test=joblib.load('../models/x_test.pkl')
    y_train=joblib.load('../models/y_train.pkl')
    y_train_p=y_train[['protein feed']]
    y_test=joblib.load('../models/y_test.pkl')
    y_test_p=y_test[['protein feed']]

except FileNotFoundError:
    print("-" * 60)
    print("FATAL ERROR: Processed data files not found.")
    print("Ensure the pipeline_exporter script has been run successfully to create these PKL files.")
    print("-" * 60)
    raise

print(f"‚úÖ Successfully loaded features ({x_train.shape}) and target ({y_train_p}).")
print("-" * 60)


# --- 2. Initialize, Train, and Export Model ---

# GRID SEARCH SVM
import joblib 
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV # <--- NEW IMPORT
import numpy as np


# ----------------------------------------------

# --- 1. Define the Parameter Grid for Grid Search ---
# This dictionary holds the hyperparameters and the values you want to test.
param_grid = {
    'C': [1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto', 0.1, 1], # Kernel coefficient
    'kernel': ['rbf'] # For SVR, 'rbf' is commonly used
}

# --- 2. Define the Base Model ---
base_model = SVR()

# --- 3. Grid Search and Training Loop ---
print("Starting Grid Search CV and training of 6 individual SVR models...")
print("-" * 60)

# NOTE: Since your original snippet only shows training for a single model
# (using x_train and y_train_p), the example below simulates the loop structure
# you would need for all six targets.

# Assuming you are iterating through targets like this:
# for i, target_col in enumerate(TARGET_COLUMNS):
#     y_train_p = Y_for_fit_train.iloc[:, i] 
#     y_test_p = Y_for_fit_test.iloc[:, i]
#     ...

# --- Example for a single target (as in your original snippet) ---

# Initialize the GridSearchCV object
# cv=5 means 5-fold cross-validation
# scoring='neg_mean_squared_error' is common for regression (it maximizes the negative MSE, 
# which is equivalent to minimizing the positive MSE/RMSE)
grid_search = GridSearchCV(
    estimator=base_model, 
    param_grid=param_grid, 
    scoring='neg_mean_squared_error',
    cv=5, 
    verbose=1,
    n_jobs=-1 # Use all available cores
)

# Fit Grid Search to the training data
# This step performs the cross-validation across all parameter combinations
grid_search.fit(x_train, y_train_p) 

# The best model found by the grid search
best_model = grid_search.best_estimator_

# Get the best parameters
best_params = grid_search.best_params_
print(f" üèÜ Best Parameters Found: {best_params}")

# Optional: Quick Evaluation on Testing Data (for confidence)
y_pred_p = best_model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test_p, y_pred_p))
    
print(f" üéØ (Test RMSE: {rmse:.4f})")

# Export the best model (e.g., using a target-specific filename)
# model_filename = f'svr_model_{target_col}.pkl'
# joblib.dump(best_model, model_filename)
# print(f" üíæ Model saved as {model_filename}")

print("-" * 60)
print("Grid Search CV complete. The best SVR model is trained and ready for export.")

# C. Export the Fitted Model
# This is the file you need to upload as 'protein.pkl' (or rename it after export)
joblib.dump(best_model, 'protein_feed.pkl')

print("‚úÖ Trained model saved to protein_feed.pkl. Ready for prediction.")
print("-" * 60)

‚úÖ Successfully loaded features ((870, 42)) and target (      protein feed
834              1
552            -35
855             -2
215            -10
256             10
...            ...
330            -10
466            -19
121              3
1044            -8
860             13

[870 rows x 1 columns]).
------------------------------------------------------------
Starting Grid Search CV and training of 6 individual SVR models...
------------------------------------------------------------
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  y = column_or_1d(y, warn=True)


 üèÜ Best Parameters Found: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
 üéØ (Test RMSE: 3.3498)
------------------------------------------------------------
Grid Search CV complete. The best SVR model is trained and ready for export.
‚úÖ Trained model saved to protein_feed.pkl. Ready for prediction.
------------------------------------------------------------


In [9]:
import pandas as pd
import joblib
import numpy as np

# --- Configuration (Names of your deployed assets) ---
PREPROCESSOR_FILE = '../models/preprocessor.pkl'
MODEL_FILE = '../models/protein_feed.pkl' # Assuming this is your final SVR model file

# --- 1. Define Example Raw Input Data ---
# This dictionary must contain all the original feature columns 
# in the format they would arrive from a web form or database.
example_raw_input = pd.DataFrame({
    'BLS Code': [41],             # Example BLS Code
    'Scenario': ['CM3-A'],        # Must be a category seen in training
    'Time_Slice': ['2080'],       # Needs scaling
    'CO2 effects': ['Yes'],
    'CO2 ppm': [712],             # Needs scaling
    'Adaptation': ['Level 1'],
    # Note: Other columns like 'wheat', 'rice', etc., are targets and NOT included here.
})

# --- 2. Load the Deployed Assets ---
try:
    preprocessor = joblib.load(PREPROCESSOR_FILE)
    model_protein_feed = joblib.load(MODEL_FILE)
    print(f"‚úÖ Successfully loaded '{PREPROCESSOR_FILE}' and '{MODEL_FILE}'.")
except FileNotFoundError:
    print("-" * 60)
    print("FATAL ERROR: Could not find one or more PKL files.")
    print("Please ensure your pipeline and model export steps ran successfully.")
    print("-" * 60)
    raise

# --- 3. Execute the Full Prediction Flow ---

print("\nStarting Validation Flow:")
print("-" * 30)
print(f"Raw Input: {example_raw_input[['Scenario', 'CO2 effects', 'CO2 ppm', 'Time_Slice', 'Adaptation']].values[0]}")
print("-" * 30)

# A. Preprocessing: Transform the raw data (9 features) into the clean format (43 features)
# This is the crucial step where the preprocessor checks if it's running well.
X_new_processed_array = preprocessor.transform(example_raw_input)

# Optional: Inspect the processed output (helps debugging order/scaling issues)
X_new_processed_df = pd.DataFrame(
    X_new_processed_array, 
    columns=preprocessor.named_steps['preprocessor'].get_feature_names_out()
)
print(f"1. Preprocessing complete. Output shape: {X_new_processed_array.shape}")
print(f"   (First 5 processed values: {X_new_processed_array[0][:5]})")


# B. Prediction: Pass the cleaned array to the fitted SVR model
prediction = model_protein_feed.predict(X_new_processed_array)

print("\n--- Final Prediction Result ---")
print(f"Predicted change in 'Protein Feed' yield: {prediction[0]:.2f} %")
print("-------------------------------")

# --- Verification of Pipeline Health ---
# If the code runs without an error (especially KeyError or ValueError during transform), 
# the pipeline is healthy and ready for deployment.
print("Conclusion: The pipeline successfully transformed new data and generated a prediction.")

‚úÖ Successfully loaded '../models/preprocessor.pkl' and '../models/protein_feed.pkl'.

Starting Validation Flow:
------------------------------
Raw Input: ['CM3-A' 'Yes' 712 '2080' 'Level 1']
------------------------------
1. Preprocessing complete. Output shape: (1, 42)
   (First 5 processed values: [1. 1. 0. 0. 1.])

--- Final Prediction Result ---
Predicted change in 'Protein Feed' yield: -0.10 %
-------------------------------
Conclusion: The pipeline successfully transformed new data and generated a prediction.


