In [35]:
import pandas as pd
import numpy as np
import joblib # Required for saving the pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- 1. Data Loading and Preparation (Preprocessing outside the pipeline) ---

# NOTE: This code assumes the 'final.xlsx' file is accessible.
data = pd.read_excel('../data/final.xlsx')

# Pre-pipeline cleaning: Replace 'Equilibrium' and ensure float type
data['Time_Slice'] = data['Time_Slice'].replace('Equilibrium', 2150.0).astype(float)

# --- 2. Define Features (X) and Targets (Y) ---

TARGET_COLUMNS = ['wheat', 'rice', 'coarse grains', 'protein feed', 'grains', 'four commo-dities']
INPUT_FEATURES = [
    'BLS Code', 'Scenario', 'Time_Slice', 'CO2 effects', 'CO2 ppm', 'Adaptation'
]
X_raw = data[INPUT_FEATURES] 

Y_raw = data[TARGET_COLUMNS]

# Perform Train-Test Split (CRUCIAL for fitting only on training data)
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_raw, Y_raw, test_size=0.2, random_state=42
)

def predict_single_output(raw_input_data: dict, target_name: str) -> dict:
    """
    Transforms raw input data using the preprocessor and runs prediction 
    using the requested individual model.

    Args:
        raw_input_data: A dictionary containing the raw feature values 
                        (e.g., {'BLS Code': 9, 'Scenario': 'GISS', ...}).
        target_name: The target column to predict (e.g., 'wheat', 'rice').

    Returns:
        A dictionary containing the predicted value.
    """
    
   
        
# --- 3. Pipeline Definition (Your provided structure) ---

NOMINAL_OHE_COLS = ['Scenario', 'BLS Code']
NUMERIC_SCALER_COL = ['CO2 ppm']
# Note: Renamed 'Adaptation' to match the column name in the original data if it was 'Adapt- ation'
ORDINAL_ENCODING_COL = ['Adaptation'] 
BINARY_ENCODING_COL = ['CO2 effects']
TIME_SLICE_COL = ['Time_Slice']

preprocessor = ColumnTransformer(
    transformers=[
        # A. Binary Encoding (for 'CO2 effects')
        ('co2_binary_encode',
         OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
         BINARY_ENCODING_COL),

        # B. Ordinal Encoding (for 'Adapt- ation')
        ('adaptation_ord_encode',
         OrdinalEncoder(categories=[['No', 'Level 1', 'Level 2']], handle_unknown='error'),
         ORDINAL_ENCODING_COL),

        # C. One-Hot Encoding (for 'Scenario', 'BLS Code')
        ('nominal_ohe',
         OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         NOMINAL_OHE_COLS),

        # D. MinMax Scaling (for 'Time_Slice', pre-cleaned)
        ('minmax_scale',
         MinMaxScaler(),
         TIME_SLICE_COL),

        # E. Standard Scaling (for 'CO2 ppm')
        ('co2_scaler',
         StandardScaler(),
         NUMERIC_SCALER_COL)
    ],
    remainder='drop'
)

# The final pipeline structure for the preprocessor
final_preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


# --- 4. Fit the Pipeline and Export to PKL ---

# A. FIT: The pipeline learns all scaling and encoding parameters exclusively from the training data.
final_preprocessing_pipeline.fit(X_train_raw)

# B. EXPORT: Save the single, fitted pipeline object.
pipeline_filename = 'preprocessor.pkl'
joblib.dump(final_preprocessing_pipeline, pipeline_filename)

print("=" * 60)
print(f"SUCCESS: Fitted preprocessor saved to '{pipeline_filename}'.")
print("This file is ready for deployment alongside your six separate model files.")
print("=" * 60)



SUCCESS: Fitted preprocessor saved to 'preprocessor.pkl'.
This file is ready for deployment alongside your six separate model files.


  data['Time_Slice'] = data['Time_Slice'].replace('Equilibrium', 2150.0).astype(float)


In [36]:
# B. EXPORT: Save the single, fitted pipeline object.
pipeline_filename = 'preprocessor.pkl'
joblib.dump(final_preprocessing_pipeline, pipeline_filename)

print("=" * 60)
print(f"SUCCESS: Fitted preprocessor saved to '{pipeline_filename}'.")
print("This file is ready for deployment alongside your six separate model files.")
print("=" * 60)


SUCCESS: Fitted preprocessor saved to 'preprocessor.pkl'.
This file is ready for deployment alongside your six separate model files.


In [37]:
# --- 5. Inspect and Display Processed Data ---

# A. Transform the training data (output is a NumPy array)
X_train_processed = final_preprocessing_pipeline.transform(X_train_raw)

# B. Helper function to clean Scikit-learn prefixes for readability
def clean_pipeline_feature_names(feature_names):
    """Removes the Scikit-learn transformer prefixes (e.g., 'ordinal_encode__')."""
    cleaned_names = []
    for name in feature_names:
        if '__' in name:
            cleaned_names.append(name.split('__', 1)[1])
        else:
            cleaned_names.append(name)
    return cleaned_names

# C. Get feature names and clean them
raw_feature_names = final_preprocessing_pipeline['preprocessor'].get_feature_names_out()
final_feature_names = clean_pipeline_feature_names(raw_feature_names)

# D. Create the final named DataFrame (x_train_final)
x_train_final = pd.DataFrame(X_train_processed, columns=final_feature_names)

print("\n" + "=" * 60)
print("PREPROCESSED X_TRAIN DATA (Features Ready for Model Fitting)")
print("=" * 60)
print(x_train_final.head())
print(f"\nShape of processed features: {x_train_final.shape}")


PREPROCESSED X_TRAIN DATA (Features Ready for Model Fitting)
   CO2 effects_Yes  Adaptation  Scenario_CM2-S550  Scenario_CM2-S750  \
0              0.0         1.0                1.0                0.0   
1              0.0         0.0                0.0                0.0   
2              1.0         0.0                0.0                0.0   
3              1.0         2.0                0.0                0.0   
4              0.0         1.0                1.0                0.0   

   Scenario_CM3-A  Scenario_GFDL  Scenario_GISS  Scenario_UKMO  BLS Code_9  \
0             0.0            0.0            0.0            0.0         0.0   
1             0.0            0.0            0.0            1.0         0.0   
2             0.0            1.0            0.0            0.0         0.0   
3             0.0            0.0            0.0            1.0         0.0   
4             0.0            0.0            0.0            0.0         0.0   

   BLS Code_10  ...  BLS Code_906  B

In [38]:
# 2. Process Test Data (X_test_final)
# IMPORTANT: Use the fitted pipeline's .transform(), NOT .fit_transform()
X_test_processed = final_preprocessing_pipeline.transform(X_test_raw)
x_test = pd.DataFrame(X_test_processed, columns=final_feature_names)

In [39]:
x_test

Unnamed: 0,CO2 effects_Yes,Adaptation,Scenario_CM2-S550,Scenario_CM2-S750,Scenario_CM3-A,Scenario_GFDL,Scenario_GISS,Scenario_UKMO,BLS Code_9,BLS Code_10,...,BLS Code_906,BLS Code_907,BLS Code_908,BLS Code_909,BLS Code_910,BLS Code_911,BLS Code_912,BLS Code_913,Time_Slice,CO2 ppm
0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230769,1.092239
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.461538,-1.062058
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.924486
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.230769,0.068065
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.924486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,-1.062058
214,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,-1.062058
215,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.924486
216,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.461538,-1.062058


In [40]:
joblib.dump(x_train_final, 'x_train.pkl')
joblib.dump(Y_train, 'y_train.pkl')

['y_train.pkl']

In [41]:
joblib.dump(x_test, 'x_test.pkl')
joblib.dump(Y_test, 'y_test.pkl')

['y_test.pkl']

In [46]:
import pandas as pd
import numpy as np
import joblib # Required for saving/loading assets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

# --- 1. DATA LOADING AND PREPARATION (FOR FITTING) ---

# NOTE: This section runs once to prepare data and fit the preprocessor.
try:
    data = pd.read_excel('../data/final.xlsx')
except FileNotFoundError:
    print("WARNING: 'final.xlsx' not found. Skipping fitting phase. Assuming 'preprocessor.pkl' exists.")
    data = pd.DataFrame() # Create empty DataFrame if file is missing, allowing the rest to define assets

# Pre-pipeline cleaning: Replace 'Equilibrium' and ensure float type
if not data.empty:
    data['Time_Slice'] = data['Time_Slice'].replace('Equilibrium', 2150.0).astype(float)

# --- 2. DEFINE FEATURES AND TARGETS ---

TARGET_COLUMNS = ['wheat', 'rice', 'coarse grains', 'protein feed', 'grains', 'four commo-dities']
# Using the correct original feature names:
INPUT_FEATURES = [
    'BLS Code', 'Scenario', 'Time_Slice', 'CO2 effects', 'CO2 ppm', 'Adaptation'
]

# --- 3. PIPELINE DEFINITION ---

NOMINAL_OHE_COLS = ['Scenario', 'BLS Code']
NUMERIC_SCALER_COL = ['CO2 ppm']
# CRITICAL FIX: Use the original column name 'Adapt- ation'
ORDINAL_ENCODING_COL = ['Adaptation'] 
BINARY_ENCODING_COL = ['CO2 effects']
TIME_SLICE_COL = ['Time_Slice']
CATEGORICAL_COLS = ['Scenario', 'CO2 effects', 'Adapt- ation'] 

preprocessor = ColumnTransformer(
    transformers=[
        # A. Binary Encoding (for 'CO2 effects')
        ('co2_binary_encode', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), BINARY_ENCODING_COL),

        # B. Ordinal Encoding (for 'Adapt- ation')
        ('adaptation_ord_encode', OrdinalEncoder(categories=[['No', 'Level 1', 'Level 2']], handle_unknown='error'), ORDINAL_ENCODING_COL),

        # C. One-Hot Encoding (for 'Scenario', 'BLS Code')
        ('nominal_ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), NOMINAL_OHE_COLS),

        # D. MinMax Scaling (for 'Time_Slice')
        ('minmax_scale', MinMaxScaler(), TIME_SLICE_COL),

        # E. Standard Scaling (for 'CO2 ppm')
        ('co2_scaler', StandardScaler(), NUMERIC_SCALER_COL)
    ],
    remainder='drop'
)

# The final pipeline structure for the preprocessor
final_preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


# --- 4. FIT AND EXPORT PREPROCESSOR (IF DATA IS AVAILABLE) ---

if not data.empty:
    X_raw = data[INPUT_FEATURES] 
    Y_raw = data[TARGET_COLUMNS]
    X_train_raw, _, _, _ = train_test_split(X_raw, Y_raw, test_size=0.2, random_state=42)
    
    final_preprocessing_pipeline.fit(X_train_raw)
    pipeline_filename = 'preprocessor.pkl'
    joblib.dump(final_preprocessing_pipeline, pipeline_filename)
    
    print("=" * 60)
    print(f"SUCCESS: Fitted preprocessor saved to '{pipeline_filename}'.")
    print("=" * 60)

# --- 5. DEPLOYMENT ASSET LOADING (RUNS ONCE AT SERVER STARTUP) ---

# Load the single fitted preprocessor pipeline
try:
    PREPROCESSOR = joblib.load('preprocessor.pkl')
except FileNotFoundError:
    print("FATAL: Cannot load 'preprocessor.pkl'. Check file path.")
    PREPROCESSOR = None

# Load all individual model files into a dictionary
MODELS = {}
for col in TARGET_COLUMNS:
    # Model files are saved using underscores (e.g., model_coarse_grains.pkl)
    model_key = col.replace(' ', '_') 
    model_filename = f'model_{model_key}.pkl'
    try:
        MODELS[col.replace(' ', '_')] = joblib.load(model_filename)
    except FileNotFoundError:
        # This is okay if only one model is needed, but warns if a file is missing
        pass 
        # print(f"Warning: Model file '{model_filename}' not found.") 


def predict_single_output(raw_input_data: dict, target_name: str) -> dict:
    """
    Transforms raw input data using the preprocessor and runs prediction 
    using the requested individual model.

    Args:
        raw_input_data: A dictionary containing the raw feature values.
        target_name: The target column to predict (e.g., 'wheat').

    Returns:
        A dictionary containing the predicted value.
    """
    
    # 1. Input Validation and Preparation
    target_key = target_name.replace(' ', '_')
    if target_key not in MODELS:
        return {"error": f"Invalid target key '{target_name}'. Must be one of: {list(MODELS.keys())}"}
    if PREPROCESSOR is None:
        return {"error": "Preprocessing asset is not loaded. Server startup failed."}
    
    # Convert the single raw input dictionary into a DataFrame
    try:
        X_new_raw = pd.DataFrame([raw_input_data])
    except Exception as e:
        return {"error": f"Failed to parse raw input data into DataFrame: {e}"}

    # --- FIX: Enforce correct data types for pipeline compatibility ---
    
    # Enforce float type for scaling components (Time_Slice and CO2 ppm)
    for col in TIME_SLICE_COL + NUMERIC_SCALER_COL:
        if col in X_new_raw.columns:
            try:
                # CRITICAL: Convert to float first!
                X_new_raw[col] = X_new_raw[col].astype(float)
            except Exception:
                return {"error": f"Type Error: Failed to convert numerical column '{col}' to float."}

    # Enforce string/object type for categorical/encoding components
    for col in CATEGORICAL_COLS + ORDINAL_ENCODING_COL:
        if col in X_new_raw.columns:
            # CRITICAL: Convert to string to ensure consistency for encoders
            X_new_raw[col] = X_new_raw[col].astype(str)
        
    # --- END FIX ---

    # 2. Preprocessing
    # The PREPROCESSOR handles all scaling, encoding, and ordering learned during fit.
    try:
        # Only pass the 6 defined INPUT_FEATURES to the transformer
        X_new_processed = PREPROCESSOR.transform(X_new_raw[INPUT_FEATURES])
    except Exception as e:
        # Catch errors during transformation (e.g., unknown category not handled)
        return {"error": f"An error occurred during transformation in the pipeline. Detail: {e}"}
    
    # --- DEBUG LINE ADDED ---
    # Print the first 10 processed features for comparison with the GOLD STANDARD
    print(f"\n[DEBUG] Web Processed Features (First 10): {X_new_processed[0][:10]}")
    # -----------------------
    
    # 3. Model Selection and Prediction
    selected_model = MODELS[target_key]
    
    # Run prediction on the processed features
    raw_prediction = selected_model.predict(X_new_processed)
    
    # 4. Format Output
    predicted_value = raw_prediction[0] # Get the single prediction value
    
    return {
        "target": target_name,
        "predicted_yield_change_percent": round(float(predicted_value), 4)
    }

SUCCESS: Fitted preprocessor saved to 'preprocessor.pkl'.


  data['Time_Slice'] = data['Time_Slice'].replace('Equilibrium', 2150.0).astype(float)
