In [1]:
import os
import time
import joblib
import warnings
import numpy as np
import pandas as pd
import featuretools as ft
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import catboost as cb
import optuna

# Suppress warnings for clean output
warnings.filterwarnings("ignore")

# Record start time
start_time = time.time()
print(f"üöÄ Starting pipeline at {time.ctime()}")

# ======= File paths =======
model_path             = "D:\Python\Models\gpu_lightgbm_model.pkl"
scaler_path            = "D:\Python\Models\scaler.pkl"
selected_features_path = "D:\Python\Models\selected_features.pkl"
poly_path              = "D:\Python\Models\poly.pkl"
feature_defs_path      = "D:\Python\Models\Feature_defs.pkl"
X_columns_path         = "D:\Python\Models\X_columns.pkl"
imputer_path           = "D:\Python\Models\imputer.pkl"  # Added imputer path
output_dir = os.path.dirname(model_path)
os.makedirs(output_dir, exist_ok=True)
print(f"üìÅ Ensured output directory exists: {output_dir}")

# ======= Phase 1: Load Dataset =======
data_path     = "D:\Python\Datasets\Handled_Training_Data.csv"
target_column = 'target'

try:
    data = pd.read_csv(data_path)
    print(f"‚úÖ Dataset loaded! Shape: {data.shape}")
except Exception as e:
    print("‚ùå Error loading dataset:", e)
    exit(1)

# Remove rows with missing target
try:
    data = data.loc[data[target_column].notna()]
    print("‚úÖ Removed rows with missing target values.")
except Exception as e:
    print("‚ùå Error removing missing target rows:", e)
    exit(1)

# ======= Phase 2: Train-Test Split =======
try:
    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    print(f"‚úÖ Train-Test split: {X_train.shape} train, {X_test.shape} test")
except Exception as e:
    print("‚ùå Error during train-test split:", e)
    exit(1)

# ======= Phase 3: Encode Categorical Variables =======
print("üß† Phase 3: Encoding categorical variables...")
try:
    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    if cat_cols:
        ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
        X_train_cat = pd.DataFrame(
            ohe.fit_transform(X_train[cat_cols]),
            columns=ohe.get_feature_names_out(cat_cols),
            index=X_train.index
        )
        X_train = pd.concat([X_train.drop(columns=cat_cols), X_train_cat], axis=1)
        joblib.dump((X_train.columns.tolist(), ohe), X_columns_path)
        print(f"‚úÖ Encoded categoricals and saved encoder to {X_columns_path}")

        X_test_cat = pd.DataFrame(
            ohe.transform(X_test[cat_cols]),
            columns=ohe.get_feature_names_out(cat_cols),
            index=X_test.index
        )
        X_test = pd.concat([X_test.drop(columns=cat_cols), X_test_cat], axis=1)
    else:
        print("‚úÖ No categorical columns to encode.")
except Exception as e:
    print("‚ùå Error encoding categoricals:", e)
    exit(1)

# ======= Phase 4: Remove Outliers =======
print("üßπ Phase 4: Removing outliers from training data...")
try:
    num_cols = X_train.select_dtypes(include=np.number).columns
    iso = IsolationForest(random_state=42, contamination=0.1)
    mask = iso.fit_predict(X_train[num_cols]) == 1
    X_train, y_train = X_train[mask], y_train[mask]
    print(f"‚úÖ Outlier removal done. Training shape now: {X_train.shape}")
except Exception as e:
    print("‚ùå Error during outlier removal:", e)
    exit(1)

# ======= Phase 5: Generate Polynomial Features =======
print("üõ†Ô∏è Phase 5: Generating polynomial features on training data...")
try:
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = pd.DataFrame(
        poly.fit_transform(X_train),
        columns=poly.get_feature_names_out(X_train.columns),
        index=X_train.index
    )
    joblib.dump(poly, poly_path)
    print(f"‚úÖ PolynomialFeatures saved to {poly_path}")
except Exception as e:
    print("‚ùå Error generating polynomial features:", e)
    exit(1)

# ======= Phase 6: Feature Engineering with Featuretools =======
print("üß† Phase 6: Featuretools engineering on training data...")
try:
    Xf = X_train.copy()
    Xf['idx'] = Xf.index
    es = ft.EntitySet(id="train_dataset")
    es.add_dataframe(dataframe_name="df", dataframe=Xf, index="idx")
    fm_train, fdefs = ft.dfs(
        entityset=es,
        target_dataframe_name="df",
        max_depth=1,
        verbose=False
    )
    new_feats = [c for c in fm_train.columns if c not in X_train.columns and c != 'idx']
    X_train_ft = fm_train[new_feats].reset_index(drop=True)
    joblib.dump(fdefs, feature_defs_path)
    print(f"‚úÖ {len(new_feats)} Featuretools features saved to {feature_defs_path}")
except Exception as e:
    print("‚ùå Error in Featuretools phase:", e)
    exit(1)

# ======= Phase 7: Combine Features =======
print("üîó Phase 7: Combining polynomial + engineered features...")
try:
    X_train_poly = X_train_poly.reset_index(drop=True)
    X_train_ft   = X_train_ft.reset_index(drop=True)
    X_train_final = pd.concat([X_train_poly, X_train_ft], axis=1)
    print(f"‚úÖ Combined training matrix shape: {X_train_final.shape}")
except Exception as e:
    print("‚ùå Error combining features:", e)
    exit(1)

# ======= Phase 8: Impute Missing Values =======
print("üß© Phase 8: Imputing missing values in training data...")
try:
    if X_train_final.isnull().any().any():
        imputer = SimpleImputer(strategy="mean")
        X_train_final = pd.DataFrame(
            imputer.fit_transform(X_train_final),
            columns=X_train_final.columns
        )
        joblib.dump(imputer, imputer_path)  # Save imputer
        print(f"‚úÖ Missing values imputed and saved to {imputer_path}")
    else:
        print("‚úÖ No missing values detected.")
except Exception as e:
    print("‚ùå Error during imputation:", e)
    exit(1)

# ======= Phase 9: Scale Training Features =======
print("‚öôÔ∏è Phase 9: Scaling training features...")
try:
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_final)
    joblib.dump(scaler, scaler_path)
    print(f"‚úÖ Features scaled and scaler saved to {scaler_path}")
except Exception as e:
    print("‚ùå Error during scaling:", e)
    exit(1)

# ======= Phase 10: Feature Selection =======
print("üîç Phase 10: Feature selection using CatBoost on training data...")
try:
    selector_model = cb.CatBoostRegressor(
        verbose=0, task_type="GPU", devices='0', random_state=42
    )
    selector_model.fit(X_train_scaled, y_train)
    selector = SelectFromModel(selector_model, threshold="median", prefit=True)
    X_train_selected = selector.transform(X_train_scaled)
    selected_features = list(np.array(X_train_final.columns)[selector.get_support()])
    joblib.dump(selected_features, selected_features_path)
    print(f"‚úÖ Feature selection completed and saved to {selected_features_path}")
except Exception as e:
    print("‚ùå Error during feature selection:", e)
    exit(1)

# ======= Phase 11: Hyperparameter Tuning with Optuna =======
print("üéØ Phase 11: Hyperparameter tuning with Optuna...")
try:
    X_tune, X_val, y_tune, y_val = train_test_split(
        X_train_selected, y_train, test_size=0.3, random_state=42
    )

    def objective(trial):
        params = {
            "num_leaves": trial.suggest_int("num_leaves", 20, 150),
            "max_depth": trial.suggest_int("max_depth", 5, 15),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 100, 300),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }
        model = lgb.LGBMRegressor(
            **params, random_state=42,
            device='gpu', gpu_platform_id=0, gpu_device_id=0
        )
        model.fit(
            X_tune, y_tune,
            eval_set=[(X_val, y_val)],
            eval_metric='mae',
            callbacks=[early_stopping(30), log_evaluation(0)]
        )
        return mean_absolute_error(y_val, model.predict(X_val))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=30, timeout=1200)
    best_params = study.best_params
    print("‚úÖ Hyperparameter tuning completed. Best params:", best_params)
except Exception as e:
    print("‚ùå Error during hyperparameter tuning:", e)
    exit(1)

# ======= Phase 12: Final Model Training =======
print("üèÅ Phase 12: Training final LightGBM model...")
try:
    final_model = lgb.LGBMRegressor(
        **best_params, random_state=42,
        device='gpu', gpu_platform_id=0, gpu_device_id=0
    )
    final_model.fit(X_train_selected, y_train)
    print("‚úÖ Final model training completed.")
except Exception as e:
    print("‚ùå Error during final model training:", e)
    exit(1)

# ======= Phase 13: Process Test Data =======
print("üîÑ Phase 13: Processing test data...")
try:
    # (a) Polynomial features
    X_test_poly = pd.DataFrame(
        poly.transform(X_test),
        columns=poly.get_feature_names_out(X_test.columns),
        index=X_test.index
    )

    # (b) Featuretools on test data
    Xf_test = X_test.copy()
    Xf_test['idx'] = Xf_test.index
    es_test = ft.EntitySet(id="test_dataset")
    es_test.add_dataframe(dataframe_name="df", dataframe=Xf_test, index="idx")
    fm_test = ft.calculate_feature_matrix(fdefs, entityset=es_test)
    new_feats_test = [c for c in fm_test.columns if c not in X_test.columns and c != 'idx']
    X_test_ft = fm_test[new_feats_test].reset_index(drop=True)

    # (c) Combine
    X_test_poly = X_test_poly.reset_index(drop=True)
    X_test_final = pd.concat([X_test_poly, X_test_ft], axis=1)

    # (d) Impute using TRAINING imputer
    try:
        imputer = joblib.load(imputer_path)  # Load imputer
        if X_test_final.isnull().any().any():
            X_test_final = pd.DataFrame(
                imputer.transform(X_test_final),  # No fit here
                columns=X_test_final.columns
            )
            print("‚úÖ Test missing values imputed using training imputer.")
    except FileNotFoundError:
        print("‚ùå Imputer not found. Skipping imputation for test data.")

    # (e) Scale
    X_test_scaled = scaler.transform(X_test_final)

    # (f) Select features
    X_test_selected = selector.transform(X_test_scaled)
    print(f"‚úÖ Test data processed. Shape: {X_test_selected.shape}")
except Exception as e:
    print("‚ùå Error processing test data:", e)
    exit(1)

# ======= Phase 14: Evaluate on Test Data =======
print("üìä Phase 14: Evaluating model on test data...")
try:
    y_pred = final_model.predict(X_test_selected)
    mae = mean_absolute_error(y_test, y_pred)
    r2  = r2_score(y_test, y_pred)
    print(f"‚úÖ Test MAE: {mae:.4f}, R2: {r2:.4f}")
except Exception as e:
    print("‚ùå Error during model evaluation:", e)
    exit(1)

# ======= Phase 15: Save Model =======
print("üíæ Phase 15: Saving final model...")
try:
    joblib.dump(final_model, model_path)
    print(f"‚úÖ Final model saved to {model_path}")
except Exception as e:
    print("‚ùå Error saving final model:", e)

print(f"üéâ Pipeline completed in {time.time() - start_time:.2f} seconds.")

  model_path             = "D:\Python\Models\gpu_lightgbm_model.pkl"
  scaler_path            = "D:\Python\Models\scaler.pkl"
  selected_features_path = "D:\Python\Models\selected_features.pkl"
  poly_path              = "D:\Python\Models\poly.pkl"
  feature_defs_path      = "D:\Python\Models\Feature_defs.pkl"
  X_columns_path         = "D:\Python\Models\X_columns.pkl"
  imputer_path           = "D:\Python\Models\imputer.pkl"  # Added imputer path
  data_path     = "D:\Python\Datasets\Handled_Training_Data.csv"
  model_path             = "D:\Python\Models\gpu_lightgbm_model.pkl"
  scaler_path            = "D:\Python\Models\scaler.pkl"
  selected_features_path = "D:\Python\Models\selected_features.pkl"
  poly_path              = "D:\Python\Models\poly.pkl"
  feature_defs_path      = "D:\Python\Models\Feature_defs.pkl"
  X_columns_path         = "D:\Python\Models\X_columns.pkl"
  imputer_path           = "D:\Python\Models\imputer.pkl"  # Added imputer path
  data_path     = "D:\Python\Da

ModuleNotFoundError: No module named 'lightgbm'