down one is main

In [None]:
# ==============================================================================
# Step 0: Install Required Libraries
# ==============================================================================
print("Step 0: Installing required libraries...")
!pip install category_encoders -q
!pip install catboost -q
# Pinning to a known, stable version to ensure API consistency
!pip install xgboost==1.7.6 -q

# ==============================================================================
# Step 1: Library Imports and Configuration
# ==============================================================================
print("\nStep 1: Importing libraries and setting up configuration...")
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
import gc
import warnings
warnings.filterwarnings('ignore')

# --- Configuration Block ---
class CFG:
    N_SPLITS = 10
    RANDOM_STATE = 42
    TARGET_COL = 'efficiency'
    ID_COL = 'id'
    
    CAT_COLS = ['string_id', 'error_code', 'installation_type']
    # Define only the *original* numeric columns here for cleaning
    ORIGINAL_NUM_COLS = ['irradiance', 'temperature', 'voltage', 'current', 'humidity', 
                         'wind_speed', 'pressure', 'soiling_ratio', 'module_temperature']

    LGBM_PARAMS = {
        'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 10000, 
        'learning_rate': 0.01, 'num_leaves': 32, 'max_depth': 7, 'seed': RANDOM_STATE, 
        'n_jobs': -1, 'verbose': -1, 'colsample_bytree': 0.7, 'subsample': 0.7, 
        'reg_alpha': 0.1, 'reg_lambda': 0.1
    }
    CAT_PARAMS = {
        'iterations': 10000, 'learning_rate': 0.02, 'depth': 8, 'loss_function': 'RMSE', 
        'eval_metric': 'RMSE', 'random_seed': RANDOM_STATE, 'verbose': 0, 
        'early_stopping_rounds': 200, 'l2_leaf_reg': 3
    }
    XGB_PARAMS = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_estimators': 10000,
        'learning_rate': 0.01, 'max_depth': 6, 'subsample': 0.7,
        'colsample_bytree': 0.7, 'random_state': RANDOM_STATE, 'n_jobs': -1,
        'tree_method': 'hist', 'enable_categorical': True
    }
    ET_PARAMS = {
        'n_estimators': 500, 'max_depth': 15, 'min_samples_leaf': 5,
        'random_state': RANDOM_STATE, 'n_jobs': -1, 'max_features': 0.8
    }

# ==============================================================================
# Step 2: Load Data
# ==============================================================================
try:
    print("\nStep 2: Loading data...")
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    print("Data loaded successfully!")
    n_train = len(train_df)
    y = train_df[CFG.TARGET_COL].copy()
    test_ids = test_df[CFG.ID_COL].copy()
    # Combine feature dataframes, keeping ID for now
    full_df = pd.concat([train_df.drop(CFG.TARGET_COL, axis=1), test_df], axis=0).reset_index(drop=True)
except FileNotFoundError:
    print("FATAL ERROR: Files not found. Please restart the runtime and re-upload your CSV files.")
    exit()

# ==============================================================================
# Step 3: Feature Engineering
# ==============================================================================
def create_smarter_features(df):
    print("\nStep 3: Creating smarter, more robust features...")
    data = df.drop(CFG.ID_COL, axis=1).copy()
    
    # --- Robust Imputation on original numeric columns ---
    for col in CFG.ORIGINAL_NUM_COLS:
        data[col] = pd.to_numeric(data[col], errors='coerce')
        median_val = data[col].median()
        data[col] = data[col].fillna(median_val)

    # --- Basic Interaction & Physics Features ---
    data['power'] = data['voltage'] * data['current']
    data['temp_diff'] = data['module_temperature'] - data['temperature']
    
    # --- Group-By Aggregate Features (CORRECTED FOR NEW PANDAS) ---
    AGG_COLS = ['irradiance', 'temperature', 'power', 'temp_diff']
    for cat_col in CFG.CAT_COLS:
        for agg_col in AGG_COLS:
            # Step 1: Aggregate using a list of functions
            agg_funcs = ['mean', 'std', 'max']
            agg_stats = data.groupby(cat_col)[agg_col].agg(agg_funcs)
            
            # Step 2: Create new column names and assign them
            agg_stats.columns = [f'{cat_col}_{agg_col}_{func}' for func in agg_funcs]
            
            data = data.merge(agg_stats, on=cat_col, how='left')

    # --- Deviation from Group Mean Features ---
    for cat_col in CFG.CAT_COLS:
        for agg_col in AGG_COLS:
            # Make sure the mean column exists before trying to use it
            mean_col_name = f'{cat_col}_{agg_col}_mean'
            if mean_col_name in data.columns:
                data[f'{agg_col}_vs_{cat_col}_mean'] = data[agg_col] - data[mean_col_name]

    # --- Polynomial Features ---
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
    poly_features = ['irradiance', 'temperature', 'power', 'temp_diff']
    poly_df = pd.DataFrame(poly.fit_transform(data[poly_features]), columns=poly.get_feature_names_out(poly_features))
    # Sanitize column names from polynomial features
    poly_df.columns = [f'poly_{c.replace(" ", "_").replace("*", "_x_")}' for c in poly_df.columns]
    
    data = pd.concat([data.reset_index(drop=True), poly_df.reset_index(drop=True)], axis=1)

    data.fillna(-999, inplace=True)
    return data

# ==============================================================================
# Step 4: Final Data Preparation for Models
# ==============================================================================
print("\nStep 4: Preparing data for different models...")

# --- Identify ALL columns that are not explicitly categorical ---
all_cols = full_df.columns.tolist()
final_numeric_cols = [col for col in all_cols if col not in CFG.CAT_COLS]

# --- Create data for CatBoost/XGBoost ---
X_cat = full_df.copy()

# 1. Explicitly convert categorical columns to string type for CatBoost's processing
for col in CFG.CAT_COLS:
    X_cat[col] = X_cat[col].astype(str)

# 2. Forcibly convert all other columns to numeric, coercing errors.
for col in final_numeric_cols:
    X_cat[col] = pd.to_numeric(X_cat[col], errors='coerce').fillna(-999)

# 3. For CatBoost, it's safest to keep the categorical columns as strings/objects.
#    For XGBoost with enable_categorical=True, it prefers the 'category' dtype. Let's make a small tweak.
X_xgb = X_cat.copy()
for col in CFG.CAT_COLS:
    X_xgb[col] = X_xgb[col].astype('category')

# --- Create data for LightGBM/ExtraTrees (requires integer encoding) ---
X_lgb_et = full_df.copy()

# 1. Label encode the known categorical columns
for col in CFG.CAT_COLS:
    le = LabelEncoder()
    X_lgb_et[col] = le.fit_transform(X_lgb_et[col].astype(str))

# 2. Forcibly convert all other columns to numeric
for col in final_numeric_cols:
     X_lgb_et[col] = pd.to_numeric(X_lgb_et[col], errors='coerce').fillna(-999)

# Separate into Training and Test Sets
X_lgb_train, X_lgb_test = X_lgb_et.iloc[:n_train], X_lgb_et.iloc[n_train:]
X_cat_train, X_cat_test = X_cat.iloc[:n_train], X_cat.iloc[n_train:]
X_xgb_train, X_xgb_test = X_xgb.iloc[:n_train], X_xgb.iloc[n_train:]

print(f"Feature shapes: X_lgb_train={X_lgb_train.shape}, X_cat_train={X_cat_train.shape}, X_xgb_train={X_xgb_train.shape}")
print("\nData type check for CatBoost training data:")
print(X_cat_train.dtypes.value_counts())

# ==============================================================================
# Step 5: Level 0 Model Training (LGBM, CatBoost, XGBoost, ExtraTrees)
# ==============================================================================
print("\nStep 5: Training Level 0 Base Models...")
kf = KFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=CFG.RANDOM_STATE)

oof = np.zeros((n_train, 4))
test_preds = np.zeros((len(test_df), 4))

# Model 1: LightGBM
print("\n--- Training LightGBM ---")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_lgb_train, y)):
    model = lgb.LGBMRegressor(**CFG.LGBM_PARAMS)
    model.fit(X_lgb_train.iloc[train_idx], y.iloc[train_idx], 
              eval_set=[(X_lgb_train.iloc[val_idx], y.iloc[val_idx])], 
              callbacks=[lgb.early_stopping(200, verbose=False)])
    oof[val_idx, 0] = model.predict(X_lgb_train.iloc[val_idx])
    test_preds[:, 0] += model.predict(X_lgb_test) / CFG.N_SPLITS
gc.collect()

# Model 2: CatBoost
print("\n--- Training CatBoost ---")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_cat_train, y)):
    model = CatBoostRegressor(**CFG.CAT_PARAMS)
    model.fit(X_cat_train.iloc[train_idx], y.iloc[train_idx], 
              eval_set=(X_cat_train.iloc[val_idx], y.iloc[val_idx]), 
              cat_features=CFG.CAT_COLS, use_best_model=True)
    oof[val_idx, 1] = model.predict(X_cat_train.iloc[val_idx])
    test_preds[:, 1] += model.predict(X_cat_test) / CFG.N_SPLITS
gc.collect()

# Model 3: XGBoost
print("\n--- Training XGBoost ---")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_xgb_train, y)):
    model = xgb.XGBRegressor(**CFG.XGB_PARAMS)
    model.fit(X_xgb_train.iloc[train_idx], y.iloc[train_idx], 
              eval_set=[(X_xgb_train.iloc[val_idx], y.iloc[val_idx])], 
              early_stopping_rounds=200, verbose=False)
    oof[val_idx, 2] = model.predict(X_xgb_train.iloc[val_idx])
    test_preds[:, 2] += model.predict(X_xgb_test) / CFG.N_SPLITS
gc.collect()

# Model 4: ExtraTrees
print("\n--- Training ExtraTrees ---")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_lgb_train, y)):
    model = ExtraTreesRegressor(**CFG.ET_PARAMS)
    model.fit(X_lgb_train.iloc[train_idx], y.iloc[train_idx])
    oof[val_idx, 3] = model.predict(X_lgb_train.iloc[val_idx])
    test_preds[:, 3] += model.predict(X_lgb_test) / CFG.N_SPLITS
gc.collect()

# ==============================================================================
# Step 6: Train Level 1 Meta-Model (Ridge) and Evaluate
# ==============================================================================
print("\nStep 6: Evaluating Base Models and Training Ridge Meta-Model...")
model_names = ['lgb', 'cat', 'xgb', 'et']
for i, name in enumerate(model_names):
    rmse = np.sqrt(mean_squared_error(y, oof[:, i]))
    print(f"{name.upper()} OOF RMSE: {rmse:.6f}")

stack_X_train = pd.DataFrame(oof, columns=model_names)
stack_X_test = pd.DataFrame(test_preds, columns=model_names)

meta_model = Ridge(alpha=1.0, random_state=CFG.RANDOM_STATE)
meta_model.fit(stack_X_train, y)

final_predictions = meta_model.predict(stack_X_test)
meta_oof_preds = meta_model.predict(stack_X_train)
stack_oof_rmse = np.sqrt(mean_squared_error(y, meta_oof_preds))
stack_score = 100 * (1 - stack_oof_rmse)

print(f"\nFinal Stacked OOF RMSE: {stack_oof_rmse:.6f}")
print(f"Final Expected Competition Score: {stack_score:.6f}")
coeffs_str = ", ".join([f"{name.upper()}={coef:.4f}" for name, coef in zip(model_names, meta_model.coef_)])
print(f"Meta-Model (Ridge) Coefficients: {coeffs_str}")

# ==============================================================================
# Step 7: Create Final Submission with Clipping
# ==============================================================================
print("\nStep 7: Creating final submission file with clipping...")
submission_df = pd.DataFrame({CFG.ID_COL: test_ids, CFG.TARGET_COL: final_predictions})
min_efficiency = y.min()
max_efficiency = y.max()
submission_df[CFG.TARGET_COL] = submission_df[CFG.TARGET_COL].clip(min_efficiency, max_efficiency)

submission_df.to_csv('submission_robust_ensemble.csv', index=False)
print("\nRobust ensemble submission file 'submission_robust_ensemble.csv' created successfully!")

Step 0: Installing required libraries...

Step 1: Importing libraries and setting up configuration...

Step 2: Loading data...
Data loaded successfully!

Step 4: Preparing data for different models...
Feature shapes: X_lgb_train=(20000, 16), X_cat_train=(20000, 16), X_xgb_train=(20000, 16)

Data type check for CatBoost training data:
float64    12
object      3
int64       1
Name: count, dtype: int64

Step 5: Training Level 0 Base Models...

--- Training LightGBM ---

--- Training CatBoost ---

--- Training XGBoost ---

--- Training ExtraTrees ---

Step 6: Evaluating Base Models and Training Ridge Meta-Model...
LGB OOF RMSE: 0.103585
CAT OOF RMSE: 0.102761
XGB OOF RMSE: 0.104053
ET OOF RMSE: 0.108968

Final Stacked OOF RMSE: 0.102489
Final Expected Competition Score: 89.751126
Meta-Model (Ridge) Coefficients: LGB=0.5794, CAT=0.4444, XGB=0.0013, ET=-0.0071

Step 7: Creating final submission file with clipping...

Robust ensemble submission file 'submission_robust_ensemble.csv' created s

this is the best of mine 