In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from packaging import version
import optuna
import joblib


In [None]:


#  Load Data --------
# Read the training and test CSVs from the current working directory,
# training file first.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


In [None]:

# Pull the regression target out first, then keep every other column as a
# predictor. The test frame is copied so downstream transforms work on a copy
# rather than on `test_df` itself.
y_train = train_df['efficiency']
X_train = train_df.drop(['efficiency'], axis=1)
X_test = test_df.copy()


In [None]:

# - Feature Engineering with Robust Handling --------
class SolarFeatureEngineer(BaseEstimator, TransformerMixin):
    """Median-impute the raw numeric columns and append derived features.

    ``fit`` learns per-column medians for the columns listed in
    ``numeric_cols_`` (after coercing them to numeric). ``transform`` applies
    that imputation and adds six engineered columns: ``power_output``,
    ``irradiance_temp_ratio``, ``efficiency_loss_due_to_soiling``,
    ``temp_diff``, ``age_per_maintenance`` and ``is_soiled``.

    Attributes:
        numeric_cols_: names of the raw columns that are coerced and imputed.
        imputer_: the ``SimpleImputer(strategy='median')`` fitted by ``fit``.
    """

    def __init__(self):
        # The constructor has to be the dunder ``__init__``; a single-underscore
        # ``_init_`` is an ordinary method that Python never calls, which would
        # leave ``numeric_cols_`` and ``imputer_`` undefined when ``fit`` runs.
        self.numeric_cols_ = ['voltage', 'current', 'irradiance', 'temperature', 'wind_speed',
                              'soiling_ratio', 'module_temperature', 'panel_age', 'maintenance_count',
                              'humidity', 'pressure', 'cloud_coverage']
        self.imputer_ = SimpleImputer(strategy='median')

    def _coerce_numeric(self, X):
        """Return a new frame of the numeric columns with unparseable values as NaN."""
        return X[self.numeric_cols_].apply(pd.to_numeric, errors='coerce')

    def fit(self, X, y=None):
        """Fit the median imputer on the coerced numeric columns.

        Args:
            X: DataFrame containing every column in ``numeric_cols_``.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        X_numeric = self._coerce_numeric(X)

        # Report each column that will need imputation, in column order.
        for col in X_numeric.columns[X_numeric.isna().any()]:
            print(f"Warning: Non-numeric values or NaNs found in {col}, converted to NaN for imputation")

        self.imputer_.fit(X_numeric)
        return self

    def transform(self, X):
        """Impute the numeric columns and append the engineered features.

        Args:
            X: DataFrame containing every column in ``numeric_cols_``.

        Returns:
            A copy of ``X`` with imputed numeric columns and six extra columns.
        """
        X = X.copy()

        X[self.numeric_cols_] = self.imputer_.transform(self._coerce_numeric(X))

        # Optimized feature creation
        X['power_output'] = X['voltage'] * X['current']
        # The 1e-6 offset avoids dividing by an exact zero temperature.
        X['irradiance_temp_ratio'] = X['irradiance'] / (X['temperature'] + 1e-6)
        X['efficiency_loss_due_to_soiling'] = X['soiling_ratio'] * X['irradiance']
        X['temp_diff'] = X['module_temperature'] - X['temperature']
        # +1 keeps the ratio finite when maintenance_count is 0.
        X['age_per_maintenance'] = X['panel_age'] / (X['maintenance_count'] + 1)
        X['is_soiled'] = (X['soiling_ratio'] > 0.5).astype(int)

        return X


In [None]:

# -- Preprocessing Pipeline --------
# Numeric branch input: the raw numeric columns plus the six features that
# SolarFeatureEngineer.transform appends.
numeric_cols = ['voltage', 'current', 'irradiance', 'temperature', 'wind_speed', 
                'soiling_ratio', 'module_temperature', 'panel_age', 'maintenance_count',
                'humidity', 'pressure', 'cloud_coverage', 'power_output', 
                'irradiance_temp_ratio', 'efficiency_loss_due_to_soiling', 
                'temp_diff', 'age_per_maintenance', 'is_soiled']
categorical_cols = ['string_id', 'error_code', 'installation_type']

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2, so choose the keyword that matches the installed release.
# The installed version is exposed as the dunder attribute `sklearn.__version__`;
# there is no `sklearn._version_`.
if version.parse(sklearn.__version__) >= version.parse("1.2"):
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
else:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=True)

# Numeric columns: median imputation followed by robust scaling.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen at fit time are ignored at transform time.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ohe)
])

# Columns not listed in either branch are dropped (ColumnTransformer default).
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, numeric_cols),
    ('cat', cat_pipeline, categorical_cols)
], sparse_threshold=0.8)


In [None]:

# Apply preprocessing
# Fit the feature engineer on the training predictors, then run both splits
# through it.
fe = SolarFeatureEngineer()
fe.fit(X_train)
X_train_fe = fe.transform(X_train)
X_test_fe = fe.transform(X_test)

# These columns feed the polynomial expansion later, so they must be NaN-free.
top_feats = ['irradiance', 'temperature', 'soiling_ratio', 'module_temperature']
has_missing_top_feats = X_train_fe[top_feats].isna().to_numpy().any()
if has_missing_top_feats:
    raise ValueError("NaN values found in top_feats after feature engineering")

# Fit the column transformer on the training split and reuse it for the test split.
full_pipeline.fit(X_train_fe)
X_train_processed = full_pipeline.transform(X_train_fe)
X_test_processed = full_pipeline.transform(X_test_fe)

# Densify when the transformer produced a sparse matrix, so the result can be
# stacked with dense arrays afterwards.
train_is_sparse = hasattr(X_train_processed, 'toarray')
if train_is_sparse:
    X_train_processed, X_test_processed = (
        X_train_processed.toarray(),
        X_test_processed.toarray(),
    )


In [None]:

# - Polynomial Features --------
# Expand the `top_feats` columns into themselves plus their pairwise products
# (no squares, no bias column), fitting the expansion on the training split only.
poly = PolynomialFeatures(include_bias=False, interaction_only=True, degree=2)
train_top = X_train_fe[top_feats]
test_top = X_test_fe[top_feats]
X_poly_train = poly.fit_transform(train_top)
X_poly_test = poly.transform(test_top)

# Append the expansion to the right of the preprocessed matrix, column-wise.
X_train_combined = np.concatenate((X_train_processed, X_poly_train), axis=1)
X_test_combined = np.concatenate((X_test_processed, X_poly_test), axis=1)


In [None]:

# --Efficient Feature Selection --------
# Cap the selector at 50 features, or at the column count when fewer exist.
n_features = X_train_combined.shape[1]
max_features = min(50, n_features)  # never ask for more features than exist
if max_features >= 10:
    # Rank features with a LightGBM model and keep at most `max_features`.
    # NOTE(review): SelectFromModel also applies its default importance
    # threshold, so fewer than `max_features` columns may survive; pass
    # threshold=-np.inf if exactly the top-k is intended. TODO confirm.
    lgb_selector = LGBMRegressor(n_estimators=100, random_state=42, device="gpu")
    selector = SelectFromModel(lgb_selector, max_features=max_features)
    X_train_selected = selector.fit_transform(X_train_combined, y_train)
    X_test_selected = selector.transform(X_test_combined)
else:
    # Too few columns to be worth selecting from: pass everything through.
    print(f"Warning: Only {n_features} features available, using all features")
    X_train_selected, X_test_selected = X_train_combined, X_test_combined


In [None]:

#  Optuna Hyperparameter Tuning --------
def objective(trial):
    """Score one Optuna trial by the hold-out RMSE of the stacked ensemble.

    The model is fitted on 80% of the selected training rows and scored on the
    remaining 20%. Scoring on the rows used for fitting would reward
    overfitting and push the search toward the largest, fastest-learning
    boosters regardless of how well they generalise.

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        RMSE on the held-out rows as a float (lower is better).
    """
    params = {
        'xgb_n_estimators': trial.suggest_int('xgb_n_estimators', 100, 500),  
        'xgb_learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.3, log=True),
        'lgb_n_estimators': trial.suggest_int('lgb_n_estimators', 100, 500),
        'lgb_learning_rate': trial.suggest_float('lgb_learning_rate', 0.01, 0.3, log=True),
        'cat_iterations': trial.suggest_int('cat_iterations', 100, 500),
        'cat_learning_rate': trial.suggest_float('cat_learning_rate', 0.01, 0.3, log=True),
        'ridge_alpha': trial.suggest_float('ridge_alpha', 0.1, 10.0)
    }
    
    model = StackingRegressor(
        estimators=[
            ('xgb', XGBRegressor(n_estimators=params['xgb_n_estimators'], 
                                 learning_rate=params['xgb_learning_rate'], 
                                 random_state=42, device="cuda")),
            ('lgb', LGBMRegressor(n_estimators=params['lgb_n_estimators'], 
                                  learning_rate=params['lgb_learning_rate'], 
                                  random_state=42, device="gpu")),
            ('cat', CatBoostRegressor(iterations=params['cat_iterations'], 
                                      learning_rate=params['cat_learning_rate'], 
                                      task_type="GPU", verbose=0, random_state=42))
        ],
        final_estimator=Ridge(alpha=params['ridge_alpha']),
        passthrough=True,
        n_jobs=1 
    )
    
    # Fixed random_state: every trial is fitted and scored on the same rows,
    # so trial scores are directly comparable.
    # NOTE(review): the feature selector upstream was fitted on all training
    # rows, so the hold-out score is still slightly optimistic.
    X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train_selected, y_train, test_size=0.2, random_state=42
    )

    model.fit(X_fit, y_fit)
    preds = model.predict(X_valid)
    rmse = float(np.sqrt(np.mean((np.asarray(y_valid) - preds) ** 2)))
    return rmse

# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All.
# NOTE(review): GPU training itself may still be non-deterministic.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)  


In [None]:

# Final Model --------
# Rebuild the stacked ensemble with the best hyperparameters from the Optuna
# study and fit it on the selected training features.
best_params = study.best_params

xgb_best = XGBRegressor(
    n_estimators=best_params['xgb_n_estimators'],
    learning_rate=best_params['xgb_learning_rate'],
    random_state=42,
    device="cuda",
)
lgb_best = LGBMRegressor(
    n_estimators=best_params['lgb_n_estimators'],
    learning_rate=best_params['lgb_learning_rate'],
    random_state=42,
    device="gpu",
)
cat_best = CatBoostRegressor(
    iterations=best_params['cat_iterations'],
    learning_rate=best_params['cat_learning_rate'],
    task_type="GPU",
    verbose=0,
    random_state=42,
)

# passthrough=True hands the original features to the Ridge meta-learner
# alongside the base learners' predictions.
final_model = StackingRegressor(
    estimators=[('xgb', xgb_best), ('lgb', lgb_best), ('cat', cat_best)],
    final_estimator=Ridge(alpha=best_params['ridge_alpha']),
    passthrough=True,
    n_jobs=1,
)
final_model.fit(X_train_selected, y_train)


In [None]:

#  Predict and Save 
# Score the selected test features and pair each prediction with the `id`
# column of the test frame.
y_pred_test = final_model.predict(X_test_selected)
submission_columns = {'id': test_df['id'], 'efficiency': y_pred_test}
output_df = pd.DataFrame(submission_columns)


In [None]:

# Validate submission format
# The submission must contain exactly 12000 rows and exactly the columns
# id, efficiency in that order; anything else aborts before the file is written.
row_count_ok = len(output_df) == 12000
if not row_count_ok:
    raise ValueError(f"Submission file has {len(output_df)} rows, expected 12000")

if list(output_df.columns) != ['id', 'efficiency']:
    raise ValueError("Submission file must have columns ['id', 'efficiency']")

# Write without the index so the CSV has only the two submission columns.
output_df.to_csv("gpu_optimized_predictions_v3.csv", index=False)


In [None]:

# Save the model
# Serialise the fitted stacking ensemble with joblib, then print the
# completion message.
joblib.dump(value=final_model, filename='gpu_solar_efficiency_model_v3.pkl')
print("GPU-optimized pipeline complete. Predictions saved to gpu_optimized_predictions_v3.csv")