In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import joblib

# Enable IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Load datasets
# Both CSV files are read from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Rows lacking a target value cannot be used for supervised training, so
# they are removed from the training frame in place.
train_df.dropna(axis='index', subset=['output_electricity_generation'], inplace=True)

# Feature Engineering
def feature_engineering(df):
    """Return a copy of ``df`` extended with the derived model features.

    Three columns are added (or overwritten if they already exist):

    * ``hour_sin`` / ``hour_cos`` -- sine and cosine of ``hour`` mapped onto
      a cycle of period 24, so values 24 apart get identical encodings.
    * ``pressure_temp_interaction`` -- element-wise product of
      ``vapour_pressure`` and ``vapour_temperature``.

    The input frame is not modified; callers must use the returned frame.

    Args:
        df: DataFrame with ``hour``, ``vapour_pressure`` and
            ``vapour_temperature`` columns.

    Returns:
        A new DataFrame holding every original column plus the derived ones.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    out = df.copy()

    # Angle of each hour on a 24-step circle, shared by both encodings.
    hour_angle = 2 * np.pi * out['hour'] / 24
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)

    out['pressure_temp_interaction'] = out['vapour_pressure'] * out['vapour_temperature']
    return out

# Apply the same derived features to the training and the test frame.
train_df, test_df = map(feature_engineering, (train_df, test_df))

# Identify categorical and numerical columns
# Every training column that is not the id, the target, or categorical is
# treated as numerical.
categorical_cols = ['day']
excluded_cols = {'uid', 'output_electricity_generation', *categorical_cols}
numerical_cols = [name for name in train_df.columns if name not in excluded_cols]

# Data Preprocessing
# Numerical branch: fill missing values with IterativeImputer (seeded for
# reproducibility), then standardise with StandardScaler.
num_imputer = IterativeImputer(random_state=42)
num_scaler = StandardScaler()
numerical_pipeline = Pipeline(steps=[
    ('imputer', num_imputer),
    ('scaler', num_scaler),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown='ignore' keeps transform() from
# failing on categories that were not present during fit.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_pipeline = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('onehot', cat_encoder),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Prepare data
# Split the training frame into the target vector and the feature matrix
# (identifier and target columns removed).
target_name = 'output_electricity_generation'
y = train_df[target_name]
X = train_df.drop(columns=['uid', target_name])

# shuffle=False preserves row order: the leading rows are used for training
# and the trailing 20% for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)

# The preprocessor is fitted on the training split only and then merely
# applied to the validation split.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# Train models
# Each base learner is first fitted on its own (the boosted models with early
# stopping against the validation split) so the fitted estimators remain
# available under their usual names.

# XGBoost hyper-parameters are kept in a dict because they are needed twice:
# for the early-stopped standalone model and for the unfitted stack member.
xgb_params = dict(n_estimators=300, max_depth=5, learning_rate=0.1, random_state=42)

# XGBoost >= 2.0 no longer accepts `early_stopping_rounds` in fit(); it is an
# estimator parameter instead (supported since XGBoost 1.6).
xgb_model = xgb.XGBRegressor(early_stopping_rounds=50, **xgb_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

# LightGBM >= 4.0 removed the `verbose` argument of fit(); per-iteration
# evaluation logging is silenced with the log_evaluation callback instead.
lgb_model = lgb.LGBMRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(period=0)],
)

cb_model = cb.CatBoostRegressor(n_estimators=300, learning_rate=0.1, depth=5, verbose=0)
cb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50)

rf_model = RandomForestRegressor(n_estimators=200, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)

# Stacking model
# StackingRegressor clones every base estimator and refits the clones itself
# (using internal cross-validation to train the final estimator), so the
# standalone fits above do not carry over into the stack. The XGBoost member
# is therefore built WITHOUT early stopping: the stack's internal fit passes
# no eval_set, and XGBoost raises when early stopping has no validation data.
stacking_model = StackingRegressor(
    estimators=[
        ('xgb', xgb.XGBRegressor(**xgb_params)),
        ('lgb', lgb_model),
        ('cb', cb_model),
        ('rf', rf_model),
    ],
    final_estimator=Ridge()
)
stacking_model.fit(X_train, y_train)

# Evaluate model
# Score the fitted stack on the held-out validation split.
y_pred = stacking_model.predict(X_valid)

mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error
mae = mean_absolute_error(y_valid, y_pred)
r2 = r2_score(y_valid, y_pred)

# One metric per output line.
print(
    f"Stacking RMSE: {rmse}",
    f"Stacking MAE: {mae}",
    f"Stacking R² Score: {r2}",
    sep="\n",
)

# Save model
# NOTE(review): only the stacking model is persisted here; the fitted
# preprocessor it depends on is not saved in the visible code -- confirm
# whether it should be stored alongside the model.
joblib.dump(stacking_model, 'stacking_model.pkl')

# Predict on test data
# The identifier column is not a feature, so it is removed before the test
# frame goes through the already-fitted preprocessor.
test_features = test_df.drop(columns=['uid'])
test_df_preprocessed = preprocessor.transform(test_features)
final_preds = stacking_model.predict(test_df_preprocessed)

# Save predictions
# Output file pairs each test uid with its predicted target value.
submission_columns = {
    'uid': test_df['uid'],
    'output_electricity_generation': final_preds,
}
output_df = pd.DataFrame(submission_columns)
output_df.to_csv("final_predictions.csv", index=False)
print("Predictions saved to final_predictions.csv")