In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

# Advanced Feature Engineering
def create_advanced_features(df):
    # Original features
    df['MQ7_MQ9_ratio'] = df['MQ7_analog'] / df['MQ9_analog']
    df['MQ7_MG811_ratio'] = df['MQ7_analog'] / df['MG811_analog']
    df['MQ9_MG811_ratio'] = df['MQ9_analog'] / df['MG811_analog']
    
    # Statistical features
    sensor_cols = ['MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog']
    df['Sensor_mean'] = df[sensor_cols].mean(axis=1)
    df['Sensor_std'] = df[sensor_cols].std(axis=1)
    df['Sensor_max'] = df[sensor_cols].max(axis=1)
    df['Sensor_min'] = df[sensor_cols].min(axis=1)
    df['Sensor_range'] = df['Sensor_max'] - df['Sensor_min']
    
    # Interaction features
    df['Temp_Humid_interaction'] = df['Temperature'] * df['Humidity']
    df['MQ135_Temp_ratio'] = df['MQ135_analog'] / df['Temperature']
    df['MQ135_Humid_ratio'] = df['MQ135_analog'] / df['Humidity']
    
    # Polynomial features
    df['Temperature_sq'] = df['Temperature'] ** 2
    df['Humidity_sq'] = df['Humidity'] ** 2
    
    # Log transforms
    df['MG811_log'] = np.log1p(df['MG811_analog'])
    df['MQ135_log'] = np.log1p(df['MQ135_analog'])
    
    # Aggregated features
    df['Sensor_geometric_mean'] = np.exp(np.log1p(df[sensor_cols]).mean(axis=1))
    
    return df

# Load and prepare data
print("Loading data...")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
sample_submission = pd.read_csv("SampleSubmission.csv")

# Apply feature engineering
print("Applying feature engineering...")
train_data = create_advanced_features(train_data)
test_data = create_advanced_features(test_data)

# Define features
features = ['Temperature', 'Humidity', 'MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog',
           'MQ7_MQ9_ratio', 'MQ7_MG811_ratio', 'MQ9_MG811_ratio', 'Sensor_mean', 'Sensor_std',
           'Sensor_max', 'Sensor_min', 'Sensor_range', 'Temp_Humid_interaction',
           'MQ135_Temp_ratio', 'MQ135_Humid_ratio', 'Temperature_sq', 'Humidity_sq',
           'MG811_log', 'MQ135_log', 'Sensor_geometric_mean']

# Prepare data
X = train_data[features]
y = train_data['CO2']

# Scale features
print("Scaling features...")
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)

# Define models with optimized parameters
xgb_params = {
    'n_estimators': 1000,
    'max_depth': 6,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'random_state': 42
}

lgb_params = {
    'n_estimators': 1000,
    'max_depth': 6,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'random_state': 42
}

# Initialize models
xgb_model = xgb.XGBRegressor(**xgb_params)
lgb_model = LGBMRegressor(**lgb_params)

# K-fold cross-validation
print("Performing cross-validation...")
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Arrays to store predictions
oof_xgb = np.zeros(len(X_scaled))
oof_lgb = np.zeros(len(X_scaled))
test_xgb = np.zeros(len(test_data))
test_lgb = np.zeros(len(test_data))

# Cross-validation loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\nFold {fold + 1}/{n_splits}")
    
    X_train, X_val = X_scaled.iloc[train_idx], X_scaled.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Train XGBoost
    xgb_model.fit(X_train, y_train)
    oof_xgb[val_idx] = xgb_model.predict(X_val)
    test_xgb += xgb_model.predict(scaler.transform(test_data[features])) / n_splits
    
    # Train LightGBM
    lgb_model.fit(X_train, y_train)
    oof_lgb[val_idx] = lgb_model.predict(X_val)
    test_lgb += lgb_model.predict(scaler.transform(test_data[features])) / n_splits
    
    # Print fold scores
    print(f"XGBoost RMSE: {np.sqrt(mean_squared_error(y_val, oof_xgb[val_idx])):.4f}")
    print(f"LightGBM RMSE: {np.sqrt(mean_squared_error(y_val, oof_lgb[val_idx])):.4f}")

# Calculate optimal weights based on OOF predictions
from sklearn.linear_model import Ridge
meta_X = np.column_stack([oof_xgb, oof_lgb])
meta_model = Ridge(alpha=1.0)
meta_model.fit(meta_X, y)

weights = meta_model.coef_
weights = weights / np.sum(weights)  # Normalize weights
print(f"\nOptimal weights: XGBoost={weights[0]:.3f}, LightGBM={weights[1]:.3f}")

# Generate final predictions
test_predictions = weights[0] * test_xgb + weights[1] * test_lgb

# Create submission file
print("\nCreating submission file...")
sample_submission['CO2'] = test_predictions
sample_submission.to_csv('submission_weighted_ensemble.csv', index=False)

# Feature importance analysis
print("\nAnalyzing feature importance...")
feature_importance = pd.DataFrame({
    'feature': features,
    'xgb_importance': xgb_model.feature_importances_,
    'lgb_importance': lgb_model.feature_importances_
})

# Calculate weighted importance
feature_importance['weighted_importance'] = (
    weights[0] * feature_importance['xgb_importance'] + 
    weights[1] * feature_importance['lgb_importance']
)
feature_importance = feature_importance.sort_values('weighted_importance', ascending=False)

# Print top features
print("\nTop 10 Most Important Features:")
print(feature_importance[['feature', 'weighted_importance']].head(10))

print("\nDone! Check 'submission_weighted_ensemble.csv' for predictions.")

Loading data...
Applying feature engineering...
Scaling features...
Performing cross-validation...

Fold 1/5


  File "c:\Users\USER\Documents\Contest\Zindi-Air-Prediction\env\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\USER\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1440, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5610
[LightGBM] [Info] Number of data points in the train set: 5845, number of used features: 22
[LightGBM] [Info] Start training from score 611.776359
XGBoost RMSE: 5.5809
LightGBM RMSE: 6.0737

Fold 2/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5610
[LightGBM] [Info] Number of data points in the train set: 5845, number of used features: 22
[LightGBM] [Info] Start training from score 611.590669
XGBoost RMSE: 5.6990
LightGBM RMSE: 6.1968

Fold 3/5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5610
[LightGBM] [