In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from scipy import stats
import optuna
import warnings
warnings.filterwarnings('ignore')

def create_super_advanced_features(df):
    """Return a copy of ``df`` extended with engineered sensor features.

    The input frame is not modified. It must contain the four analog sensor
    columns (``MQ7_analog``, ``MQ9_analog``, ``MG811_analog``,
    ``MQ135_analog``) as well as ``Temperature`` and ``Humidity``.

    New columns are appended in a fixed order: weighted blend, row-wise
    statistics, temperature-compensated readings, pairwise ratio/diff/product,
    temperature-humidity terms, log/sqrt/square transforms, and rolling
    mean/std of the weighted blend.
    """
    out = df.copy()
    sensor_cols = ['MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog']

    # 1. Fixed-weight blend of the four sensors (MQ135 and MG811 carry the
    #    largest weights; the two remaining sensors share the rest equally).
    out['Sensor_weighted'] = (
        out['MQ135_analog'] * 0.4
        + out['MG811_analog'] * 0.3
        + out['MQ7_analog'] * 0.15
        + out['MQ9_analog'] * 0.15
    )

    # 2. Row-wise statistics across the four sensor readings.
    readings = out[sensor_cols]
    for stat in ('mean', 'std', 'median', 'max', 'min'):
        out[f'Sensor_{stat}'] = getattr(readings, stat)(axis=1)
    out['Sensor_range'] = out['Sensor_max'] - out['Sensor_min']
    out['Sensor_skew'] = readings.skew(axis=1)
    out['Sensor_kurt'] = readings.kurtosis(axis=1)

    # 3. Linear temperature compensation: 2% per degree away from 25.0.
    #    The factor is the same for every sensor, so compute it once.
    temp_ref = 25.0
    temp_factor = 1 + 0.02 * (out['Temperature'] - temp_ref)
    for name in sensor_cols:
        out[f'{name}_temp_comp'] = out[name] * temp_factor

    # 4. Ratio / difference / product for every unordered sensor pair.
    sensor_pairs = [
        (first, second)
        for idx, first in enumerate(sensor_cols)
        for second in sensor_cols[idx + 1:]
    ]
    for first, second in sensor_pairs:
        prefix = f'{first}_{second}'
        # Small epsilon keeps the ratio finite when the denominator is zero.
        out[f'{prefix}_ratio'] = out[first] / (out[second] + 1e-6)
        out[f'{prefix}_diff'] = out[first] - out[second]
        out[f'{prefix}_product'] = out[first] * out[second]

    # 5. Temperature / humidity interaction terms.
    temperature, humidity = out['Temperature'], out['Humidity']
    out['Temp_Humid_interaction'] = temperature * humidity
    out['Temp_Humid_ratio'] = temperature / (humidity + 1e-6)

    # 6. Element-wise non-linear transforms of each raw sensor reading.
    for name in sensor_cols:
        series = out[name]
        out[f'{name}_log'] = np.log1p(series)
        out[f'{name}_sqrt'] = np.sqrt(series)
        out[f'{name}_squared'] = series ** 2

    # 7. Rolling mean / std of the weighted blend over row order.
    #    min_periods=1 lets the window start at the first row.
    for window in (2, 3, 5):
        rolled = out['Sensor_weighted'].rolling(window, min_periods=1)
        out[f'weighted_ma_{window}'] = rolled.mean()
        out[f'weighted_std_{window}'] = rolled.std()

    return out

def optimize_xgb_params(trial):
    """Sample one XGBoost hyperparameter configuration from ``trial``.

    ``trial`` must provide ``suggest_int(name, low, high)`` and
    ``suggest_float(name, low, high, log=...)``. The returned dict holds the
    nine sampled values plus a fixed ``random_state`` of 42.
    """
    params = {}

    # Tree count and depth.
    params['n_estimators'] = trial.suggest_int('n_estimators', 500, 3000)
    params['max_depth'] = trial.suggest_int('max_depth', 4, 12)

    # Step size, searched on a log scale.
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)

    # Row / column subsampling fractions.
    params['subsample'] = trial.suggest_float('subsample', 0.6, 1.0)
    params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.6, 1.0)

    params['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 7)

    # Split penalty and L1 / L2 regularisation share one log-scale range.
    for penalty in ('gamma', 'reg_alpha', 'reg_lambda'):
        params[penalty] = trial.suggest_float(penalty, 1e-8, 1.0, log=True)

    params['random_state'] = 42
    return params

# --- Data loading -----------------------------------------------------------
# Read the training set, the test set and the submission template from the
# current working directory.
print("Loading data...")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
sample_submission = pd.read_csv("SampleSubmission.csv")

# Drop the identifier columns from both frames; errors='ignore' makes the
# drop a no-op for any column that is not present.
columns_to_drop = ['ID', 'device_name']
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

# --- Feature engineering ----------------------------------------------------
# The same transformation is applied to train and test independently.
print("Applying advanced feature engineering...")
train_data = create_super_advanced_features(train_data)
test_data = create_super_advanced_features(test_data)

# --- Feature matrix / target ------------------------------------------------
# Every training column except the 'CO2' target is a feature, in frame order.
features = [name for name in train_data.columns if name != 'CO2']
X = train_data[features].to_numpy()
y = train_data['CO2'].to_numpy()

# --- Scaling ----------------------------------------------------------------
# The scaler is fitted on the full training matrix (a plain ndarray) and
# kept for transforming the test features later.
print("Applying advanced scaling...")
scaler = RobustScaler()
X_scaled = scaler.fit(X).transform(X)

# Hyperparameter optimization
def objective(trial):
    """Optuna objective: mean 5-fold RMSE of an XGBoost model on the training set.

    Hyperparameters are drawn through ``optimize_xgb_params(trial)``. The
    module-level ``X_scaled`` and ``y`` arrays supply the data. The folds are
    shuffled with a fixed seed, so every trial is scored on the same splits.
    """
    params = optimize_xgb_params(trial)
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    fold_rmses = []
    for fit_rows, holdout_rows in splitter.split(X_scaled):
        X_holdout, y_holdout = X_scaled[holdout_rows], y[holdout_rows]

        regressor = XGBRegressor(**params)
        regressor.fit(
            X_scaled[fit_rows],
            y[fit_rows],
            eval_set=[(X_holdout, y_holdout)],
            verbose=False,
        )

        holdout_pred = regressor.predict(X_holdout)
        fold_rmses.append(np.sqrt(mean_squared_error(y_holdout, holdout_pred)))

    return np.mean(fold_rmses)

# Search for the hyperparameters that minimise the cross-validated RMSE
# returned by `objective`.
print("Optimizing XGBoost parameters...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("\nBest parameters:", best_params)

# Create multiple XGBoost models with different seeds.
# The third member additionally uses 20% more trees and a 20% smaller
# learning rate than the tuned configuration.
models = [
    XGBRegressor(**best_params, random_state=42),
    XGBRegressor(**best_params, random_state=24),
    XGBRegressor(
        **{**best_params,
           'n_estimators': int(best_params['n_estimators'] * 1.2),
           'learning_rate': best_params['learning_rate'] * 0.8
        },
        random_state=100
    )
]

# Cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One column per ensemble member: out-of-fold predictions on the training
# rows, and fold-averaged predictions on the test rows.
oof_preds = np.zeros((len(X_scaled), len(models)))
test_preds = np.zeros((len(test_data), len(models)))

# Scale the test features once. The result does not depend on the fold or the
# model, so recomputing it inside the loops was redundant work. `.values` is
# used because the scaler was fitted on a plain ndarray, matching how the
# training matrix was built.
X_test_scaled = scaler.transform(test_data[features].values)

print("\nStarting cross-validation...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\nFold {fold + 1}/{n_splits}")

    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    for i, model in enumerate(models):
        # NOTE(review): eval_set is supplied but no early stopping is
        # configured in this script, so it looks like it only adds
        # per-iteration evaluation overhead - confirm before removing.
        model.fit(X_train, y_train,
                 eval_set=[(X_val, y_val)],
                 verbose=False)

        oof_preds[val_idx, i] = model.predict(X_val)
        # Each fold contributes 1/n_splits of the final test prediction.
        test_preds[:, i] += model.predict(X_test_scaled) / n_splits

        fold_rmse = np.sqrt(mean_squared_error(y_val, oof_preds[val_idx, i]))
        print(f"Model {i+1} RMSE: {fold_rmse:.4f}")

# Calculate blending weights from OOF performance: each model is weighted by
# the inverse of its overall out-of-fold RMSE, normalised to sum to 1.
model_rmses = [
    np.sqrt(mean_squared_error(y, oof_preds[:, i]))
    for i in range(len(models))
]

weights = 1 / np.array(model_rmses)
weights = weights / weights.sum()

# Final weighted prediction
final_predictions = np.average(test_preds, weights=weights, axis=1)

# Create submission file
print("\nCreating submission file...")
sample_submission['CO2'] = final_predictions
sample_submission.to_csv('submission_xgboost_advanced.csv', index=False)

print("\nDone! Check 'submission_xgboost_advanced.csv' for predictions.")

Loading data...
Applying advanced feature engineering...


[I 2025-02-01 00:08:16,829] A new study created in memory with name: no-name-b3e0206d-7269-4465-a8de-a2d7504decb7


Applying advanced scaling...
Optimizing XGBoost parameters...


[I 2025-02-01 00:13:30,353] Trial 0 finished with value: 5.719788349794347 and parameters: {'n_estimators': 1277, 'max_depth': 9, 'learning_rate': 0.07935283764412371, 'subsample': 0.6337628809351206, 'colsample_bytree': 0.9206939514926019, 'min_child_weight': 6, 'gamma': 6.977049276451665e-06, 'reg_alpha': 0.313121159559002, 'reg_lambda': 0.08426785794280317}. Best is trial 0 with value: 5.719788349794347.
[I 2025-02-01 00:18:51,247] Trial 1 finished with value: 5.584741334000084 and parameters: {'n_estimators': 2184, 'max_depth': 8, 'learning_rate': 0.03068285950841623, 'subsample': 0.7579104610561145, 'colsample_bytree': 0.8778307630983447, 'min_child_weight': 6, 'gamma': 4.058192028824901e-06, 'reg_alpha': 2.0080146877576325e-05, 'reg_lambda': 0.000838638146663865}. Best is trial 1 with value: 5.584741334000084.
[I 2025-02-01 00:24:18,945] Trial 2 finished with value: 5.540298501077812 and parameters: {'n_estimators': 1920, 'max_depth': 9, 'learning_rate': 0.07086060438453877, 'sub

KeyboardInterrupt: 