In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.base import clone
from scipy import stats
import optuna
import warnings
warnings.filterwarnings('ignore')

# Advanced Feature Engineering
def create_super_advanced_features(df):
    """Advanced feature engineering with sophisticated transformations"""
    df = df.copy()
    sensor_cols = ['MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog']
    
    # 1. Advanced Statistical Features
    df['Sensor_mean'] = df[sensor_cols].mean(axis=1)
    df['Sensor_std'] = df[sensor_cols].std(axis=1)
    df['Sensor_median'] = df[sensor_cols].median(axis=1)
    df['Sensor_max'] = df[sensor_cols].max(axis=1)
    df['Sensor_min'] = df[sensor_cols].min(axis=1)
    df['Sensor_range'] = df['Sensor_max'] - df['Sensor_min']
    df['Sensor_skew'] = df[sensor_cols].skew(axis=1)
    df['Sensor_kurt'] = df[sensor_cols].kurtosis(axis=1)
    
    # 2. Advanced Ratio Features
    for i, col1 in enumerate(sensor_cols):
        for col2 in sensor_cols[i+1:]:
            df[f'{col1}_{col2}_ratio'] = df[col1] / (df[col2] + 1e-6)
            df[f'{col1}_{col2}_diff'] = df[col1] - df[col2]
            df[f'{col1}_{col2}_product'] = df[col1] * df[col2]
            df[f'{col1}_{col2}_sum'] = df[col1] + df[col2]
            df[f'{col1}_{col2}_mean'] = (df[col1] + df[col2]) / 2
    
    # 3. Environmental Interaction Features
    df['Temp_Humid_interaction'] = df['Temperature'] * df['Humidity']
    df['Temp_Humid_ratio'] = df['Temperature'] / (df['Humidity'] + 1e-6)
    df['Temp_Humid_sum'] = df['Temperature'] + df['Humidity']
    df['Temp_Humid_diff'] = df['Temperature'] - df['Humidity']
    
    # 4. Polynomial Features
    degrees = [2, 3, 0.5]
    for deg in degrees:
        df[f'Temperature_power_{deg}'] = df['Temperature'] ** deg
        df[f'Humidity_power_{deg}'] = df['Humidity'] ** deg
        df[f'Sensor_mean_power_{deg}'] = df['Sensor_mean'] ** deg
    
    # 5. Advanced Transformations
    for col in sensor_cols:
        df[f'{col}_log'] = np.log1p(df[col])
        df[f'{col}_sqrt'] = np.sqrt(df[col])
        df[f'{col}_squared'] = df[col] ** 2
        df[f'{col}_cubed'] = df[col] ** 3
    
    # 6. Advanced Aggregations
    df['Sensor_geometric_mean'] = stats.gmean(df[sensor_cols] + 1, axis=1)
    df['Sensor_harmonic_mean'] = stats.hmean(df[sensor_cols] + 1, axis=1)
    
    # 7. Advanced Interactions
    df['MQ7_MQ135_temp_ratio'] = df['MQ7_analog'] * df['Temperature'] / (df['MQ135_analog'] + 1e-6)
    df['MQ9_MG811_humid_ratio'] = df['MQ9_analog'] * df['Humidity'] / (df['MG811_analog'] + 1e-6)
    
    # 8. Rolling Features (if temporal nature exists)
    window_sizes = [2, 3, 4]
    for window in window_sizes:
        df[f'rolling_mean_{window}'] = df['Sensor_mean'].rolling(window, min_periods=1).mean()
        df[f'rolling_std_{window}'] = df['Sensor_mean'].rolling(window, min_periods=1).std()
    
    return df

# Hyperparameter Optimization with Optuna
def optimize_hyperparameters(X, y):
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),  # Only valid values
            'random_state': 42
        }
        model = RandomForestRegressor(**params)
        scores = []
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            scores.append(np.sqrt(mean_squared_error(y_val, y_pred)))
        return np.mean(scores)
    
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=25)
    return study.best_params

# Load and prepare data
print("Loading data...")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
sample_submission = pd.read_csv("SampleSubmission.csv")

# Remove unnecessary columns
columns_to_drop = ['ID', 'device_name']
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

# Apply super advanced feature engineering
print("Applying advanced feature engineering...")
train_data = create_super_advanced_features(train_data)
test_data = create_super_advanced_features(test_data)

# Select features (excluding target)
features = [col for col in train_data.columns if col != 'CO2']

# Prepare data
X = train_data[features].values
y = train_data['CO2'].values

# Advanced scaling
print("Applying advanced scaling...")
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Hyperparameter Optimization
print("Optimizing hyperparameters...")
best_params = optimize_hyperparameters(X_scaled, y)

# Initialize models
models = {
    'RandomForest': RandomForestRegressor(**best_params),
    'XGBoost': XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=1000, learning_rate=0.05, random_state=42),
    'MLP': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
}

# Initialize K-fold
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Arrays to store predictions
oof_predictions = {model_name: np.zeros(len(X_scaled)) for model_name in models}
test_predictions = {model_name: np.zeros(len(test_data)) for model_name in models}

# Cross-validation loop
print("Starting cross-validation...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled, pd.cut(y, bins=10, labels=False))):
    print(f"\nFold {fold + 1}/{n_splits}")
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        oof_predictions[model_name][val_idx] = model.predict(X_val)
        test_predictions[model_name] += model.predict(scaler.transform(test_data[features])) / n_splits
        
        # Print fold scores
        print(f"{model_name} RMSE: {np.sqrt(mean_squared_error(y_val, oof_predictions[model_name][val_idx])):.4f}")

# Ensemble predictions
final_predictions = np.mean([test_predictions[model_name] for model_name in models], axis=0)

# Create submission file
print("\nCreating submission file...")
sample_submission['CO2'] = final_predictions
sample_submission.to_csv('submission_ensemble.csv', index=False)

print("\nDone! Check 'submission_ensemble.csv' for predictions.")

Loading data...
Applying advanced feature engineering...
Applying advanced scaling...


[I 2025-01-31 20:13:16,810] A new study created in memory with name: no-name-bae09b50-e62a-465c-97fa-110a07438e5e


Optimizing hyperparameters...


[I 2025-01-31 20:50:52,072] Trial 0 finished with value: 9.753959433505662 and parameters: {'n_estimators': 921, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 0 with value: 9.753959433505662.
[I 2025-01-31 20:52:31,061] Trial 1 finished with value: 11.510232572529542 and parameters: {'n_estimators': 511, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 9.753959433505662.
[I 2025-01-31 20:52:59,679] Trial 2 finished with value: 7.44303317517569 and parameters: {'n_estimators': 111, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 2 with value: 7.44303317517569.
[I 2025-01-31 20:59:12,373] Trial 3 finished with value: 7.878989630512512 and parameters: {'n_estimators': 799, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 2 with value: 7.44303317517569.
[I 2025-01

KeyboardInterrupt: 