In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

def create_super_advanced_features(df):
    """Return a copy of ``df`` extended with engineered sensor features.

    The input must provide the columns ``MQ7_analog``, ``MQ9_analog``,
    ``MG811_analog``, ``MQ135_analog``, ``Temperature`` and ``Humidity``;
    a missing column raises ``KeyError``. The caller's frame is never
    modified: all work happens on a copy.

    Args:
        df: pandas DataFrame holding the raw readings, one row per sample.

    Returns:
        A new DataFrame containing every original column followed by the
        derived columns, appended in the order they are computed below.
    """
    # Function-scope imports: the module header does not import these names,
    # so binding them here keeps the function usable on its own.
    from itertools import combinations

    from scipy import stats

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    sensor_cols = ['MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog']

    # 1. Row-wise statistics across the four sensor readings
    sensors = df[sensor_cols]
    df['Sensor_mean'] = sensors.mean(axis=1)
    df['Sensor_std'] = sensors.std(axis=1)
    df['Sensor_median'] = sensors.median(axis=1)
    df['Sensor_max'] = sensors.max(axis=1)
    df['Sensor_min'] = sensors.min(axis=1)
    df['Sensor_range'] = df['Sensor_max'] - df['Sensor_min']
    df['Sensor_skew'] = sensors.skew(axis=1)
    df['Sensor_kurt'] = sensors.kurtosis(axis=1)

    # 2. Pairwise features for every unordered sensor pair.
    # combinations() yields pairs in the same order as a nested
    # "enumerate + slice" loop, so column order is stable.
    for col1, col2 in combinations(sensor_cols, 2):
        # The 1e-6 offset guards against division by an exact zero reading.
        df[f'{col1}_{col2}_ratio'] = df[col1] / (df[col2] + 1e-6)
        df[f'{col1}_{col2}_diff'] = df[col1] - df[col2]
        df[f'{col1}_{col2}_product'] = df[col1] * df[col2]
        df[f'{col1}_{col2}_sum'] = df[col1] + df[col2]
        df[f'{col1}_{col2}_mean'] = (df[col1] + df[col2]) / 2

    # 3. Temperature / humidity interactions
    df['Temp_Humid_interaction'] = df['Temperature'] * df['Humidity']
    df['Temp_Humid_ratio'] = df['Temperature'] / (df['Humidity'] + 1e-6)
    df['Temp_Humid_sum'] = df['Temperature'] + df['Humidity']
    df['Temp_Humid_diff'] = df['Temperature'] - df['Humidity']

    # 4. Power features. The 0.5 exponent produces NaN for negative inputs.
    degrees = [2, 3, 0.5]
    for deg in degrees:
        df[f'Temperature_power_{deg}'] = df['Temperature'] ** deg
        df[f'Humidity_power_{deg}'] = df['Humidity'] ** deg
        df[f'Sensor_mean_power_{deg}'] = df['Sensor_mean'] ** deg

    # 5. Per-sensor non-linear transforms (sqrt is NaN for negative readings)
    for col in sensor_cols:
        df[f'{col}_log'] = np.log1p(df[col])
        df[f'{col}_sqrt'] = np.sqrt(df[col])
        df[f'{col}_squared'] = df[col] ** 2
        df[f'{col}_cubed'] = df[col] ** 3

    # 6. Geometric / harmonic means of the readings shifted by +1.
    # Passing a plain ndarray makes the scipy return type an ndarray, which
    # is then assigned positionally to the new column.
    shifted = (df[sensor_cols] + 1).to_numpy()
    df['Sensor_geometric_mean'] = stats.gmean(shifted, axis=1)
    df['Sensor_harmonic_mean'] = stats.hmean(shifted, axis=1)

    # 7. Sensor x environment interactions
    df['MQ7_MQ135_temp_ratio'] = df['MQ7_analog'] * df['Temperature'] / (df['MQ135_analog'] + 1e-6)
    df['MQ9_MG811_humid_ratio'] = df['MQ9_analog'] * df['Humidity'] / (df['MG811_analog'] + 1e-6)

    # 8. Rolling features over consecutive rows of Sensor_mean.
    # NOTE(review): these are only meaningful if row order is temporal --
    # confirm against the data source.
    # NOTE: rolling std is NaN on the first row even with min_periods=1,
    # because a sample standard deviation needs two observations.
    window_sizes = [2, 3, 4]
    for window in window_sizes:
        rolling = df['Sensor_mean'].rolling(window, min_periods=1)
        df[f'rolling_mean_{window}'] = rolling.mean()
        df[f'rolling_std_{window}'] = rolling.std()

    return df

# Load and prepare data
print("Loading data...")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
sample_submission = pd.read_csv("SampleSubmission.csv")

# Drop columns that are not used as model inputs; errors='ignore' tolerates
# files in which one of them is absent.
columns_to_drop = ['ID', 'device_name']
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

# Apply super advanced feature engineering
print("Applying advanced feature engineering...")
train_data = create_super_advanced_features(train_data)
test_data = create_super_advanced_features(test_data)

# Every training column except the target is a model input
features = [col for col in train_data.columns if col != 'CO2']

# Plain ndarrays for both train and test, selected by the same feature list
# so the column order matches.
X = train_data[features].values
y = train_data['CO2'].values
X_test = test_data[features].values

# Advanced scaling
# NOTE(review): the scaler is fitted on the full training set before the
# cross-validation split, so validation rows influence the scaling statistics.
print("Applying advanced scaling...")
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
# The test matrix is loop-invariant: scale it once here rather than once per
# fold, and pass an ndarray to match how the scaler was fitted.
X_test_scaled = scaler.transform(X_test)

# Initialize Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Initialize K-fold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows
oof_rf = np.zeros(len(X_scaled))
test_rf = np.zeros(len(test_data))

# Cross-validation loop
print("Starting cross-validation...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\nFold {fold + 1}/{n_splits}")

    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Refit the same estimator object on this fold's training rows
    rf_model.fit(X_train, y_train)
    oof_rf[val_idx] = rf_model.predict(X_val)
    # Each fold contributes 1/n_splits of the final test prediction
    test_rf += rf_model.predict(X_test_scaled) / n_splits

    # Print fold scores
    fold_rmse = np.sqrt(mean_squared_error(y_val, oof_rf[val_idx]))
    print(f"Random Forest RMSE: {fold_rmse:.4f}")

# Generate final predictions
final_predictions = test_rf

# Create submission file
print("\nCreating submission file...")
sample_submission['CO2'] = final_predictions
sample_submission.to_csv('submission_random_forest.csv', index=False)

print("\nDone! Check 'submission_random_forest.csv' for predictions.")

Loading data...
Applying advanced feature engineering...
Applying advanced scaling...
Starting cross-validation...

Fold 1/5
Random Forest RMSE: 5.1194

Fold 2/5
Random Forest RMSE: 5.6194

Fold 3/5
Random Forest RMSE: 5.2214

Fold 4/5
Random Forest RMSE: 5.2652

Fold 5/5
Random Forest RMSE: 6.4633

Creating submission file...

Done! Check 'submission_random_forest.csv' for predictions.
