In [None]:
# 📦 Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from lightgbm import LGBMRegressor
import optuna
import warnings
warnings.filterwarnings('ignore')

# 📂 Load the training set, the test set and the submission template
# (paths are relative to the current working directory).
train_data, test_data, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "SampleSubmission.csv")
)

# 🔧 Enhanced Feature Engineering
def create_features(df):
    """Return a copy of ``df`` extended with engineered sensor features.

    The new columns are derived from the raw columns ``Temperature``,
    ``Humidity``, ``MQ7_analog``, ``MQ9_analog``, ``MG811_analog`` and
    ``MQ135_analog``: pairwise sensor ratios, a temperature/humidity
    product, row-wise mean and standard deviation of the four sensor
    readings, squared temperature/humidity and ``log1p`` of ``MG811_analog``.

    Args:
        df: DataFrame containing the six raw columns listed above.

    Returns:
        A new DataFrame holding the original columns followed by the
        engineered ones. The frame passed in is not modified.

    Raises:
        KeyError: If one of the required raw columns is missing.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()
    sensor_cols = ['MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog']

    # Original features
    df['MQ7_MQ9_ratio'] = df['MQ7_analog'] / df['MQ9_analog']
    df['MQ7_MG811_ratio'] = df['MQ7_analog'] / df['MG811_analog']
    df['MQ9_MG811_ratio'] = df['MQ9_analog'] / df['MG811_analog']
    df['Temp_Humid_interaction'] = df['Temperature'] * df['Humidity']

    # Advanced features
    df['MQ135_Temperature_ratio'] = df['MQ135_analog'] / df['Temperature']
    df['Sensor_mean'] = df[sensor_cols].mean(axis=1)
    df['Sensor_std'] = df[sensor_cols].std(axis=1)

    # Polynomial features
    df['Temperature_sq'] = df['Temperature'] ** 2
    df['Humidity_sq'] = df['Humidity'] ** 2

    # Log transforms
    df['MG811_log'] = np.log1p(df['MG811_analog'])

    # A zero denominator (or log1p(-1)) produces +/-inf, which downstream
    # scaling rejects. Record those cells as missing (NaN) instead.
    may_be_infinite = [
        'MQ7_MQ9_ratio',
        'MQ7_MG811_ratio',
        'MQ9_MG811_ratio',
        'MQ135_Temperature_ratio',
        'MG811_log',
    ]
    df[may_be_infinite] = df[may_be_infinite].replace([np.inf, -np.inf], np.nan)

    return df

# Apply feature engineering to both data sets
train_data = create_features(train_data)
test_data = create_features(test_data)

# Model inputs: the six raw readings followed by the engineered columns.
# The order is significant because it fixes the column order of every matrix below.
features = ['Temperature', 'Humidity', 'MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog',
           'MQ7_MQ9_ratio', 'MQ7_MG811_ratio', 'MQ9_MG811_ratio', 'Temp_Humid_interaction',
           'MQ135_Temperature_ratio', 'Sensor_mean', 'Sensor_std',
           'Temperature_sq', 'Humidity_sq', 'MG811_log']

# Prepare data
X = train_data[features]
y = train_data['CO2']

# Split BEFORE scaling so the hold-out rows do not contribute to the
# scaler's mean/variance (fitting on all rows leaks validation statistics).
# The same test_size/random_state on the same number of rows selects the
# same rows as splitting the scaled matrix would.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it everywhere.
scaler = StandardScaler()
scaler.fit(X_train_raw)

X_train = pd.DataFrame(scaler.transform(X_train_raw), columns=features, index=X_train_raw.index)
X_val = pd.DataFrame(scaler.transform(X_val_raw), columns=features, index=X_val_raw.index)

# All training rows, transformed with the train-fitted scaler.
X_scaled = pd.DataFrame(scaler.transform(X), columns=features)

# Define objective function for Optuna
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a LightGBM regressor.

    Hyperparameters are sampled from the trial, and the resulting model is
    scored with shuffled 5-fold cross-validation on the training split only,
    so the hold-out split stays unseen while tuning.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The mean RMSE over the five folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
    }

    model = LGBMRegressor(**params, random_state=42)

    # K-fold cross validation on the training split. Using the full data set
    # here would tune on the rows later used to report the validation RMSE.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=kf)

    # The scorer returns negated RMSE; flip the sign so the study can minimize it.
    return -scores.mean()

# Optimize hyperparameters.
# Seed the sampler so the sequence of sampled trials is repeatable, in line
# with the fixed random_state=42 used by the split, the folds and the model.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Get best parameters
best_params = study.best_params

# Train final model with best parameters
lgb_model = LGBMRegressor(**best_params, random_state=42)
lgb_model.fit(X_train, y_train)

# Make predictions on the hold-out split
lgb_pred = lgb_model.predict(X_val)

# Evaluate model
print("Optimized LightGBM RMSE:", np.sqrt(mean_squared_error(y_val, lgb_pred)))

# Prepare test data with the scaler fitted earlier. Wrap the result in a
# DataFrame so the model receives the same named columns it was trained on.
X_test = test_data[features]
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=features, index=X_test.index)

# Generate test predictions
test_predictions = lgb_model.predict(X_test_scaled)

# Create submission file (predictions are assigned positionally, row by row)
sample_submission['CO2'] = test_predictions
sample_submission.to_csv('submission_optimized_lgb.csv', index=False)

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': lgb_model.feature_importances_
})

# Visualize feature importance, most important first
feature_importance = feature_importance.sort_values('importance', ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('LightGBM Feature Importance')
plt.tight_layout()
plt.show()

# Print top features
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))