In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression, Ridge
from sklearn.inspection import permutation_importance

# 1. Load the training table and derive the feature matrix / target vector
df = pd.read_csv('train.csv')
# Identifier-style columns are discarded; errors='ignore' tolerates either being absent.
df = df.drop(columns=['id', 'Row#'], errors='ignore')

# Outlier removal: keep only rows whose target lies inside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; both bounds are inclusive.
q1 = df["output"].quantile(0.25)
q3 = df["output"].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
df = df[df["output"].between(lower, upper)]

y = df['output']
X = df.drop(columns=['output'])

# Scaling: the scaler is fitted on every retained row (i.e. before the split
# below) and is kept so the same transform can be applied to other data later.
scaler = RobustScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Single 80/20 hold-out split (seeded); the held-out part is used for
# permutation importance.
X_train_full, X_val_full, y_train_full, y_val_full = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Define stacking model
# Base learners whose out-of-fold predictions feed the Ridge meta-learner.
# Every stochastic learner is given the same fixed seed as the random forest
# and the train/validation split, so that repeated runs yield the same fitted
# ensemble (and therefore the same feature ranking and CV scores). Previously
# the gradient-boosting learners were left unseeded.
estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge()),
    ('bayes', BayesianRidge()),
    ('rf', RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(verbosity=0, random_state=42)),
    ('lgb', LGBMRegressor(verbose=-1, n_estimators=100, learning_rate=0.05, random_state=42)),
]

# passthrough=False: the meta-learner sees only the base predictions,
# not the original features. cv=5 sets the internal folds used to build
# those predictions.
stack_model_full = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(),
    passthrough=False,
    cv=5,
    n_jobs=-1
)

# Train for importance computation (training part of the hold-out split only)
stack_model_full.fit(X_train_full, y_train_full)

# 3. Rank features by permutation importance
# Importance is measured on the held-out split with R² as the score,
# using 10 shuffles per feature and a fixed seed.
perm_result = permutation_importance(
    stack_model_full,
    X_val_full,
    y_val_full,
    scoring='r2',
    n_repeats=10,
    random_state=42,
)

# Mean importance per feature, largest first; the 13 leading names are kept.
perm_importance_stack = pd.Series(
    perm_result.importances_mean, index=X.columns
).sort_values(ascending=False)
top_13_features = perm_importance_stack.index[:13].tolist()

print("\n📊 Top 13 Important Features:", top_13_features, sep="\n")

# 4. 5-fold cross-validation restricted to the 13 selected features
X_13 = X_scaled.loc[:, top_13_features]

cv_metrics = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error']
cv_results = cross_validate(
    stack_model_full,
    X_13,
    y,
    scoring=cv_metrics,
    cv=5,
    n_jobs=-1,
    return_train_score=False,
)

# The error scorers are negated ("neg_*"), so the sign is flipped for display.
print("\n🔍 Cross-Validation (Top 13 Features):")
print(f"    R² (mean):  {cv_results['test_r2'].mean():.4f}")
print(f"    MAE (mean): {-cv_results['test_neg_mean_absolute_error'].mean():.4f}")
print(f"    MSE (mean): {-cv_results['test_neg_mean_squared_error'].mean():.4f}")

# 5. Final model: a fresh stacking ensemble with the same configuration,
# fitted on every training row but only the 13 selected features.
final_stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(),
    cv=5,
    passthrough=False,
    n_jobs=-1,
).fit(X_13, y)

# 6. Load and preprocess test data
test_df = pd.read_csv('test.csv')
test_ids = test_df['id']  # kept aside for the submission file
test_df.drop(columns=['id', 'Row#'], inplace=True, errors='ignore')

# Apply same scaling
# The scaler was fitted on the training feature frame X, so the test frame is
# aligned to exactly those columns, in that order, before transforming. This
# tolerates extra or reordered columns in test.csv and reports missing ones
# explicitly instead of failing inside the scaler on a feature-name mismatch.
missing_features = [col for col in X.columns if col not in test_df.columns]
if missing_features:
    raise KeyError(
        f"test.csv is missing feature columns present in training data: {missing_features}"
    )
X_test_scaled = pd.DataFrame(
    scaler.transform(test_df[list(X.columns)]),
    columns=X.columns,
    index=test_df.index,  # keep rows aligned with test_ids
)

# Select top 13 features
X_test_final = X_test_scaled[top_13_features]

# 7. Score the test rows and write the submission file
y_pred_test = final_stack_model.predict(X_test_final)

# Two-column table: the test identifiers next to the predictions ('yield').
submission_df = pd.DataFrame({'id': test_ids})
submission_df['yield'] = y_pred_test
submission_df.to_csv('submission.csv', index=False)

print("\n✅ Submission file 'submission.csv' created successfully using top 13 features.")



📊 Top 13 Important Features:
['fruitset', 'seeds', 'fruitmass', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange', 'AverageOfUpperTRange', 'MaxOfLowerTRange', 'bumbles', 'RainingDays', 'honeybee', 'AverageRainingDays']

🔍 Cross-Validation (Top 13 Features):
    R² (mean):  0.5211
    MAE (mean): 252.8474
    MSE (mean): 835869.3068

✅ Submission file 'submission.csv' created successfully using top 13 features.
