In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression, Ridge
from xgboost import XGBRegressor
# from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor

df = pd.read_csv('train.csv')
df.drop(columns=['id', 'Row#'], inplace=True, errors='ignore')


In [39]:
print(df.shape)
print(df.columns)
df.head()

(8999, 17)
Index(['clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia',
       'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange',
       'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange',
       'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds',
       'output'],
      dtype='object')


Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,output
0,12.5,0.25,0.25,0.25,0.5,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.421449,0.403129,31.394569,4952.01304
1,25.0,0.5,0.25,0.5,0.75,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.586603,0.49002,40.282376,7532.82953
2,12.5,0.25,0.25,0.5,0.75,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.483671,0.411591,34.781055,5715.0084
3,25.0,0.5,0.25,0.5,0.75,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.429001,0.398864,31.119881,4575.76991
4,12.5,0.25,0.25,0.5,0.75,77.4,46.8,64.7,55.8,27.0,45.8,1.0,0.1,0.546136,0.475965,39.096884,6801.32393


In [2]:
q1, q3 = df["output"].quantile([0.25, 0.75])
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
df = df[(df["output"] >= lower) & (df["output"] <= upper)]
X = df.drop(columns=['output'])
y = df['output']

# Scale with RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [None]:
results = []
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(verbosity=0),
    "CatBoost": CatBoostRegressor(verbose=0),
    "LightGBM": LGBMRegressor(verbose=-1),
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test if not isinstance(model, LGBMRegressor) else pd.DataFrame(X_test, columns=X.columns))


    results.append({
        'Model': name,
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    })

# ================================
# 📊 4. Display Results
# ================================
results_df = pd.DataFrame(results).sort_values(by='R2', ascending=False)
print(results_df)

              Model         MAE            MSE        R2
1  GradientBoosting  240.105867  117451.539991  0.931529
0      RandomForest  238.681904  117902.617410  0.931266
4          LightGBM  242.020048  120320.012675  0.929857
3          CatBoost  241.091811  120512.657323  0.929744
2           XGBoost  253.456152  128113.822228  0.925313


In [None]:

from sklearn.linear_model import ElasticNet, Lasso

estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge()),
    # ('lasso',Lasso(alpha=0.1)),
    # ('elastic', ElasticNet(alpha=0.1, l1_ratio=0.5)),
    ('bayes', BayesianRidge()),
    ('rf', RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)),
    ('gb', GradientBoostingRegressor()),
    ('xgb', XGBRegressor(verbosity=0)),
    # ('cat', CatBoostRegressor(verbose=0)),
    ('lgb', LGBMRegressor(verbose=-1, n_estimators=200, learning_rate=0.05)),
]
stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(),
    passthrough=False,
    cv=5,
    n_jobs=-1
)

# Fit on training data
stack_model.fit(X_train, y_train)

# Predict
y_pred_stack = stack_model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred_stack)
mse = mean_squared_error(y_test, y_pred_stack)
r2 = r2_score(y_test, y_pred_stack)*100

print("\n🔗 Stacking Ensemble Performance:")
print(f"    MAE : {mae:.4f}")
print(f"    MSE : {mse:.4f}")
print(f"    R²  : {r2:.4f}")

# results = []
# for i in [50, 100, 150, 200, 250, 300]:
#     print(f"Training with estimators={i}")
#     results.append({
#         'estimators': i,
#         'R2': train(i)
    # })


🔗 Stacking Ensemble Performance:
    MAE : 236.5261
    MSE : 115640.0073
    R²  : 93.2585


In [7]:
# Load test data
test_df = pd.read_csv('test.csv')

# Keep the ID for submission
test_ids = test_df['id']
# test_df['fruitset_x_seeds'] = df['fruitset'] * df['seeds']
# Drop unnecessary columns
test_df.drop(columns=['id', 'Row#'], inplace=True, errors='ignore')

# Apply the same scaling as training data
X_test_final = scaler.transform(test_df)

# Predict using trained model
y_pred_test = stack_model.predict(X_test_final)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test_ids,
    'yield': y_pred_test
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)

print("✅ Submission file 'submission.csv' created successfully.")


✅ Submission file 'submission.csv' created successfully.


