In [10]:
import joblib
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [5]:
# Ignore InconsistentVersionWarning
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

In [6]:
# Restore the two previously tuned estimators from disk.
# joblib.load unpickles its input, so these files must come from a trusted source.
gb_model, xgb_model = map(
    joblib.load,
    (
        '../models/best_GradientBoosting.pkl',
        '../models/best_XGBoost.pkl',
    ),
)

In [7]:
# Read the pre-processed training features and target back from disk.
X_train_selected, y_train = [
    joblib.load(pkl_path)
    for pkl_path in (
        '../data/processed/X_train_selected.pkl',
        '../data/processed/y_train.pkl',
    )
]

# Undo the log transform on the target.
# NOTE(review): assumes y_train was stored as log1p(target) - confirm upstream.
y_train_original = np.expm1(y_train)

In [8]:
# 5-fold cross-validation of the GradientBoosting model.
# The scorer reports *negated* RMSE (higher is better), so the sign is flipped
# back before averaging. y_train is passed as loaded, so the RMSE is in the
# units of the stored target, not the expm1-restored one.
gb_cv_scores = cross_val_score(
    gb_model,
    X_train_selected,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
gb_rmse_scores = np.negative(gb_cv_scores)
gb_mean_rmse = np.mean(gb_rmse_scores)
print(f"GradientBoosting Mean RMSE: {gb_mean_rmse:.2f}")

GradientBoosting Mean RMSE: 0.12


In [9]:
# 5-fold cross-validation of the XGBoost model.
# The scorer reports *negated* RMSE (higher is better), so the sign is flipped
# back before averaging. y_train is passed as loaded, so the RMSE is in the
# units of the stored target, not the expm1-restored one.
xgb_cv_scores = cross_val_score(
    xgb_model,
    X_train_selected,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
xgb_rmse_scores = np.negative(xgb_cv_scores)
xgb_mean_rmse = np.mean(xgb_rmse_scores)
print(f"XGBoost Mean RMSE: {xgb_mean_rmse:.2f}")

XGBoost Mean RMSE: 0.13


In [11]:
# Pick the model with the lower mean cross-validated RMSE rather than
# hard-coding the winner, so the choice cannot drift out of sync with the
# scores computed in the evaluation cells when they are re-run.
# Ties go to GradientBoosting.
best_model = gb_model if gb_mean_rmse <= xgb_mean_rmse else xgb_model

In [12]:
# Held-out feature matrix to score.
X_test_selected = joblib.load('../data/processed/X_test_selected.pkl')

# Refit the chosen model on the full training set before predicting.
best_model.fit(X_train_selected, y_train)

# Predictions come out in log scale; expm1 maps them back to the original scale.
y_pred_log = best_model.predict(X_test_selected)
y_pred = np.expm1(y_pred_log)

# Pair each prediction with the Id column of the raw test file.
# NOTE(review): relies on X_test_selected keeping the row order of test.csv - confirm.
test_data = pd.read_csv('../data/test.csv')
submission_df = test_data[['Id']].assign(SalePrice=y_pred)

# Persist the submission without the DataFrame index.
submission_df.to_csv('../data/processed/sample_submission.csv', index=False)
print("Submission file saved as 'sample_submission.csv'")

Submission file saved as 'sample_submission.csv'
