# 05 - Evaluation and Submission
This notebook loads the test data, applies preprocessing, predicts using the trained model, and generates a Kaggle submission file.

In [25]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [26]:
# Load test data
test_df = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/test.csv")
test_ids = test_df['Id']
test_df.drop(columns=['Id'], inplace=True)
test_df.shape

(1459, 79)

In [32]:
# Reload test.csv if needed (to ensure clean state)
test_df = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/test.csv")

# Always extract and drop 'Id'
test_ids = test_df['Id']
test_df = test_df.drop(columns=['Id'])

# Sanity check
print(f"Id in test_df: {'Id' in test_df.columns}")
print(f"Shape of test_df: {test_df.shape}")


Id in test_df: False
Shape of test_df: (1459, 79)


In [33]:
# Load preprocessor and model
preprocessor = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl")
model = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/xgb_model.pkl")

# Transform and predict
X_test_processed = preprocessor.transform(test_df)
print(f"Transformed test shape: {X_test_processed.shape}")

predictions = model.predict(X_test_processed)


ValueError: columns are missing: {'Id'}

In [27]:
# Use same preprocessing pipeline from training
full_df = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/train_cleaned.csv")
X = full_df.drop(columns=[col for col in ['SalePrice', 'Id'] if col in full_df.columns])

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

In [28]:
# Drop Id from test_df before transforming
if 'Id' in test_df.columns:
    test_ids = test_df['Id']
    test_df = test_df.drop(columns=['Id'])


In [29]:
preprocessor = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl")
X_test_processed = preprocessor.transform(test_df)



ValueError: columns are missing: {'Id'}

In [30]:
# Load model and predict
model = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/xgb_model.pkl")
predictions = model.predict(X_test_processed)

ValueError: Feature shape mismatch, expected: 273, got 274

In [31]:
# Create submission DataFrame
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})
submission.to_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/submission.csv", index=False)
submission.head()

NameError: name 'predictions' is not defined

In [34]:
import pandas as pd

# Reload clean version
test_df = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/test.csv")

# Drop Id
test_ids = test_df['Id']
test_df = test_df.drop(columns=['Id'])



In [35]:
import joblib

# Load both
preprocessor = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl")
model = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/xgb_model.pkl")


In [36]:
X_test_processed = preprocessor.transform(test_df)
predictions = model.predict(X_test_processed)


ValueError: columns are missing: {'Id'}

In [37]:
import pandas as pd
import joblib

# Load clean test.csv
test_df = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/test.csv")

# Extract and drop Id
test_ids = test_df['Id']
test_df = test_df.drop(columns=['Id'])

# Load preprocessor and model
preprocessor = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl")
model = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/xgb_model.pkl")

# Transform and predict
X_test_processed = preprocessor.transform(test_df)
predictions = model.predict(X_test_processed)

# Save submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})
submission.to_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/submission.csv", index=False)
submission.head()


ValueError: Feature shape mismatch, expected: 273, got 272

In [38]:
import pandas as pd
import joblib

# Load and clean test data
test_df = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/test.csv")
test_ids = test_df['Id']
test_df = test_df.drop(columns=['Id'])

# Load saved preprocessor + model
preprocessor = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl")
model = joblib.load("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/xgb_model.pkl")

# Use transform only (not fit_transform)
X_test_processed = preprocessor.transform(test_df)
predictions = model.predict(X_test_processed)

# Save submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})
submission.to_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/submission.csv", index=False)
submission.head()


ValueError: Feature shape mismatch, expected: 273, got 272

In [39]:
# Sanity check: load X_train and transform it with preprocessor
X_train = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/train.csv")
X_train = X_train.drop(columns=["Id", "SalePrice"])
X_train_transformed = preprocessor.transform(X_train)

print("X_train_transformed shape:", X_train_transformed.shape)


X_train_transformed shape: (1460, 272)


In [40]:
from xgboost import XGBRegressor
import joblib

# Train on full training data
model = XGBRegressor(random_state=42)
X_train_final = preprocessor.transform(X_train)
y_train = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/train.csv")['SalePrice']

model.fit(X_train_final, y_train)

# Save model and preprocessor again
joblib.dump(model, "/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/xgb_model.pkl")
joblib.dump(preprocessor, "/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl")


['/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/model/preprocessor.pkl']

In [41]:
# Predict
X_test_fixed = preprocessor.transform(X_test)
predictions = model.predict(X_test_fixed)

# Save submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})
submission.to_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/submission.csv", index=False)
submission.head()


NameError: name 'X_test' is not defined

In [42]:
# Drop Id if present
X_test = test_df.drop(columns=["Id"])

# Transform
X_test_fixed = preprocessor.transform(X_test)

# Predict
predictions = model.predict(X_test_fixed)

# Save submission
submission = pd.DataFrame({
    'Id': test_df['Id'],           # use the original Ids
    'SalePrice': predictions
})
submission.to_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/submission.csv", index=False)
submission.head()


KeyError: "['Id'] not found in axis"

In [43]:
# Check available columns
print(test_df.columns)


Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [44]:
# Use test_df directly (no need to drop "Id")
X_test_fixed = preprocessor.transform(test_df)

# Predict
predictions = model.predict(X_test_fixed)

# Submission
submission = pd.DataFrame({
    'Id': test_ids,  # assuming you stored test_ids earlier as test_ids = pd.read_csv(...).Id
    'SalePrice': predictions
})
submission.to_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/submission.csv", index=False)
submission.head()


Unnamed: 0,Id,SalePrice
0,1461,123015.953125
1,1462,153903.359375
2,1463,181372.609375
3,1464,193624.21875
4,1465,179481.765625


In [45]:
test_ids = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/test.csv")["Id"]
