In [None]:
import time

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# -------------------------
# Quick sanity: take any obvious ID column out of the feature set.
ID_CANDIDATES = ("Id", "id", "ID")

# Training rows are never reported back by ID, so the column is simply dropped.
train = train.drop(columns=[c for c in ID_CANDIDATES if c in train.columns])

# Test IDs are still needed when predictions are written out: the later
# `to_csv(..., index=True)` call emits the frame's index. Moving the ID into
# the index (instead of dropping it) removes it from the features while
# letting that call write the real IDs rather than a 0..n-1 RangeIndex.
# This is done independently of `train`, so a test-only ID column is handled too.
test_id_cols = [c for c in ID_CANDIDATES if c in test.columns]
if test_id_cols:
    test = test.set_index(test_id_cols[0]).drop(columns=test_id_cols[1:])

# -------------------------
# Separate the regression target from the features; fail fast if it is absent.
target_col = "SalePrice"
if target_col not in train.columns:
    raise RuntimeError(f"Expected column '{target_col}' in train dataframe.")

y = train[target_col].copy()
X = train.drop(columns=[target_col])

# Hold out 30% of the rows for a quick validation check (fixed seed so the
# split is reproducible across runs).
split_parts = train_test_split(X, y, train_size=0.7, random_state=123)
X_train, X_val, y_train, y_val = split_parts

# -------------------------
# 3) Partition feature names by dtype: numeric vs. object/category columns.
numeric_cols = list(X_train.select_dtypes(include=[np.number]).columns)
cat_cols = list(X_train.select_dtypes(include=["object", "category"]).columns)

# 4) Categoricals are routed by their number of distinct (non-null) training
#    values: above the threshold -> OrdinalEncoder (cheap), otherwise -> OneHotEncoder.
HIGH_CARDINALITY = 15  # threshold; tweak if you want
low_card_cats, high_card_cats = [], []
for col in cat_cols:
    if X_train[col].nunique() > HIGH_CARDINALITY:
        high_card_cats.append(col)
    else:
        low_card_cats.append(col)

# 5) Preprocessing: one sub-pipeline per feature group.

# Numeric features: fill gaps with the median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Low-cardinality categoricals: fill gaps with the mode, then one-hot encode
# into a dense array; categories unseen during fit are ignored at transform.
low_card_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# High-cardinality categoricals: fill gaps with the mode, then map each
# category to an integer code (unseen categories become -1). Integer codes
# impose an arbitrary order, which is not ideal for a distance-based model
# like kNN, but keeps the feature matrix small.
high_card_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Route each column group to its pipeline; columns not listed are dropped.
column_groups = [
    ("num", numeric_pipeline, numeric_cols),
    ("lowcat", low_card_pipeline, low_card_cats),
    ("highcat", high_card_pipeline, high_card_cats),
]
preprocessor = ColumnTransformer(
    transformers=column_groups,
    remainder="drop",
    sparse_threshold=0,  # always return a dense array
)

# -------------------------
# 6) Full model: preprocessing followed by a k-nearest-neighbours regressor.
knn_pipe = Pipeline(steps=[
    ("preproc", preprocessor),
    ("knn", KNeighborsRegressor(n_jobs=-1)),
])

# -------------------------
# 7) Hyperparameter grid (4 x 2 x 2 = 16 candidates).
#    `knn__p` selects the Minkowski power: 1 = Manhattan, 2 = Euclidean.
param_grid = dict(
    knn__n_neighbors=[3, 5, 7, 9],
    knn__weights=["uniform", "distance"],
    knn__p=[1, 2],
)

# GridSearchCV maximizes its score, so MSE is supplied in negated form.
gs = GridSearchCV(
    estimator=knn_pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# -------------------------
# 8) Fit (timing)
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
t0 = time.perf_counter()
gs.fit(X_train, y_train)
t1 = time.perf_counter()

print(f"\nGridSearch done in {t1 - t0:.1f}s")
print("Best params:", gs.best_params_)
# The search was scored with negated MSE, so flip the sign back for reporting.
print("Best CV MSE:", -gs.best_score_)

# -------------------------
# 9) Evaluate the best pipeline (refit by GridSearchCV on X_train) on the
#    held-out validation rows.
best_model = gs.best_estimator_
y_val_pred = best_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
print(f"Validation MSE: {val_mse:.4f}")
print(f"Validation RMSE: {np.sqrt(val_mse):.4f}")

# -------------------------
# Refit on the entire training set (X, y) using the winning configuration.
# clone() returns an unfitted copy of the best pipeline carrying all of its
# parameters, so the final model always matches what the grid search selected.
# Rebuilding the pipeline by hand and copying parameters one by one would
# silently ignore any hyperparameter later added to `param_grid`.
best_params = gs.best_params_
final_pipe = clone(gs.best_estimator_)
final_pipe.fit(X, y)

# Score the test set. If `test` happens to carry a SalePrice column, warn
# about it and keep it out of the features passed to the model.
test_has_target = "SalePrice" in test.columns
if test_has_target:
    print("Warning: test already contains SalePrice; predictions will overwrite.")
test_features = test.drop(columns=[target_col], errors="ignore")
test_preds = final_pipe.predict(test_features)

# Attach the predictions to a copy of `test` and write only that column.
# index=True means whatever row index `test` carries becomes the first CSV column.
out = test.assign(SalePrice=test_preds)
out[["SalePrice"]].to_csv("submission.csv", index=True)
print("Test predictions saved to submission.csv")

# -------------------------
# Quick diagnostics: first 8 validation rows, actual target next to prediction.
print("\nSample validation predictions (actual vs pred):")
val_comparison = pd.DataFrame({"actual": y_val.values, "pred": y_val_pred})
sample = val_comparison.head(8)
print(sample)


Fitting 5 folds for each of 16 candidates, totalling 80 fits

GridSearch done in 2.5s
Best params: {'knn__n_neighbors': 7, 'knn__p': 1, 'knn__weights': 'distance'}
Best CV MSE: 1423451833.1197927
Validation MSE: 1162734308.0177
Validation RMSE: 34098.8901
Test predictions saved to submission.csv

Sample validation predictions (actual vs pred):
   actual           pred
0  222500  202503.359513
1   87000  104033.413024
2  130000  143791.452652
3  274300  236729.687848
4  140000  134383.885507
5  259000  221233.209600
6  309000  286886.087651
7  127500  135037.012670


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Relies on names from the grid-search cell: X_val, y_train, y_val and
# best_model.

# 1) Baseline: predict the training-set mean for every validation row.
mean_pred = np.mean(y_train)
y_val_mean_pred = np.full(len(y_val), mean_pred, dtype=float)
mse_baseline = mean_squared_error(y_val, y_val_mean_pred)
rmse_baseline = np.sqrt(mse_baseline)

# Score the tuned model once and reuse the value everywhere below, rather
# than running predict() on the validation set for each print.
model_rmse = np.sqrt(mean_squared_error(y_val, best_model.predict(X_val)))

print("Baseline (mean) RMSE:", rmse_baseline)
print("Model RMSE:", model_rmse)

# percent improvement over baseline
impr = 100.0 * (1 - model_rmse / rmse_baseline)
print(f"Percent improvement over mean baseline: {impr:.2f}%")

# 2) RMSE expressed relative to the typical target value, for context.
print("RMSE / mean(y):", model_rmse / mean_pred)
print("RMSE / median(y):", model_rmse / np.median(y_train))

Baseline (mean) RMSE: 75741.57901978734
Your model RMSE: 34098.890128825085
Percent improvement over mean baseline: 54.98%
RMSE / mean(y): 0.18939733618543061
RMSE / median(y): 0.21311806330515679
