In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, StackingRegressor, VotingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, LassoLarsIC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
import joblib
import warnings
warnings.filterwarnings("ignore")
from data_utils import load_data, log_target, inv_log_target
from features import FillMissingTransformer, NumericSelector, CatagoricalSelector


# Read Data

In [3]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')


# 2. Target transformation

In [4]:

train["price_log"] = log_target(train["price"])
train.columns


Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus', 'price_log'],
      dtype='object')

#### 3. Feature selection: simple numeric + some categorical as example
#### you can expand this list with domain features

In [5]:

numeric_feats = train.select_dtypes(include=[np.number]).columns.tolist()
categorical_feats = train.select_dtypes(include=['object']).columns.tolist()
numeric_feats.remove('price_log')
numeric_feats.remove('price')


### Custom Pipeline

In [6]:

numeric_pipeline = Pipeline(steps=[
  ('num_selector', NumericSelector(numeric_feats)),
  ('fill_missing', FillMissingTransformer()),
  ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline(steps=[
  ('cat_selector', CatagoricalSelector(categorical_feats)),
  ('imputer', FillMissingTransformer()),
  ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
###  5. ColumnTransformer to Combine
preprocessor = ColumnTransformer(
    transformers=[
      ('numeric', numeric_pipeline, numeric_feats),
      ('categorical', categorical_pipeline, categorical_feats)
    ], remainder='drop')


#### Pipeline using Built in function
# 4. Preprocessors

In [7]:

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent", fill_value="None")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor1 = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_feats),
    ("cat", categorical_transformer, categorical_feats)
], remainder="drop")


# Model


In [8]:
ridge = Ridge(alpha=1.0, random_state=42)
linear = LinearRegression()
lasso = Lasso(alpha=0.01)
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
bayesianridge  = BayesianRidge()
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    random_state=42,
    n_jobs=1
)

# 6. Pipelines

In [9]:
pipe_ridge = Pipeline(steps=[
    ('preprocessor', preprocessor1),('ridge', ridge)
    ])
pipe_linear = Pipeline(steps=[
  ('preprocessor', preprocessor1),('linear', linear)
])
pipe_lasso = Pipeline(steps = [
  ('preprocessor', preprocessor1),('lasso', lasso)
])
pipe_elastic_net = Pipeline(steps = [
  ('preprocessor', preprocessor1),('elastic_net', elastic_net)
])
pipe_bayesianridge = Pipeline(steps = [
  ('preprocessor', preprocessor1),('bayesianridge', bayesianridge)
])
pipe_xgb = Pipeline(steps=[
  ('preprocessor', preprocessor1),('xgb', xgb)
])

In [10]:
train.head(4)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,price_log
0,7525000,6000,3,2,4,yes,no,no,no,yes,1,no,furnished,15.833742
1,6300000,7200,3,2,1,yes,no,yes,no,yes,3,no,semi-furnished,15.65606
2,3920000,3816,2,1,1,yes,no,yes,no,yes,2,no,furnished,15.181602
3,3430000,2610,3,1,2,yes,no,yes,no,no,0,yes,unfurnished,15.048071


# 7. Cross-validation

In [11]:
X = train.drop(columns=['price','price_log'])
y = train['price_log']

In [12]:
X.head(4)

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,6000,3,2,4,yes,no,no,no,yes,1,no,furnished
1,7200,3,2,1,yes,no,yes,no,yes,3,no,semi-furnished
2,3816,2,1,1,yes,no,yes,no,yes,2,no,furnished
3,2610,3,1,2,yes,no,yes,no,no,0,yes,unfurnished


In [33]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [34]:
def rmse_cv(pipe):
  scores = - cross_val_score(pipe, X, y, scoring="neg_root_mean_squared_error", cv = kf, n_jobs= -1)
  return scores

In [35]:
print("CV Ridge RMSE (log-space):", rmse_cv(pipe_ridge).mean())
print("CV XGB RMSE (log-space):", rmse_cv(pipe_xgb).mean())


CV Ridge RMSE (log-space): 0.19892195652020894
CV XGB RMSE (log-space): 0.22366724443027755


# 8. Fit best model on full training data

In [36]:
pipe_ridge.fit(X,y)
pipe_linear.fit(X,y)
pipe_lasso.fit(X,y)
pipe_elastic_net.fit(X,y)
pipe_bayesianridge.fit(X,y)
pipe_xgb.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,numeric_cols,"['area', 'bedrooms', ...]"

0,1,2
,numeric_fill,
,cat_fill,'None'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categorical_cols,"['mainroad', 'guestroom', ...]"

0,1,2
,numeric_fill,
,cat_fill,'None'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


# 9. Save model & preprocessor

In [37]:
joblib.dump(pipe_xgb,"../models/xgb_pipeline.joblib" )
print("Saved model to ../models/xgb_pipeline.joblib")

Saved model to ../models/xgb_pipeline.joblib


In [38]:
model = joblib.load("../models/xgb_pipeline.joblib")

In [39]:
def predict(df):
  preds_log = model.predict(df)
  preds = inv_log_target(preds_log)
  return preds

In [40]:
if __name__ == "__main__":
  test = pd.read_csv('../data/test.csv')
  test_x, test_y = test.drop(columns=['price']), test['price']
  preds = predict(test_x)
  test_ids = range(len(test_x))
  out = pd.DataFrame({'Id': test_ids, 'PredictedPrice': preds[:]})
  out.to_csv('../data/submission.csv', index=False)
  for i in range(10):
    print(f"Predicted price: {preds[i]:.2f}, Actual price: {test_y.iloc[i]:.2f}")
  

Predicted price: 4935677.00, Actual price: 4060000.00
Predicted price: 7126563.00, Actual price: 6650000.00
Predicted price: 3734623.00, Actual price: 3710000.00
Predicted price: 4640749.50, Actual price: 6440000.00
Predicted price: 4019621.00, Actual price: 2800000.00
Predicted price: 2982923.75, Actual price: 4900000.00
Predicted price: 6321063.00, Actual price: 5250000.00
Predicted price: 4632184.50, Actual price: 4543000.00
Predicted price: 2569151.50, Actual price: 2450000.00
Predicted price: 2964606.00, Actual price: 3353000.00


In [41]:
out = pd.read_csv('../data/submission.csv', index_col=False)
out.head(10)

Unnamed: 0,Id,PredictedPrice
0,0,4935677.0
1,1,7126563.0
2,2,3734623.0
3,3,4640749.5
4,4,4019621.0
5,5,2982923.8
6,6,6321063.0
7,7,4632184.5
8,8,2569151.5
9,9,2964606.0


In [42]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test = pd.read_csv('../data/test.csv')
test_x, y_true = test.drop(columns=['price']), test['price']
y_pred = inv_log_target(model.predict(test_x))
rmse = mean_squared_error(y_true, y_pred)


In [43]:
print(f"Test RMSE: {rmse:.2f}")

Test RMSE: 1974086664192.00


In [44]:
X.shape, y.shape

((436, 12), (436,))

### 9 — Hyperparameter tuning (example: RandomizedSearchCV for XGB)

In [45]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

#kf = KFold(n_splits=5, shuffle=True, random_state=42)



param_grid = {
    'model__n_estimators': [100, 200, 500, 800, 1000, 1500],
    'model__max_depth': [2, 4, 6, 8, 10, 12],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

rs = RandomizedSearchCV(
    estimator=pipe_xgb,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

rs.fit(X, y)

print("Best parameters found:", rs.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found: {'model__subsample': 1.0, 'model__n_estimators': 500, 'model__max_depth': 4, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.7}


In [54]:
# Required imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer



# Basic pipeline: scaler + model (we'll swap model in param grid)
pipe = Pipeline([
    ('preprocessor',preprocessor1),    # helps Ridge; harmless for trees
    ('model', Ridge())               # placeholder; will be replaced in GridSearch param_grid
])

# Parameter grid as a LIST of dicts: one dict per estimator type
param_grid = [
    # ---- Ridge options ----
    {
        'model': [Ridge()],
        'model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
        'model__solver': ['auto']   # optional
    },

    # ---- XGBoost options ----
    {
        # supply an XGBRegressor instance (any hard-coded defaults you want)
        'model': [XGBRegressor(tree_method='hist', random_state=42, n_jobs=-1, verbosity=0)],
        'model__n_estimators': [100, 300, 600],
        'model__max_depth': [3, 4, 6],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0]
    }
]

# Scorer: you can use built-in 'neg_root_mean_squared_error' or create one
# Note: GridSearch expects higher-is-better scoring, so RMSE must be negated or use scorer that returns negative
scoring = 'neg_root_mean_squared_error'  # available in recent sklearn versions

# Cross-validation split object
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV: does exhaustive search over the param grid
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv_inner,
    scoring=scoring,
    n_jobs=-1,
    verbose=2,
    refit=True  # refit best estimator on the whole training set
)

# Fit on training set only
grid.fit(X, y)

# Best results
print("Best params:", grid.best_params_)
print("Best CV score (neg RMSE):", grid.best_score_)

# Evaluate the best model on held-out test set
best_model = grid.best_estimator_
y_pred = best_model.predict(test_x)
rmse_test = mean_squared_error(y_true, y_pred)
print("Test RMSE:", rmse_test)


Fitting 5 folds for each of 248 candidates, totalling 1240 fits
Best params: {'model': Ridge(), 'model__alpha': 10.0, 'model__solver': 'auto'}
Best CV score (neg RMSE): -0.19874054163476979
Test RMSE: 30129839401535.246


### 10 — Deployment (Streamlit quick UI)

In [53]:
import streamlit as st
import pandas as pd
import joblib
from data_utils import inv_log_target

st.title("House Price Predictor")

model = joblib.load("../models/xgb_pipeline.joblib")

st.write("Enter features as CSV row or upload a CSV file with same columns as training.")

uploaded = st.file_uploader("Upload CSV", type=["csv"])
if uploaded:
    df = pd.read_csv(uploaded)
    preds = inv_log_target(model.predict(df))
    st.write("Predictions")
    st.write(pd.DataFrame({"Prediction": preds}))
else:
    st.info("Upload CSV to predict.")


2025-10-26 19:31:59.275 
  command:

    streamlit run d:\Machine Learning Project\House Price Prediction\venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
