In [1]:
# importing the required library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")


In [None]:
df = pd.read_csv('../data/gurgaon_properties_missing_value_treated.csv')

In [3]:
df.head()

Unnamed: 0,property_type,society,sector,price,price_per_sqft,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score
0,flat,dlf the skycourt,sector 86,1.58,8191.0,3,3,1,16.0,Relatively New,1750.0,0,0,0,1,0,2,152
1,flat,ss the leaf,sector 85,1.2,7317.0,2,2,3,12.0,Relatively New,1484.0,0,0,0,0,0,2,157
2,house,ansals florence villa,sector 57,6.0,22222.0,4,5,2,2.0,Old Property,2700.0,0,1,0,1,0,2,20
3,flat,vatika the seven lamps,sector 82,0.86,6022.0,2,2,2,12.0,Relatively New,1294.0,1,0,0,0,0,2,135
4,house,independent,sector 7,0.45,5000.0,3,2,1,2.0,Old Property,900.0,0,0,0,0,1,0,12


In [4]:
train_df = df.drop(columns=['pooja room', 'study room', 'others','price_per_sqft','society'])

In [5]:
train_df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_score
0,flat,sector 86,1.58,3,3,1,16.0,Relatively New,1750.0,0,0,2,152
1,flat,sector 85,1.2,2,2,3,12.0,Relatively New,1484.0,0,0,2,157
2,house,sector 57,6.0,4,5,2,2.0,Old Property,2700.0,1,0,2,20
3,flat,sector 82,0.86,2,2,2,12.0,Relatively New,1294.0,0,0,2,135
4,house,sector 7,0.45,3,2,1,2.0,Old Property,900.0,0,0,0,12


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3554 entries, 0 to 3553
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3554 non-null   object 
 1   sector           3554 non-null   object 
 2   price            3554 non-null   float64
 3   bedRoom          3554 non-null   int64  
 4   bathroom         3554 non-null   int64  
 5   balcony          3554 non-null   object 
 6   floorNum         3554 non-null   float64
 7   agePossession    3554 non-null   object 
 8   built_up_area    3554 non-null   float64
 9   servant room     3554 non-null   int64  
 10  store room       3554 non-null   int64  
 11  furnishing_type  3554 non-null   int64  
 12  luxury_score     3554 non-null   int64  
dtypes: float64(3), int64(6), object(4)
memory usage: 361.1+ KB


In [7]:
train_df['furnishing_type'].value_counts()

furnishing_type
0    2342
2    1021
1     191
Name: count, dtype: int64

In [8]:
#Converting nominal column to categorical  
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
train_df['furnishing_type'] = train_df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [9]:
#Converting nominal column to categorical  
def categorize_floor(floor):
    if 0 <= floor <= 2:
        return "Low Floor"
    elif 3 <= floor <= 10:
        return "Mid Floor"
    elif 11 <= floor <= 51:
        return "High Floor"
    else:
        return None  # or "Undefined" or any other label for floors outside the defined bins

In [10]:
train_df['floor_category'] = train_df['floorNum'].apply(categorize_floor)

In [11]:
#Converting nominal column to categorical  
def categorize_luxury(score):
    if 0 <= score < 50:
        return "Low"
    elif 50 <= score < 150:
        return "Medium"
    elif 150 <= score <= 175:
        return "High"
    else:
        return None  # or "Undefined" or any other label for scores outside the defined bins

In [12]:
train_df['luxury_category'] = train_df['luxury_score'].apply(categorize_luxury)

In [13]:
# Removing the nominal column after creating new categorical column
train_df.drop(columns=['floorNum','luxury_score'],inplace=True)

In [14]:
train_df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,floor_category,luxury_category
0,flat,sector 86,1.58,3,3,1,Relatively New,1750.0,0,0,furnished,High Floor,High
1,flat,sector 85,1.2,2,2,3,Relatively New,1484.0,0,0,furnished,High Floor,High
2,house,sector 57,6.0,4,5,2,Old Property,2700.0,1,0,furnished,Low Floor,Low
3,flat,sector 82,0.86,2,2,2,Relatively New,1294.0,0,0,furnished,High Floor,Medium
4,house,sector 7,0.45,3,2,1,Old Property,900.0,0,0,unfurnished,Low Floor,Low


In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3554 entries, 0 to 3553
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3554 non-null   object 
 1   sector           3554 non-null   object 
 2   price            3554 non-null   float64
 3   bedRoom          3554 non-null   int64  
 4   bathroom         3554 non-null   int64  
 5   balcony          3554 non-null   object 
 6   agePossession    3554 non-null   object 
 7   built_up_area    3554 non-null   float64
 8   servant room     3554 non-null   int64  
 9   store room       3554 non-null   int64  
 10  furnishing_type  3554 non-null   object 
 11  floor_category   3554 non-null   object 
 12  luxury_category  3554 non-null   object 
dtypes: float64(2), int64(4), object(7)
memory usage: 361.1+ KB


In [72]:
# y = original price
y = train_df['price']

# log-transformed target
y_log = np.log1p(y)     
X = train_df.drop('price', axis=1)

# For StratifiedKFold: bin the target into quantiles
n_strata = 10
y_bins = pd.qcut(y_log, q=n_strata, labels=False, duplicates='drop')


### Common encoders, helper for stratified CV, scorer, model dict

In [None]:
numeric_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

cat_all = [
    'property_type',
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category'
]

In [None]:
ordinal_enc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1
)   

onehot_enc = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False
)   


In [None]:
def stratified_r2(pipeline, X, y_log, y_bins, n_splits=10):
    """Evaluates a regression pipeline using Stratified K-Fold on binned target values and returns mean & std R2."""
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) 
    scores = []

    for train_idx, val_idx in skf.split(X, y_bins): 
        X_train_cv = X.iloc[train_idx]
        X_val_cv = X.iloc[val_idx]
        y_train_cv = y_log.iloc[train_idx]
        y_val_cv = y_log.iloc[val_idx]

        pipeline.fit(X_train_cv, y_train_cv)         
        y_val_pred_log = pipeline.predict(X_val_cv) 
        scores.append(r2_score(y_val_cv, y_val_pred_log))  

    return np.mean(scores), np.std(scores) 


In [None]:
def scorer(model_name, model, preprocessor, X, y_log, y_bins, n_splits=10):
    """Evaluates a model using Stratified CV for R2 and a hold-out test for MAE, returning model name, mean R2, and MAE."""
    
    output = []
    output.append(model_name)  
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    mean_r2, std_r2 = stratified_r2(
        pipeline, X, y_log, y_bins, n_splits=n_splits
    )
    output.append(mean_r2)
    X_train, X_test, y_train_log, y_test_log = train_test_split(
        X, y_log, test_size=0.2, random_state=42
    )

    pipeline.fit(X_train, y_train_log) 

    y_pred_log = pipeline.predict(X_test)  
    y_pred = np.expm1(y_pred_log)         
    y_test = np.expm1(y_test_log)

    mae = mean_absolute_error(y_test, y_pred)  
    output.append(mae)

    return output 


In [73]:
# Dictionary of all regression models to evaluate in a loop
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(max_iter=500),
    'xgboost': XGBRegressor(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',
        random_state=42
    )
}


### ordinal encoder

In [74]:
# Build a ColumnTransformer that applies scaling to numeric columns
# and Ordinal Encoding (with unknown handling) to categorical columns
preprocessor_ordinal = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', ordinal_enc, cat_all)
    ],
    remainder='passthrough'
)

pipeline_ord_lr = Pipeline([
    ('preprocessor', preprocessor_ordinal),
    ('regressor', LinearRegression())
])

mean_r2_ord, std_r2_ord = stratified_r2(
    pipeline_ord_lr, X, y_log, y_bins, n_splits=10
)
print("Ordinal + LinearRegression R2 mean:", mean_r2_ord)
print("Ordinal + LinearRegression R2 std:", std_r2_ord)

X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

pipeline_ord_lr.fit(X_train, y_train_log)

y_pred_log = pipeline_ord_lr.predict(X_test)
y_pred = np.expm1(y_pred_log)             
y_test = np.expm1(y_test_log)

mae_ord = mean_absolute_error(y_test, y_pred)

print("Ordinal + LinearRegression MAE:", mae_ord)


Ordinal + LinearRegression R2 mean: 0.7370624334404401
Ordinal + LinearRegression R2 std: 0.01719206319745994
Ordinal + LinearRegression MAE: 0.8595077074077766


In [75]:
# List to store evaluation results for each model
model_output_ordinal = []

for model_name, model in model_dict.items():
    model_output_ordinal.append(
        scorer(model_name, model, preprocessor_ordinal, X, y_log, y_bins)
    )
model_df_ordinal = pd.DataFrame(
    model_output_ordinal, columns=['name', 'r2', 'mae']
)

model_df_ordinal.sort_values('mae')


Unnamed: 0,name,r2,mae
10,xgboost,0.898663,0.47593
5,random forest,0.878832,0.509157
7,gradient boosting,0.87108,0.557573
6,extra trees,0.864786,0.572434
4,decision tree,0.765576,0.665476
9,mlp,0.796675,0.773851
8,adaboost,0.763892,0.810807
1,svr,0.731899,0.829067
2,ridge,0.737066,0.859418
0,linear_reg,0.737062,0.859508


### One hot + Oridnal Encoding

In [76]:
# Preprocessor: scale numeric features, ordinal-encode all cats, and one-hot encode selected categorical columns
preprocessor_ohe = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat_ord', ordinal_enc, cat_all),
        ('cat_ohe', onehot_enc, ['sector', 'agePossession', 'furnishing_type'])
    ],
    remainder='passthrough'
)

# Create pipeline combining preprocessing + Linear Regression model
pipeline_ohe_lr = Pipeline([
    ('preprocessor', preprocessor_ohe),
    ('regressor', LinearRegression())
])

mean_r2_ohe, std_r2_ohe = stratified_r2(
    pipeline_ohe_lr, X, y_log, y_bins, n_splits=10
)
print("OHE + LinearRegression R2 mean:", mean_r2_ohe)
print("OHE + LinearRegression R2 std:", std_r2_ohe)

X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

pipeline_ohe_lr.fit(X_train, y_train_log)

y_pred_log = pipeline_ohe_lr.predict(X_test)
y_pred = np.expm1(y_pred_log)                    
y_test = np.expm1(y_test_log)

mae_ohe = mean_absolute_error(y_test, y_pred)

print("OHE + LinearRegression MAE:", mae_ohe)




OHE + LinearRegression R2 mean: 0.8557564982991991
OHE + LinearRegression R2 std: 0.012169297186356951
OHE + LinearRegression MAE: 0.6111307657438351




In [77]:
# Store evaluation results (R2 and MAE) for each model using the OHE preprocessor
model_output_ohe = []

for model_name, model in model_dict.items():
    model_output_ohe.append(
        scorer(model_name, model, preprocessor_ohe, X, y_log, y_bins)
    )

model_df_ohe = pd.DataFrame(
    model_output_ohe, columns=['name', 'r2', 'mae']
)

model_df_ohe.sort_values('mae')




Unnamed: 0,name,r2,mae
10,xgboost,0.902066,0.456134
5,random forest,0.890512,0.480212
6,extra trees,0.89086,0.48632
9,mlp,0.871123,0.536025
7,gradient boosting,0.873127,0.546432
4,decision tree,0.803839,0.609695
0,linear_reg,0.855756,0.611131
2,ridge,0.856119,0.615363
8,adaboost,0.75483,0.804427
1,svr,0.73398,0.828632


### OHE + PCA

In [78]:
 # Preprocessor: scale numeric features, ordinal-encode all categorical features,
# and one-hot encode selected high-cardinality columns
preprocessor_ohe_pca = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat_ord', ordinal_enc, cat_all),
        ('cat_ohe', OneHotEncoder(
            drop='first',
            handle_unknown='ignore',
            sparse_output=False
        ), ['sector', 'agePossession'])
    ],
    remainder='passthrough'
)

pipeline_ohe_pca_lr = Pipeline([
    ('preprocessor', preprocessor_ohe_pca),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

mean_r2_ohe_pca, std_r2_ohe_pca = stratified_r2(
    pipeline_ohe_pca_lr, X, y_log, y_bins, n_splits=10
)
print("OHE + PCA + LinearRegression R2 mean:", mean_r2_ohe_pca)
print("OHE + PCA + LinearRegression R2 std:", std_r2_ohe_pca)

X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)
pipeline_ohe_pca_lr.fit(X_train, y_train_log)

y_pred_log = pipeline_ohe_pca_lr.predict(X_test)
y_pred = np.expm1(y_pred_log)
mae_ohe_pca = mean_absolute_error(np.expm1(y_test_log), y_pred)

print("OHE + PCA + LinearRegression MAE:", mae_ohe_pca)




OHE + PCA + LinearRegression R2 mean: 0.05981341718512382
OHE + PCA + LinearRegression R2 std: 0.020021550611819584
OHE + PCA + LinearRegression MAE: 1.4609657964267686




In [79]:
# Store evaluation results (R2 and MAE) for each model using the OHE + PCA preprocessor
model_output_ohe_pca = []

for model_name, model in model_dict.items():
    model_output_ohe_pca.append(
        scorer(model_name, model, preprocessor_ohe_pca, X, y_log, y_bins)
    )

model_df_ohe_pca = pd.DataFrame(
    model_output_ohe_pca, columns=['name', 'r2', 'mae']
)

model_df_ohe_pca.sort_values('mae')




Unnamed: 0,name,r2,mae
10,xgboost,0.903059,0.46459
6,extra trees,0.893166,0.469912
5,random forest,0.889573,0.483431
7,gradient boosting,0.872765,0.546548
4,decision tree,0.801325,0.605138
0,linear_reg,0.8559,0.611277
9,mlp,0.863885,0.611662
2,ridge,0.856246,0.615842
8,adaboost,0.764238,0.817336
1,svr,0.733785,0.828672


### Target encoder

In [97]:
# !pip install category_encoders

In [82]:
import category_encoders as ce

In [83]:
# Target Encoding for the sector column and prepares the data for training the model.
target_enc = ce.TargetEncoder()

preprocessor_target = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat_ord', ordinal_enc, cat_all),
        ('cat_ohe_age', OneHotEncoder(
            drop='first',
            handle_unknown='ignore',
            sparse_output=False
        ), ['agePossession']),
        ('target_sector', target_enc, ['sector'])
    ],
    remainder='passthrough'
)


In [84]:
# Train and evaluate Linear Regression using Target Encoding for sector
pipeline_target_lr = Pipeline([
    ('preprocessor', preprocessor_target),
    ('regressor', LinearRegression())
])

mean_r2_target, std_r2_target = stratified_r2(
    pipeline_target_lr, X, y_log, y_bins, n_splits=10
)
print("TargetEnc + LinearRegression R2 mean:", mean_r2_target)
print("TargetEnc + LinearRegression R2 std:", std_r2_target)

X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)
pipeline_target_lr.fit(X_train, y_train_log)

y_pred_log = pipeline_target_lr.predict(X_test)
y_pred = np.expm1(y_pred_log)
mae_target = mean_absolute_error(np.expm1(y_test_log), y_pred)

print("TargetEnc + LinearRegression MAE:", mae_target)


TargetEnc + LinearRegression R2 mean: 0.8219299994966496
TargetEnc + LinearRegression R2 std: 0.01740008253028129
TargetEnc + LinearRegression MAE: 0.702070178478537


In [85]:
# Evaluate all models using the Target Encoding preprocessor and compare their R2 and MAE
model_output_target = []

for model_name, model in model_dict.items():
    model_output_target.append(
        scorer(model_name, model, preprocessor_target, X, y_log, y_bins)
    )

model_df_target = pd.DataFrame(
    model_output_target, columns=['name', 'r2', 'mae']
)

model_df_target.sort_values('mae')


Unnamed: 0,name,r2,mae
10,xgboost,0.904841,0.460231
6,extra trees,0.900631,0.475632
5,random forest,0.899759,0.476682
7,gradient boosting,0.887386,0.511506
9,mlp,0.841037,0.602779
4,decision tree,0.814347,0.674449
2,ridge,0.821959,0.701969
0,linear_reg,0.82193,0.70207
8,adaboost,0.81789,0.706186
1,svr,0.746744,0.810285


## Hypterparameter tuning

In [86]:
# Base XGBoost regressor 
xgb_base = XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    random_state=42
)

pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor_target),
    ('regressor', xgb_base)
])


In [87]:
param_distributions = {
    'regressor__n_estimators': [200, 400, 600, 800],
    'regressor__max_depth': [3, 4, 5, 6, 8],
    'regressor__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0],
    'regressor__min_child_weight': [1, 3, 5, 7],
    'regressor__gamma': [0, 0.1, 0.3, 0.5],
    'regressor__reg_alpha': [0, 0.01, 0.1, 1],
    'regressor__reg_lambda': [0.8, 1.0, 1.2]
}


In [88]:
# Simple K-Fold is fine for tuning; you already use stratified eval separately
cv_kfold = KFold(n_splits=5, shuffle=True, random_state=42)

xgb_search = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_distributions,
    n_iter=40,                 # you can increase to 60–100 if you want
    scoring='r2',
    cv=cv_kfold,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

xgb_search.fit(X, y_log)

print("Best R2 (CV):", xgb_search.best_score_)
print("Best params:")
for k, v in xgb_search.best_params_.items():
    print(f"{k}: {v}")


Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best R2 (CV): 0.9071885301574305
Best params:
regressor__subsample: 0.8
regressor__reg_lambda: 0.8
regressor__reg_alpha: 0.1
regressor__n_estimators: 800
regressor__min_child_weight: 1
regressor__max_depth: 8
regressor__learning_rate: 0.03
regressor__gamma: 0
regressor__colsample_bytree: 0.6


In [89]:
# Train-test split on log target (same as before)
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# Best tuned pipeline
best_xgb_pipeline = xgb_search.best_estimator_

best_xgb_pipeline.fit(X_train, y_train_log)

# Predictions in log space
y_pred_log = best_xgb_pipeline.predict(X_test)

# Convert back to original price scale
y_pred = np.expm1(y_pred_log)
y_test = np.expm1(y_test_log)

mae_xgb_tuned = mean_absolute_error(y_test, y_pred)
r2_xgb_tuned = r2_score(y_test_log, y_pred_log)   # R2 in log space

print("Tuned XGBoost R2 (log target):", r2_xgb_tuned)
print("Tuned XGBoost MAE (original price):", mae_xgb_tuned)


Tuned XGBoost R2 (log target): 0.9192068197771606
Tuned XGBoost MAE (original price): 0.4559661904409129


### Exporting the model

In [100]:
import joblib
best_xgb_pipeline = xgb_search.best_estimator_
best_xgb_pipeline.fit(X, y_log)
joblib.dump(best_xgb_pipeline, "model/price_prediction.pkl")

['model/price_prediction.pkl']

In [105]:
with open('model/df.pkl', 'wb') as file:
    joblib.dump(X, file)

In [102]:
loaded_pipeline = joblib.load("model/price_prediction.pkl")
new_data = pd.DataFrame([
    {
        "property_type": "house",
        "sector": "sector 102",
        "bedRoom": 4,
        "bathroom": 3,
        "balcony": "3+",
        "agePossession": "New Property",
        "built_up_area": 2750,
        "servant room": 0,
        "store room": 0,
        "furnishing_type": "unfurnished",
        "luxury_category": "Low",
        "floor_category": "Low Floor"
    }
])



In [103]:
y_pred_log = loaded_pipeline.predict(new_data)   # prediction in log space
y_pred = np.expm1(y_pred_log)                   # convert back to original price (crore)
print("Predicted price (crore):", y_pred[0])


Predicted price (crore): 2.9484086


## 1. Dataset Overview

The dataset train_df contains:

3554 rows

13 columns

No missing values

Feature Types

7 categorical features:
property_type, sector, balcony, agePossession, furnishing_type, floor_category, luxury_category

4 numeric features:
bedRoom, bathroom, built_up_area, servant room, store room

Target column:
price (in crore)

Target Transformation

The target (price) was right-skewed

Applying log-transform (log1p) made the distribution more normal

This improved training stability and R² performance

## 2. Hyperparameter Tuning Summary (XGBoost)

A RandomizedSearchCV was run with:

40 parameter combinations

5-fold Stratified CV

200 total fits

Best Parameters Found
subsample: 0.8
reg_lambda: 0.8
reg_alpha: 0.1
n_estimators: 800
min_child_weight: 1
max_depth: 8
learning_rate: 0.03
gamma: 0
colsample_bytree: 0.6

## 3. Final Evaluation on Hold-Out Test Set
Model: Tuned XGBoost Regressor
Metric	Result
R² (log target)	0.919206
MAE (crore)	0.455966
MAE (lakh)	≈ 45.6 lakh
Interpretation

The model explains 91.9% of the variance in log-price.

Average prediction error ≈ 45.6 lakh.

The model performs well, but falls short of the target MAE of 0.20 crore (20 lakh).

## 4. Why MAE Did Not Reach 0.20

Real estate data has high variance.

Location (sector) is extremely granular (131 unique sectors).

Many properties with similar features have very different prices.

Noise, outliers, and limited engineered features restrict model accuracy.

## 5. Key Suggestions to Reduce MAE Further
### Suggestion 1 — Add More Data

### Suggestion 2 - do feature engineering on new dataset


## 6. Final Conclusion

Best Model: Tuned XGBoost Regressor

Best R²: 0.9192

Best MAE: 0.456 crore (≈ 45.6 lakh)
