In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import uniform, randint
import xgboost as xgb

# Load data
data = pd.read_csv('india_housing_prices.csv')

# Drop useless columns
if 'Unnamed: 22' in data.columns:
    data = data.drop(['Unnamed: 22'], axis=1)
if 'ID' in data.columns:
    data = data.drop(['ID'], axis=1)

# Drop rows with missing values
data = data.dropna()

# Selecting a broader set of relevant features
selected_features = [
    'City', 'Property_Type', 'BHK', 'Furnished_Status', 'Size_in_SqFt',
    'Year_Built', 'Floor_No', 'Age_of_Property',
    'Nearby_Schools', 'Nearby_Hospitals', 'Public_Transport_Accessibility',
    'Parking_Space', 'Security', 'Amenities', 'Facing', 'Owner_Type'
]

X = data[selected_features]
Y = np.log1p(data['Price_in_Lakhs'])  # log-transform target for stability

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# XGBoost regressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_reg)
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_dist = {
    'regressor__n_estimators': randint(100, 500),
    'regressor__max_depth': randint(3, 10),
    'regressor__learning_rate': uniform(0.01, 0.3),
    'regressor__subsample': uniform(0.7, 0.3),
    'regressor__colsample_bytree': uniform(0.7, 0.3)
}

search = RandomizedSearchCV(
    model, param_distributions=param_dist,
    n_iter=20, scoring='neg_root_mean_squared_error',
    cv=3, verbose=1, random_state=42, n_jobs=-1
)

search.fit(X_train, y_train)
best_model = search.best_estimator_

# Predict and evaluate
y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)  # inverse of log1p
y_true = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"\nBest Parameters: {search.best_params_}")
print(f"Test RMSE: {rmse:.2f} Lakhs")
print(f"Test R² Score: {r2:.2f}")

# Function to predict from user input
def predict_price():
    print("\nEnter the following details to predict price:")
    input_data = {
        'City': str(input("City: ")),
        'Property_Type': str(input("Property Type: ")),
        'BHK': int(input("BHK: ")),
        'Furnished_Status': str(input("Furnished Status: ")),
        'Size_in_SqFt': float(input("Size in SqFt: ")),
        'Year_Built': int(input("Year Built: ")),
        'Floor_No': int(input("Floor No: ")),
        'Age_of_Property': int(input("Age of Property (years): ")),
        'Nearby_Schools': int(input("Nearby Schools (count): ")),
        'Nearby_Hospitals': int(input("Nearby Hospitals (count): ")),
        'Public_Transport_Accessibility': int(input("Public Transport Accessibility (score 1-10): ")),
        'Parking_Space': int(input("Parking Space (0/1): ")),
        'Security': int(input("Security (0/1): ")),
        'Amenities': int(input("Amenities (score 1-10): ")),
        'Facing': str(input("Facing (e.g., East/West): ")),
        'Owner_Type': str(input("Owner Type (e.g., Builder/Individual): "))
    }

    input_df = pd.DataFrame([input_data])

    # Ensure column types match training set
    for col in categorical_features:
        input_df[col] = input_df[col].astype(str)
    for col in numerical_features:
        input_df[col] = pd.to_numeric(input_df[col], errors='coerce')

    predicted_log_price = best_model.predict(input_df)[0]
    predicted_price = np.expm1(predicted_log_price)
    print(f"\nPredicted House Price: {predicted_price:.2f} Lakhs")

# Run prediction
if __name__ == '__main__':
    predict_price()

import joblib
joblib.dump(best_model,'xgb_model.pkl')

Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best Parameters: {'regressor__colsample_bytree': 0.8834959481464842, 'regressor__learning_rate': 0.012119891565915222, 'regressor__max_depth': 3, 'regressor__n_estimators': 148, 'regressor__subsample': 0.8574323980775167}
Test RMSE: 151.61 Lakhs
Test R² Score: -0.15

Enter the following details to predict price:


City:  mumbai
Property Type:  individual
BHK:  3
Furnished Status:  furnished
Size in SqFt:  3000
Year Built:  2010
Floor No:  12
Age of Property (years):  15
Nearby Schools (count):  3
Nearby Hospitals (count):  2
Public Transport Accessibility (score 1-10):  8
Parking Space (0/1):  1
Security (0/1):  1
Amenities (score 1-10):  8
Facing (e.g., East/West):  East
Owner Type (e.g., Builder/Individual):  Builder



Predicted House Price: 199.04 Lakhs


['xgb_model.pkl']