In [229]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os


In [231]:
# Step 1: Load Dataset
df = pd.read_csv("C:\\Users\\HP\\OneDrive\\Documents\\datasets\\Housing.csv")  
df.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [233]:
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [235]:
# Step 3: Clean Data
df = df[df['area'] < 10000]  # Remove huge areas


In [237]:
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

for col in binary_cols:
    df.loc[:, col] = df[col].map({'yes': 1, 'no': 0})


In [239]:
# Step 5: One-Hot Encode furnishingstatus
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)


In [241]:
# Step 6: Feature Engineering 
df['price_per_sqft'] = df['price'] / df['area']

In [243]:
# Step 7: Prepare Features and Target
X = df.drop('price', axis=1)
y = df['price']


In [245]:
# Step 8: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [247]:
# Step 9: Random Forest Regressor with Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# Best model after tuning
best_model = grid_search.best_estimator_
print(f" Best Parameters: {grid_search.best_params_}")


 Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [248]:
# Step 10: Model Evaluation
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n Mean Squared Error: {mse}")
print(f" R² Score (Accuracy): {r2}")

# Cross-validation score
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print(f" Cross-Validated R² Score: {cv_scores.mean():.2f}")



 Mean Squared Error: 371571755713.7052
 R² Score (Accuracy): 0.924451018248311
 Cross-Validated R² Score: -1.10


In [251]:
# Step 11: Save Trained Model
os.makedirs('model', exist_ok=True)
joblib.dump(best_model, 'model/house_price_model.pkl')
print("\n Model saved at: model/house_price_model.pkl")



 Model saved at: model/house_price_model.pkl


In [252]:
# Step 12: Example Prediction Input
new_data = {
    'area': 1300,
    'bedrooms': 3,
    'bathrooms': 2,
    'stories': 1,
    'mainroad': 1,
    'guestroom': 0,
    'basement': 0,
    'hotwaterheating': 0,
    'airconditioning': 0,
    'parking': 1,
    'prefarea': 1,
    'furnishingstatus_semi-furnished': 0,
    'furnishingstatus_unfurnished': 1,
    'price_per_sqft': 1300  # Example reasonable value
}

# Prepare input DataFrame
input_df = pd.DataFrame([new_data])

# Ensure all columns exist
for col in X.columns:
    if col not in input_df.columns:
        input_df[col] = 0

input_df = input_df[X.columns]  # Ensure column order


In [253]:
# Predict Price
predicted_price = best_model.predict(input_df)[0]
print(f"\n Predicted House Price: {predicted_price/1e5:.2f} Lakh")



 Predicted House Price: 30.00 Lakh
