In [21]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

In [22]:
# Load the raw listings CSV. The path is relative, so the file must sit in the
# notebook's working directory.
df = pd.read_csv("MagicBricks.csv")

In [23]:
# Preview the first rows to check the column set and spot missing values
# (the first row has no Per_Sqft value).
df.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0


In [24]:
# Discard the Parking / Furnishing / Type columns, then keep only rows that
# have a value in every remaining column.
df1 = df.drop(columns=["Parking", "Furnishing", "Type"])
df2 = df1.dropna(axis="index", how="any")
df2.head(5)

Unnamed: 0,Area,BHK,Bathroom,Locality,Price,Status,Transaction,Per_Sqft
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",5000000,Ready_to_move,New_Property,6667.0
2,950.0,2,2.0,"Citizen Apartment, Rohini Sector 13",15500000,Ready_to_move,Resale,6667.0
3,600.0,2,2.0,Rohini Sector 24,4200000,Ready_to_move,Resale,6667.0
4,650.0,2,2.0,Rohini Sector 24 carpet area 650 sqft status R...,6200000,Ready_to_move,New_Property,6667.0
5,1300.0,4,3.0,Rohini Sector 24,15500000,Ready_to_move,New_Property,6667.0


In [25]:
# Detect and remove outliers using IQR
def remove_outliers(df, column, whisker=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows (original index labels kept).
        Rows whose ``column`` value is NaN are dropped because they fail the
        range comparison.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    spread = q3 - q1
    lower_bound = q1 - whisker * spread
    upper_bound = q3 + whisker * spread

    inside_fences = df[column].between(lower_bound, upper_bound)
    # Copy so callers can add columns to the result without triggering
    # pandas' SettingWithCopyWarning.
    return df[inside_fences].copy()

# Remove outliers on 'Area' first, then on 'Price' (the Price fences are
# computed on the Area-filtered rows), and add price per square foot.
# Built as one chain with .assign so no column is written onto a filtered
# slice (avoids SettingWithCopyWarning) and the cell can be re-run safely.
#
# NOTE(review): Price_per_Sqft is derived from Price, which is later used as
# the prediction target. Keeping it as a model input leaks the target into
# the features; confirm whether it should be excluded before training.
df_cleaned = (
    remove_outliers(df2, 'Area')
    .pipe(remove_outliers, 'Price')
    .assign(Price_per_Sqft=lambda frame: frame['Price'] / frame['Area'])
)

# Check the cleaned data
df_cleaned.describe()


Unnamed: 0,Area,BHK,Bathroom,Price,Per_Sqft,Price_per_Sqft
count,901.0,901.0,901.0,901.0,901.0,901.0
mean,1136.082936,2.63485,2.342952,14728410.0,13811.629301,14045.994523
std,586.662491,0.874875,0.887586,12283960.0,17410.236897,23452.406271
min,28.0,1.0,1.0,1000000.0,1667.0,481.363367
25%,720.0,2.0,2.0,4620000.0,6111.0,6347.826087
50%,1044.0983,3.0,2.0,11500000.0,10000.0,10493.82716
75%,1500.0,3.0,3.0,22000000.0,15556.0,16283.524904
max,2970.0,7.0,6.0,52500000.0,183333.0,538461.538462


In [26]:
# Status and Transaction are text columns that are not part of the model's
# feature set, so drop them here.
# errors="ignore" keeps the cell idempotent: df_cleaned is reassigned from
# itself, so without it a second run of this cell raises KeyError because the
# columns are already gone.
df_cleaned = df_cleaned.drop(columns=["Status", "Transaction"], errors="ignore")
df_cleaned.head(1)

Unnamed: 0,Area,BHK,Bathroom,Locality,Price,Per_Sqft,Price_per_Sqft
1,750.0,2,2.0,"J R Designers Floors, Rohini Sector 24",5000000,6667.0,6666.666667


In [27]:
# Alphabetically ordered list of the distinct localities left after cleaning.
location_names = sorted(df_cleaned["Locality"].drop_duplicates().tolist())

In [28]:
# Target-encode Locality: every row receives the mean Price of its locality.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split done later, so held-out prices contribute to this feature.
df_cleaned["Locality_Encoded"] = (
    df_cleaned.groupby("Locality")["Price"].transform("mean")
)

# The model only sees the numeric encoding, not the raw locality text.
df_final = df_cleaned.drop(columns="Locality")


In [29]:
# Lookup table: locality name -> mean listing price in the cleaned data.
locality_price_map = (
    df_cleaned
    .groupby("Locality")["Price"]
    .agg("mean")
    .to_dict()
)


In [30]:
# Display the locality -> mean price mapping for a visual sanity check.
locality_price_map

{'Abhimanyu Apartments, Vasundhara Enclave': 11600000.0,
 'Abul Fazal Enclave Part 1, Okhla': 5183333.333333333,
 'Abul Fazal Enclave Part-II, Okhla': 4133333.3333333335,
 'Adarsh Homes, Dwarka Mor': 3000000.0,
 'Ahinsha Vatika, Ram Nagar, Shahdara': 7300000.0,
 'Alaknanda': 15262500.0,
 'Amar Colony, Lajpat Nagar': 13000000.0,
 'Andheria Mor, Mehrauli': 5700000.0,
 'Anekant Apartment, Vasundhara Enclave': 9800000.0,
 'Anupam Enclave, Saket': 19000000.0,
 'Apna Apartments, Savitri Nagar Village, Sheikh Sarai': 4600000.0,
 'Aravali Tower, Chhattarpur': 5990000.0,
 'Archana Apartment, Paschim Vihar': 8600000.0,
 'Arjun Nagar, Safdarjung Enclave': 5175000.0,
 'Ashirwad Apartments, Dwarka': 15600000.0,
 'Ashirwad Apartments, Dwarka Sector 12': 15500000.0,
 'Ashoka Apartment, Paschim Vihar Block A2': 6500000.0,
 'Ashoka Apartments, Dwarka Sector 12': 15800000.0,
 'Ashoka Enclave Apartment, Dwarka Sector 11': 19200000.0,
 'Balbir Nagar, Shahdara': 4500000.0,
 'Baljit Nagar, Patel Nagar': 155

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Target is the listing price; every other column of df_final is a feature.
# NOTE(review): the features include Price_per_Sqft (Price / Area) and
# Locality_Encoded (mean Price per locality over all rows). Both are derived
# from the target, so the R² printed below is likely optimistic.
y = df_final["Price"]
X = df_final.drop(columns="Price")

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34
)

# fit() returns the estimator itself, so rf_model is the trained forest.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print(f"R² Score: {r2_score(y_test, y_pred)}")


R² Score: 0.9690984234570781


In [32]:
# Show the feature names and their order; predict_price must supply values in this layout.
print(X.columns)



Index(['Area', 'BHK', 'Bathroom', 'Per_Sqft', 'Price_per_Sqft',
       'Locality_Encoded'],
      dtype='object')


In [33]:
import numpy as np

def predict_price(locality, area, bhk, bath):
    """Predict a listing price with the trained random forest.

    Parameters
    ----------
    locality : str
        Locality name. If it is not a key of ``locality_price_map``, the mean
        of all per-locality average prices is used instead.
    area : float
        Value for the ``Area`` feature.
    bhk : int
        Value for the ``BHK`` feature.
    bath : int or float
        Value for the ``Bathroom`` feature.

    Returns
    -------
    float
        The model's prediction rounded to 2 decimals.

    Notes
    -----
    ``Per_Sqft`` and ``Price_per_Sqft`` cannot be known for a new listing, so
    they are filled with the means of the training feature frame ``X``.
    NOTE(review): ``Price_per_Sqft`` was derived from the target price, so
    substituting a global mean here probably makes predictions depend mostly
    on ``area``; confirm whether these features should be in the model.
    """
    # Only compute the fallback when the locality is unknown.
    if locality in locality_price_map:
        locality_avg_price = locality_price_map[locality]
    else:
        locality_avg_price = np.mean(list(locality_price_map.values()))

    features = {
        "Area": area,
        "BHK": bhk,
        "Bathroom": bath,
        # Use mean values for features unavailable at prediction time
        "Per_Sqft": X["Per_Sqft"].mean(),
        "Price_per_Sqft": X["Price_per_Sqft"].mean(),
        "Locality_Encoded": locality_avg_price,
    }

    # One-row frame with the training column names and order. The model was
    # fitted on a DataFrame, so a bare ndarray triggers scikit-learn's
    # "X does not have valid feature names" warning and relies on positional
    # order silently matching.
    x = pd.DataFrame([features], columns=X.columns)

    return round(rf_model.predict(x)[0], 2)

# Smoke-test the function on three sample listings: (locality, area, bhk, bath)
for sample_listing in (
    ("Rohini Sector 25", 800, 3, 2),
    ("J R Designers Floors, Rohini Sector 24", 750, 2, 2),
    ("Citizen Apartment, Rohini Sector 13", 900, 2, 2),
):
    print(predict_price(*sample_listing))

10726900.0
9370300.0
13150000.0




In [39]:
import json

# Column listing written for downstream consumers: the three numeric inputs
# followed by every known locality name.
# NOTE(review): the trained model's features are Area, BHK, Bathroom,
# Per_Sqft, Price_per_Sqft and Locality_Encoded, not one column per locality;
# confirm the consumer only uses this file as a list of locality names.
columns_data = {"data_columns": ["Area", "BHK", "Bathroom", *location_names]}

with open("./columns.json", mode="w") as columns_file:
    json.dump(columns_data, columns_file)

print("✅ columns.json updated with real locations!")

✅ columns.json updated with real locations!


In [None]:
import pickle

# Persist the fitted random forest next to the notebook.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
model_path = './final_random_forest_model.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(rf_model, model_file)

print(f"Model saved at: {model_path}")


In [None]:
import json

# Export the locality -> mean price lookup so the encoding used by
# predict_price can be reproduced outside the notebook.
with open('locality_price_map.json', mode='w') as map_file:
    json.dump(locality_price_map, map_file, indent=4)

print("Locality price map saved as 'locality_price_map.json'")


In [None]:
# Final sanity check: list the feature columns, then peek at the first
# target-encoded locality values.
print(X.columns)
X["Locality_Encoded"].head()