Mumbai House Price Prediction Model

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("house_price_mumbai.csv")

data = df.drop(columns="project")
data.head()

Unnamed: 0,BHK,Location,City,Total sqft,price_sqft,price
0,3 BHK Apartment,Chembur,Mumbai,984,31000,3.05 Cr
1,2 BHK Apartment,Kurla,Mumbai,598,23913,1.42 Cr
2,2 BHK Apartment,Malad West,Mumbai,738,21000,1.54 Cr
3,3 BHK Apartment,Rasayani,Mumbai,644,10676,68.75 L
4,2 BHK Apartment,Vikhroli,Mumbai,582,24914,1.45 Cr


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3980 entries, 0 to 3979
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   BHK         3980 non-null   object
 1   Location    3980 non-null   object
 2   City        3980 non-null   object
 3   Total sqft  3980 non-null   int64 
 4   price_sqft  3980 non-null   object
 5   price       3980 non-null   object
dtypes: int64(1), object(5)
memory usage: 186.7+ KB


In [4]:
def convert_price(value):
    value = value.strip()
    if 'Cr' in value:
        return float(value.replace('Cr', '').strip()) * 1_00_00_000
    elif 'L' in value:
        return float(value.replace('L', '').strip()) * 1_00_000
    else:
        return float(value)  # If already numeric
    
def string_to_int(s):
    return sum(ord(ch) for ch in str(s))


data['Location'] = data['Location'].apply(string_to_int).astype('int64') # Converts the string of location to int by converting it to ASCII
data['City'] = data['City'].apply(string_to_int).astype('int64')
data['Type'] = data['BHK'].apply(lambda x: 'BHK' if 'BHK' in x else x)
data['Type'] = data['BHK'].apply(lambda x: 'RK' if 'RK' in x else x)
data['Type'] = data['BHK'].apply(lambda x: 'R' if 'R' in x else x)
# Extract numeric values (RK gets 0 here)
data['BHK'] = data['BHK'].str.extract(r'(\d+)').fillna(0).astype(int)

# Apply conversion
data['price'] = data['price'].apply(convert_price).astype('int64')

In [5]:
data


Unnamed: 0,BHK,Location,City,Total sqft,price_sqft,price,Type
0,3,710,603,984,31000,30500000,3 BHK Apartment
1,2,511,603,598,23913,14200000,2 BHK Apartment
2,2,930,603,738,21000,15400000,2 BHK Apartment
3,3,824,603,644,10676,6875000,3 BHK Apartment
4,2,840,603,582,24914,14500000,2 BHK Apartment
...,...,...,...,...,...,...,...
3975,2,500,603,966,4968,4800000,2 BHK Apartment
3976,1,1130,603,500,5200,2600000,1 BHK Apartment
3977,1,1130,603,610,5573,3400000,1 BHK Apartment
3978,1,1130,603,610,5245,3200000,1 BHK Apartment


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3980 entries, 0 to 3979
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   BHK         3980 non-null   int64 
 1   Location    3980 non-null   int64 
 2   City        3980 non-null   int64 
 3   Total sqft  3980 non-null   int64 
 4   price_sqft  3980 non-null   object
 5   price       3980 non-null   int64 
 6   Type        3980 non-null   object
dtypes: int64(5), object(2)
memory usage: 217.8+ KB


In [7]:
stats = data["price"].describe().reset_index()
stats["price"] = round(stats["price"], 1)

In [8]:
stats

Unnamed: 0,index,price
0,count,3980.0
1,mean,11224117.8
2,std,17087431.0
3,min,125000.0
4,25%,2829500.0
5,50%,6400000.0
6,75%,13300000.0
7,max,300000000.0


In [9]:
data.isna().sum()
# if there are NA values use data.dropna(inplace = True)

BHK           0
Location      0
City          0
Total sqft    0
price_sqft    0
price         0
Type          0
dtype: int64

In [10]:
data.duplicated().sum()
data.drop_duplicates(inplace=True)
# removing duplicate values

In [11]:
data.duplicated().sum()

np.int64(0)

In [12]:
data.columns

Index(['BHK', 'Location', 'City', 'Total sqft', 'price_sqft', 'price', 'Type'], dtype='object')

In [13]:
data.groupby("BHK")["price"].mean().sort_values(ascending=False)

BHK
6    1.325000e+08
4    6.110769e+07
5    3.928571e+07
3    2.850051e+07
2    1.355303e+07
1    5.571345e+06
0    1.795281e+06
Name: price, dtype: float64

In [14]:
data

Unnamed: 0,BHK,Location,City,Total sqft,price_sqft,price,Type
0,3,710,603,984,31000,30500000,3 BHK Apartment
1,2,511,603,598,23913,14200000,2 BHK Apartment
2,2,930,603,738,21000,15400000,2 BHK Apartment
3,3,824,603,644,10676,6875000,3 BHK Apartment
4,2,840,603,582,24914,14500000,2 BHK Apartment
...,...,...,...,...,...,...,...
3973,1,961,603,670,5223,3500000,1
3974,1,500,603,670,5223,3500000,1 BHK Apartment
3975,2,500,603,966,4968,4800000,2 BHK Apartment
3978,1,1130,603,610,5245,3200000,1 BHK Apartment


In [15]:
data.columns

Index(['BHK', 'Location', 'City', 'Total sqft', 'price_sqft', 'price', 'Type'], dtype='object')

In [16]:
X = data[['BHK', 'Location', 'City', 'Total sqft']]
y = data[["price"]]

In [17]:
X

Unnamed: 0,BHK,Location,City,Total sqft
0,3,710,603,984
1,2,511,603,598
2,2,930,603,738
3,3,824,603,644
4,2,840,603,582
...,...,...,...,...
3973,1,961,603,670
3974,1,500,603,670
3975,2,500,603,966
3978,1,1130,603,610


In [18]:
y

Unnamed: 0,price
0,30500000
1,14200000
2,15400000
3,6875000
4,14500000
...,...
3973,3500000
3974,3500000
3975,4800000
3978,3200000


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
from sklearn.tree import DecisionTreeRegressor

param_grid = {
    "criterion": ["mse", "friedman_mse"],
    "splitter": ["best", "random"],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2,5,10],
    "min_samples_leaf": [1,2,4]
}

In [22]:
tree_model = DecisionTreeRegressor()

In [23]:
grid_tree = GridSearchCV(estimator = tree_model, param_grid=param_grid)

In [24]:
grid_tree.fit(X_train, y_train)

540 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\prasa\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\prasa\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\prasa\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_par

In [25]:
grid_tree.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 50,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'splitter': 'best'}

In [26]:
tree_pred = grid_tree.predict(X_test)

In [27]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# mean_squared_error(y_test, tree_pred)
mean_absolute_error(y_test, tree_pred)

4479647.271044997

In [28]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [29]:
lr.fit(X_train, y_train)

In [30]:
predlr = lr.predict(X_test)

In [31]:
mean_absolute_error(y_test, predlr)

7373196.375307114

In [32]:
from sklearn.ensemble import RandomForestRegressor
rfrmodel = RandomForestRegressor()

In [33]:
param_gridrfr = {
    "max_depth": [5,10,15],
    "n_estimators": [2,3,4,5,6,7,8,9,10]
}

In [34]:
gridrfr = GridSearchCV(rfrmodel, param_gridrfr)

In [35]:
gridrfr.fit(X_train, y_train.values.ravel())

In [36]:
gridrfr.best_params_

{'max_depth': 15, 'n_estimators': 3}

In [37]:
rfrpred = gridrfr.predict(X_test)

In [38]:
mean_absolute_error(y_test, rfrpred)

4727539.032475039

In [39]:
gridrfr

In [42]:
import joblib

joblib.dump(gridrfr, "model.pkl")

['model.pkl']

In [41]:
X.columns

Index(['BHK', 'Location', 'City', 'Total sqft'], dtype='object')