In [1]:
import pandas as pd
import numpy as np

# Read the raw listings CSV into `data` and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="bengaluru_house_prices.csv")
data.head(n=5)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [2]:
# Drop rows with missing values, but only in the columns the model consumes.
# A blanket dropna() would also throw away every listing whose unused fields
# are blank (e.g. `society`, which is empty in two of the five preview rows).
required_columns = ["location", "size", "total_sqft", "bath", "price"]
data_cleaned = data.dropna(subset=required_columns)

# Remove rows with ambiguous data: studio listings and listings reporting
# 10 or more bathrooms.
is_studio = data_cleaned["size"].str.contains("Studio", na=False)
has_plausible_bath = data_cleaned["bath"] < 10

# .copy() makes data_cleaned an independent frame, so adding columns to it
# later does not write into a slice of `data` (SettingWithCopyWarning).
data_cleaned = data_cleaned[~is_studio & has_plausible_bath].copy()


In [3]:
# Convert size into number of bedrooms: the leading token of strings such as
# "2 BHK" or "4 Bedroom". `assign` builds a new frame instead of writing a
# column into a filtered slice, which avoids SettingWithCopyWarning and keeps
# this cell safe to re-run.
data_cleaned = data_cleaned.assign(
    bhk=data_cleaned["size"].apply(lambda x: int(x.split(" ")[0]))
)

# Keep only required columns
df_model = data_cleaned[["total_sqft", "bath", "bhk", "location", "price"]]

# Parse total_sqft as a number. Values that are not a single number become NaN
# rather than crashing the conversion: the previous digits-and-dots check let
# strings like "1.2.3" through to astype(float), which then raised.
sqft = pd.to_numeric(df_model["total_sqft"], errors="coerce").astype(float)
df_model = df_model.assign(total_sqft=sqft)

# Filter out extreme prices and areas. Comparisons against NaN are False, so
# rows whose area could not be parsed are dropped by the same mask; negative
# areas are excluded as they were by the previous digit-only check.
keep = (sqft >= 0) & (sqft < 10000) & (df_model["price"] < 200)
df_model = df_model[keep]

In [4]:
# Expand `location` into one indicator column per distinct value.
dummies = pd.get_dummies(df_model["location"])

# Replace the raw location column with its indicator columns, then separate
# the predictors (X) from the target (y = price).
numeric_part = df_model.drop(columns=["location"])
df_final = pd.concat([numeric_part, dummies], axis=1)
X = df_final.drop(columns=["price"])
y = df_final["price"]


In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares regressor on the full feature matrix.
# `fit` returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X, y)


In [None]:
import pickle

# Serialize the fitted model to disk in binary pickle format.
with open("bangalore_home_prices_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)


In [None]:
import json

# Record the column names of X, in order, under the "data_columns" key.
columns = {"data_columns": X.columns.tolist()}

# Write the mapping out as JSON.
with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)
