In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [None]:

def load_data(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using ``read_csv``'s default options.
    """
    frame = pd.read_csv(filepath_or_buffer=path)
    return frame

# Load the raw listings; the relative path is resolved against the kernel's
# current working directory.
data = load_data("Bengaluru_House_Data.csv")
# Last expression of the cell: preview the first rows as the cell output.
data.head()


In [None]:

def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a float.

    Two input shapes are understood:

    * a plain number (``"1200"``, ``1200``, ``1200.0``) -> that number as float;
    * a range written with a dash (``"1000 - 1200"``) -> the mean of the first
      two dash-separated parts.

    Parameters
    ----------
    x : object
        The raw cell value.

    Returns
    -------
    float or None
        The parsed area, or ``None`` when the value cannot be interpreted as
        a number or a range (for example ``"34.46Sq. Meter"`` or ``None``).
    """
    try:
        text = str(x)
        if '-' in text:
            tokens = text.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    # Only conversion failures mean "unparseable". A bare ``except:`` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (ValueError, TypeError, OverflowError):
        return None


In [None]:

def clean_data(df):
    """Prepare the raw listings frame for outlier filtering and encoding.

    Steps, in order:

    1. Drop the ``society``, ``balcony`` and ``availability`` columns when
       they are present.
    2. Fill missing ``size`` values with ``'0 BHK'`` and parse the leading
       integer of ``size`` into a new ``bhk`` column.
    3. Convert ``total_sqft`` with ``convert_sqft_to_num`` and drop the rows
       it could not parse.
    4. Add ``price_per_sqft = price * 100000 / total_sqft``.
    5. Drop rows without a ``location`` and strip surrounding whitespace from
       the remaining location values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``size``, ``total_sqft``, ``price`` and
        ``location``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a ``size`` value does not start with an integer token.
    """
    # drop() hands back a new frame, so the column assignments below never
    # touch the caller's DataFrame.
    df = df.drop(columns=["society", "balcony", "availability"], errors='ignore')
    df['size'] = df['size'].fillna('0 BHK')
    df['bhk'] = df['size'].apply(lambda x: int(str(x).split(' ')[0]))
    df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

    # Take an explicit copy after filtering: assigning a column to a
    # boolean-indexed slice triggers SettingWithCopyWarning and leaves it
    # ambiguous which object receives the write.
    df = df[df['total_sqft'].notnull()].copy()
    # NOTE(review): the 100000 factor presumably converts a price quoted in
    # lakhs to rupees (the example cell prints the prediction in "Lakhs").
    df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']

    df = df.dropna(subset=['location']).copy()
    # astype(str) keeps a stray non-string location from raising
    # AttributeError; missing values were already removed above.
    df['location'] = df['location'].astype(str).str.strip()
    return df

# Rebind `data` to the cleaned frame; after this cell the raw frame is no
# longer reachable under this name.
data = clean_data(data)
# Last expression of the cell: preview the cleaned rows.
data.head()


In [None]:

def remove_outliers(df, max_bhk=20, min_sqft_per_bhk=300,
                    min_price_per_sqft=1000, max_price_per_sqft=10000):
    """Keep only the rows that pass three plausibility filters.

    A row is kept when all of the following hold:

    * ``bhk < max_bhk``
    * ``total_sqft / bhk >= min_sqft_per_bhk``
    * ``min_price_per_sqft <= price_per_sqft <= max_price_per_sqft``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``bhk``, ``total_sqft`` and ``price_per_sqft``.
    max_bhk : number, default 20
        Exclusive upper bound on ``bhk``.
    min_sqft_per_bhk : number, default 300
        Inclusive lower bound on area per bedroom.
    min_price_per_sqft, max_price_per_sqft : number, default 1000 / 10000
        Inclusive bounds on ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        A copy of the surviving rows with their original index labels.
    """
    # A bhk of 0 makes the ratio inf (NaN when total_sqft is 0 as well);
    # inf satisfies ">=", so such rows pass this particular filter.
    sqft_per_bhk = df['total_sqft'] / df['bhk']
    keep = (
        (df['bhk'] < max_bhk)
        & (sqft_per_bhk >= min_sqft_per_bhk)
        & (df['price_per_sqft'] >= min_price_per_sqft)
        & (df['price_per_sqft'] <= max_price_per_sqft)
    )
    # One mask instead of three chained slices; copy so later column
    # assignments on the result do not raise SettingWithCopyWarning.
    return df[keep].copy()

# Rebind `data` to the rows that pass the outlier filters.
data = remove_outliers(data)


In [None]:

def encode_features(df):
    """Replace the ``location`` column with indicator columns.

    ``pandas.get_dummies`` is called with ``drop_first=True``, so the first
    location category gets no column of its own and is represented by all
    indicators being zero/false.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``location`` column.

    Returns
    -------
    pandas.DataFrame
        Every other column of ``df`` followed by one indicator column per
        remaining location; ``df`` itself is left untouched.
    """
    location_flags = pd.get_dummies(df['location'], drop_first=True)
    other_columns = df.drop(columns='location')
    return pd.concat([other_columns, location_flags], axis=1)

# Rebind `data` to the encoded frame. Re-running this cell on its own raises
# KeyError: encode_features reads df['location'], which the first run removed.
data = encode_features(data)


In [None]:

def train_model(df):
    """Fit a ``LinearRegression`` that predicts ``price``.

    The feature matrix is built from ``df`` as follows:

    * ``price`` (the target) and ``price_per_sqft`` (computed from the
      target, and never supplied at prediction time) are excluded;
    * non-numeric columns, such as the raw ``size`` strings, are excluded
      because the estimator cannot consume them;
    * ``total_sqft``, ``bath`` and ``bhk`` are placed first, in that order,
      followed by the remaining columns in their existing order;
    * rows with a missing feature or target value are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded listings; must contain a ``price`` column.

    Returns
    -------
    (model, X) : tuple
        The fitted estimator and the full feature frame (before the
        train/test split), whose columns give the model's feature order.

    Raises
    ------
    KeyError
        If ``df`` has no ``price`` column.
    """
    y = df["price"]

    candidates = df.drop(columns=["price", "price_per_sqft"], errors="ignore")
    # Indicator columns may be bool depending on the pandas version, so keep
    # both numeric and bool dtypes.
    candidates = candidates.select_dtypes(include=["number", "bool"])

    leading = [name for name in ("total_sqft", "bath", "bhk")
               if name in candidates.columns]
    trailing = [name for name in candidates.columns if name not in leading]
    X = candidates[leading + trailing].astype(float)

    # LinearRegression rejects NaN, so keep only fully populated rows.
    complete = X.notnull().all(axis=1) & y.notnull()
    X = X[complete]
    y = y[complete]

    # The held-out 20% is not used here; the split (fixed seed) only decides
    # which rows the model is fitted on.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model, X

# Fit the model. X is kept because its column names are written out by the
# save_model call that follows.
model, X = train_model(data)


In [None]:

def save_model(model, X, filename_model, filename_columns):
    """Write the model and its feature-column list to disk.

    Parameters
    ----------
    model : object
        Any picklable object; written to ``filename_model`` with ``pickle``.
    X : pandas.DataFrame
        Only ``X.columns`` is used.
    filename_model : str
        Destination of the binary pickle file.
    filename_columns : str
        Destination of the JSON file, which holds
        ``{"data_columns": [...column names...]}``.
    """
    with open(filename_model, "wb") as model_file:
        pickle.dump(model, model_file)

    column_payload = dict(data_columns=list(X.columns))
    with open(filename_columns, "w") as columns_file:
        json.dump(column_payload, columns_file)

# Both relative paths resolve against the kernel's current working directory.
save_model(model, X, "banglore_home_prices_model.pickle", "columns.json")


In [None]:

def predict_price(input_dict, model, data_columns):
    """Predict the price for one listing.

    Parameters
    ----------
    input_dict : dict
        Must provide ``'total_sqft'``, ``'bath'``, ``'bhk'`` and
        ``'location'``.
    model : object
        Fitted estimator exposing ``predict(rows)``.
    data_columns : sequence of str
        Feature names in the order the model was trained on.

    Returns
    -------
    number
        The prediction, clamped to a minimum of 0 and rounded to 2 decimals.

    Raises
    ------
    KeyError
        If ``input_dict`` lacks one of the required keys.
    """
    columns = list(data_columns)
    x = np.zeros(len(columns))

    # Place each numeric feature in the slot that carries its name, so a
    # different column order cannot silently feed values into the wrong
    # features. When a name is absent from `columns`, fall back to the
    # original positional layout (0, 1, 2).
    for fallback_index, feature in enumerate(('total_sqft', 'bath', 'bhk')):
        slot = columns.index(feature) if feature in columns else fallback_index
        x[slot] = input_dict[feature]

    # A location without its own column leaves every location slot at 0.
    location = input_dict['location']
    if location in columns:
        x[columns.index(location)] = 1

    # A linear model can extrapolate below zero; clamp before rounding.
    return round(max(model.predict([x])[0], 0), 2)


In [None]:

# Example usage:
# Read the feature order back from the JSON file written by save_model.
with open("columns.json", "r") as f:
    data_columns = json.load(f)['data_columns']

# `model` is the estimator still held in memory from the training cell; it is
# not reloaded from the pickle file here.
price = predict_price({
    "location": "Whitefield",
    "total_sqft": 1600,
    "bath": 3,
    "bhk": 4
}, model, data_columns)

print("Predicted Price: ₹", price, "Lakhs")
