# In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import json

# Load and clean data
def load_and_process_data(csv_path="Bengaluru_House_Data.csv"):
    """Load the house-price CSV and return a cleaned, model-ready DataFrame.

    Pipeline, in order:

    1. Drop the ``society``, ``area_type``, ``balcony`` and ``availability``
       columns, then drop every row that still has a missing value.
    2. Derive ``bhk`` from the leading integer of ``size`` and convert
       ``total_sqft`` to a number (ranges "a - b" become their midpoint;
       values that cannot be parsed are discarded).
    3. Strip whitespace from ``location`` and relabel every location that
       occurs 10 times or fewer as ``'other'``.
    4. Remove rows with less than 300 sqft per bedroom, per-location
       price-per-sqft outliers, bedroom-count price outliers, and rows
       where ``bath >= bhk + 2``.
    5. One-hot encode ``location`` (the ``'other'`` column is dropped) and
       remove the helper columns ``size``, ``price_per_sqft`` and
       ``location``.

    Args:
        csv_path: Path or file-like object handed to ``pandas.read_csv``.
            Defaults to the file name the script has always used.

    Returns:
        DataFrame with the remaining source columns (``total_sqft``,
        ``bath``, ``price``), the derived ``bhk`` column, and one indicator
        column per retained location.
    """
    raw = pd.read_csv(csv_path)
    listings = (
        raw.drop(columns=['society', 'area_type', 'balcony', 'availability'])
        .dropna()
        .copy()
    )
    listings['bhk'] = listings['size'].apply(
        lambda s: int(s.split(' ')[0]) if isinstance(s, str) else None
    )

    def convert_sqft_to_num(value):
        """Parse one total_sqft cell; return NaN when it is not numeric."""
        if not isinstance(value, str):
            return value
        parts = value.split('-')
        # Both the range form and the plain form sit inside the try so that a
        # malformed range (e.g. with unit suffixes) is discarded, not fatal.
        try:
            if len(parts) == 2:
                return (float(parts[0]) + float(parts[1])) / 2
            return float(value)
        except ValueError:
            return np.nan

    listings['total_sqft'] = listings['total_sqft'].apply(convert_sqft_to_num)
    # .copy() so the column assignments below act on an independent frame.
    listings = listings[listings['total_sqft'].notnull()].copy()
    # The multiplication by 100000 rescales price before dividing by area.
    listings['price_per_sqft'] = listings['price'] * 100000 / listings['total_sqft']

    listings['location'] = listings['location'].apply(lambda name: name.strip())
    location_counts = listings['location'].value_counts()
    rare_locations = set(location_counts[location_counts <= 10].index)
    listings['location'] = listings['location'].apply(
        lambda name: 'other' if name in rare_locations else name
    )

    # Keep rows unless they have under 300 sqft per bedroom.
    listings = listings[~(listings['total_sqft'] / listings['bhk'] < 300)]

    def remove_pps_outliers(df):
        """Keep rows whose price_per_sqft lies in (mean - std, mean + std]
        of their own location group."""
        kept = []
        for _, group in df.groupby('location'):
            mean = np.mean(group.price_per_sqft)
            std = np.std(group.price_per_sqft)
            in_band = (group.price_per_sqft > (mean - std)) & (
                group.price_per_sqft <= (mean + std)
            )
            kept.append(group[in_band])
        if not kept:
            # Nothing to concatenate: return an empty frame that still has
            # the columns the later steps look up.
            return df.iloc[0:0].reset_index(drop=True)
        # One concat at the end instead of growing a frame inside the loop.
        return pd.concat(kept, ignore_index=True)

    listings = remove_pps_outliers(listings)

    def remove_bhk_outliers(df):
        """Within each location, drop rows priced per sqft below the mean of
        the (bhk - 1) group, provided that group has more than 5 rows."""
        to_drop = []
        for _, location_df in df.groupby('location'):
            bhk_groups = list(location_df.groupby('bhk'))
            bhk_stats = {
                bhk: {
                    'mean': np.mean(bhk_df.price_per_sqft),
                    'count': bhk_df.shape[0],
                }
                for bhk, bhk_df in bhk_groups
            }
            for bhk, bhk_df in bhk_groups:
                smaller = bhk_stats.get(bhk - 1)
                if smaller and smaller['count'] > 5:
                    cheaper = bhk_df[bhk_df.price_per_sqft < smaller['mean']]
                    to_drop.extend(cheaper.index.tolist())
        return df.drop(index=to_drop)

    listings = remove_bhk_outliers(listings)
    listings = listings[listings.bath < listings.bhk + 2]
    features = listings.drop(columns=['size', 'price_per_sqft'])

    dummies = pd.get_dummies(features['location'])
    # 'other' is left out of the indicators; errors='ignore' covers data in
    # which no location was bucketed as 'other'.
    dummies = dummies.drop(columns='other', errors='ignore')
    return pd.concat([features.drop(columns='location'), dummies], axis='columns')

# Build the cleaned dataset, then separate the target from the features
data = load_and_process_data()
y = data['price']
X = data.drop(columns=['price'])
# Column order is recorded so predictions can rebuild an identical layout
feature_columns = X.columns.tolist()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Fit a linear regression on the training portion
model = LinearRegression()
model.fit(X_train, y_train)

# Persist the fitted model and the feature column order
with open('model_pickle.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('columns.json', 'w') as columns_file:
    json.dump(feature_columns, columns_file)

def predict_price(location, sqft, bath, bhk):
    """Predict a price with the module-level ``model``.

    A single-row frame is built with the same columns, in the same order,
    as ``feature_columns``. The numeric inputs are written by column name
    and the indicator column matching ``location`` is set to 1.

    Args:
        location: Location name. If it does not match a location indicator
            column, every indicator stays 0.
        sqft: Value for the ``total_sqft`` feature.
        bath: Value for the ``bath`` feature.
        bhk: Value for the ``bhk`` feature.

    Returns:
        The first element of ``model.predict`` for the constructed row.

    Raises:
        ValueError: If ``feature_columns`` lacks one of ``total_sqft``,
            ``bath`` or ``bhk``.
    """
    numeric_inputs = (('total_sqft', sqft), ('bath', bath), ('bhk', bhk))
    numeric_names = {name for name, _ in numeric_inputs}

    x = pd.DataFrame([np.zeros(len(feature_columns))], columns=feature_columns)
    # Resolve each numeric feature by name so the row stays correct even if
    # the column order changes; list.index raises ValueError when a feature
    # is missing instead of silently writing to the wrong column.
    for name, value in numeric_inputs:
        x.iloc[0, feature_columns.index(name)] = value

    # A location string equal to a numeric feature name must not overwrite
    # that feature with 1, so only non-numeric columns count as locations.
    if location in feature_columns and location not in numeric_names:
        x.iloc[0, feature_columns.index(location)] = 1
    return model.predict(x)[0]


# Example usage: one prediction, with each input named explicitly
price = predict_price(
    location='1st Phase JP Nagar',
    sqft=1111,
    bath=4,
    bhk=7,
)
print(f"Predicted price: {price}")


# Output: Predicted price: 92.68255105209798
