<h1 style='color:purple' align='center'>Data Science Regression Project: Predicting Home Prices in Bengaluru</h1>

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

<h2 style='color:blue'>Data Load: Load Bengaluru home prices into a dataframe</h2>

In [None]:
df = pd.read_csv("bengaluru_house_prices.csv")
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df['area_type'].unique()

In [None]:
df['area_type'].value_counts()

## Distribution of House Prices

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10,5))
sns.histplot(df['price'], bins=50, kde=True)
plt.title('Distribution of House Prices (Lakh ₹)')
plt.xlabel('Price (Lakh ₹)')
plt.ylabel('Count')
plt.grid(True)
plt.show()


## Area Type Counts

In [None]:
sns.countplot(x='area_type', data=df)
plt.title('Count of Properties by Area Type')
plt.xlabel('Area Type')
plt.ylabel('Number of Properties')
plt.xticks(rotation=45)
plt.show()


## Top 10 Most Common Locations

In [None]:
top_locations = df['location'].value_counts().head(10)
top_locations.plot(kind='bar', color='green')
plt.title('Top 10 Locations with Most Listings')
plt.xlabel('Location')
plt.ylabel('Number of Properties')
plt.xticks(rotation=45)
plt.show()


## Location vs Average Price (Top 20 Locations)

In [None]:
location_price = df.groupby('location')['price'].mean().sort_values(ascending=False).head(20)
plt.figure(figsize=(12, 6))
sns.barplot(x=location_price.values, y=location_price.index, palette='viridis')
plt.title('Top 20 Locations by Average Price')
plt.xlabel('Average Price (Lakh ₹)')
plt.ylabel('Location')
plt.grid(True)
plt.tight_layout()
plt.show()


## Price vs Total Sqft

In [None]:
# Clean total_sqft if necessary (some values are ranges or non-numeric)
def convert_sqft(x):
    """Convert a raw ``total_sqft`` value to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``, ``1200``) or a two-sided range
        written as ``"low - high"``.

    Returns
    -------
    float or None
        The numeric value, the midpoint of a ``low - high`` range, or ``None``
        when the value cannot be interpreted (e.g. ``"34.46Sq. Meter"``,
        ``None``, or a range with a non-numeric side).
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can carry a "low - high" range; anything else is unparseable.
    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) != 2:
        return None
    try:
        return (float(tokens[0]) + float(tokens[1])) / 2
    except ValueError:
        # One side of the range is not numeric (e.g. "1000 - abc").
        return None

# Plot from a converted copy instead of overwriting `df`: the raw frame keeps
# its original total_sqft strings, which the cleaning cells further down in
# this notebook inspect and convert themselves.
sqft_plot_df = (
    df.assign(total_sqft=df['total_sqft'].apply(convert_sqft))
      .dropna(subset=['total_sqft', 'price'])
)

# Plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x='total_sqft', y='price', data=sqft_plot_df, alpha=0.5, color='green')
plt.title('Price vs Total Sqft')
plt.xlabel('Total Sqft')
plt.ylabel('Price (Lakh ₹)')
plt.grid(True)
plt.show()


**Drop features that are not required to build our model (the drop itself is done after the two exploratory plots below)**

## Boxplot: Price Distribution by Number of Bathrooms

In [None]:
plt.figure(figsize=(8,4))
sns.boxplot(x='bath', y='price', data=df)
plt.title('Price Distribution by Number of Bathrooms')
plt.xlabel('Number of Bathrooms')
plt.ylabel('Price (Lakh ₹)')
plt.show()


## Top Availability Status Categories


In [None]:
sns.countplot(data=df, y='availability', order=df['availability'].value_counts().head(10).index, palette='viridis')
plt.title('Top Availability Status Categories')
plt.xlabel('Count')
plt.ylabel('Availability')
plt.show()


In [None]:
df1 = df.drop(['area_type','society','balcony','availability'],axis='columns')
df1.shape

<h2 style='color:blue'>Data Cleaning: Handle NA values</h2>

In [None]:
df1.isnull().sum()

In [None]:
df1.shape

In [None]:
df2 = df1.dropna()
df2.isnull().sum()

In [None]:
df2.shape

In [None]:
df2['size'].unique()

<h2 style='color:blue'>Feature Engineering</h2>

**Add new feature(integer) for bhk (Bedrooms Hall Kitchen)**

In [None]:
df2 = df2.copy()  # Create a copy of the DataFrame
df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0]))
df2.bhk.unique()

**Explore total_sqft feature**

In [None]:
df2['total_sqft'].unique()

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float(x)``, else False.

    Only conversion failures are treated as "not a float"; unrelated
    exceptions such as ``KeyboardInterrupt`` are allowed to propagate.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
2+3

In [None]:
df2[~df2['total_sqft'].apply(is_float)].head(10)

**The above shows that total_sqft can be a range (e.g. 2100-2850). For such cases we can just take the average of the min and max values in the range. There are other cases, such as 34.46Sq. Meter, which one could convert to square feet using unit conversion. I am going to just drop such corner cases to keep things simple**

In [None]:
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        A plain number, a numeric string, or a range string such as
        ``"2100 - 2850"``.

    Returns
    -------
    float or None
        The numeric value; the average of the two ends for a two-token range;
        ``None`` when the value cannot be converted (e.g. ``"34.46Sq. Meter"``).
    """
    try:
        if isinstance(x, str) and '-' in x:
            tokens = x.split('-')
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
        # Clean numeric string, or already a number. A string with '-' that is
        # not a two-token range fails here and is reported as None.
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None  # conversion failed


In [None]:
convert_sqft_to_num('2166')

In [None]:
convert_sqft_to_num('2100 - 2850')

In [None]:
convert_sqft_to_num('34.46Sq. Meter')

In [None]:
df3 = df2.copy()
df3.total_sqft = df3.total_sqft.apply(convert_sqft_to_num)
df3 = df3[df3.total_sqft.notnull()]
df3.head(3)

**For below row, it shows total_sqft as 2475 which is an average of the range 2100-2850**

In [None]:
df3.loc[30]

In [None]:
(2100+2850)/2

<h2 style="color:blue">Feature Engineering</h2>

**Add new feature called price per square feet**

In [None]:
df4 = df3.copy()
df4['price_per_sqft'] = df4['price']*100000/df4['total_sqft']
df4.head()

In [None]:
df4_stats = df4['price_per_sqft'].describe()
df4_stats

In [None]:
df4.to_csv("bhp.csv",index=False)

**Examine locations which is a categorical variable. We need to apply dimensionality reduction technique here to reduce number of locations**

In [None]:
len(df4.location.unique())

In [None]:
df4.location = df4.location.apply(lambda x: x.strip())

location_stats = df4['location'].value_counts(ascending=False)
location_stats

In [None]:
location_stats.values.sum()

In [None]:
len(location_stats[location_stats>10])

In [None]:
len(location_stats)

In [None]:
len(location_stats[location_stats<=10])

<h2 style="color:blue">Dimensionality Reduction</h2>

**Any location having 10 or fewer data points should be tagged as "other" location. This way the number of categories can be reduced by a huge amount. Later on, when we do one hot encoding, it will help us by having fewer dummy columns**

In [None]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

In [None]:
len(df4.location.unique())

In [None]:
df4.location = df4.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(df4.location.unique())


In [None]:
df4.head(10)

<h2 style="color:blue">Outlier Removal Using Business Logic</h2>

**As a data scientist, when you have a conversation with your business manager (who has expertise in real estate), he will tell you that normally the square footage per bedroom is at least 300 (i.e. a 2 bhk apartment is a minimum of 600 sqft). If you have, for example, a 400 sqft apartment with 2 bhk, then that seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum threshold per bhk at 300 sqft**

In [None]:
df4[df4.total_sqft/df4.bhk<300].head()

**Check above data points. We have 6 bhk apartment with 1020 sqft. Another one is 8 bhk and total sqft is 600. These are clear data errors that can be removed safely**

In [None]:
# Shape before the sqft-per-bhk filter; the next cell derives df5 from df4.
df4.shape

In [None]:
df5 = df4[~(df4.total_sqft/df4.bhk<300)]
df5.shape

<h2 style='color:blue'>Outlier Removal Using Standard Deviation and Mean</h2>

In [None]:
df5.price_per_sqft.describe()

**Here we find that min price per sqft is 267 rs/sqft whereas max is 12000000, this shows a wide variation in property prices. We should remove outliers per location using mean and one standard deviation**

In [None]:
def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately for each location.

    For every ``location`` group, rows are kept only when ``price_per_sqft``
    lies in the interval ``(mean - std, mean + std]`` of that group, where
    ``std`` is the population standard deviation (``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped by location in sorted order, with a fresh
        0..n-1 index. An input with no groups yields an empty frame with the
        same columns.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = pps.mean()
        st = pps.std(ddof=0)  # population std, same as np.std on a Series
        kept.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept:
        # pd.concat rejects an empty list; keep the column layout instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
df6 = remove_pps_outliers(df5)
df6.shape

**Let's check how the 2 BHK and 3 BHK property prices look for a given location**

In [None]:
def plot_scatter_chart(df,location):
    """Scatter-plot price against total_sqft for 2 BHK and 3 BHK homes in one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk``, ``total_sqft`` and ``price`` columns.
    location : str
        Location whose rows are plotted; also used as the chart title.
    """
    bhk2 = df[(df.location==location) & (df.bhk==2)]
    bhk3 = df[(df.location==location) & (df.bhk==3)]
    # Size only this figure rather than changing the global rcParams default,
    # so calling this function does not alter later plots.
    plt.figure(figsize=(15, 10))
    plt.scatter(bhk2.total_sqft,bhk2.price,color='blue',label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft,bhk3.price,marker='+', color='green',label='3 BHK', s=50)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.title(location)
    plt.legend()
    plt.show()
    
plot_scatter_chart(df6,"Rajaji Nagar")

In [None]:
plot_scatter_chart(df6,"Hebbal")

**We should also remove properties where for same location, the price of (for example) 3 bedroom apartment is less than 2 bedroom apartment (with same square ft area). What we will do is for a given location, we will build a dictionary of stats per bhk, i.e.**
```
{
    '1' : {
        'mean': 4000,
        'std': 2000,
        'count': 34
    },
    '2' : {
        'mean': 4300,
        'std': 2300,
        'count': 22
    },    
}
```
**Now we can remove those 2 BHK apartments whose price_per_sqft is less than mean price_per_sqft of 1 BHK apartment**

In [None]:
def remove_bhk_outliers(df):
    """Drop homes priced below the average of the next-smaller bhk in the same location.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` homes there, provided that smaller group has more than
    5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the excluded rows; the original index labels are
        preserved.
    """
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats is not None and stats['count'] > 5:
                below_smaller_mean = bhk_df.price_per_sqft < stats['mean']
                # Collect labels in a list so they keep their original dtype.
                exclude_labels.extend(bhk_df[below_smaller_mean].index)
    return df.drop(index=exclude_labels)
df7 = remove_bhk_outliers(df6)
# df7 = df6.copy()
df7.shape

**Plot same scatter chart again to visualize price_per_sqft for 2 BHK and 3 BHK properties**

In [None]:
plot_scatter_chart(df7,"Rajaji Nagar")

In [None]:
plot_scatter_chart(df7,"Hebbal")

**Based on above charts we can see that data points highlighted in red below are outliers and they are being removed due to remove_bhk_outliers function**

In [None]:
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df7.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")
plt.show()

<h2 style='color:blue'>Outlier Removal Using Bathrooms Feature</h2>

In [None]:
df7.bath.unique()

In [None]:
plt.hist(df7.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")
plt.show()

In [None]:
df7[df7.bath>10]

**It is unusual to have 2 more bathrooms than number of bedrooms in a home**

In [None]:
df7[df7.bath>df7.bhk+2]

**Again the business manager has a conversation with you (i.e. a data scientist) that if you have 4 bedroom home and even if you have bathroom in all 4 rooms plus one guest bathroom, you will have total bath = total bed + 1 max. Anything above that is an outlier or a data error and can be removed**

In [None]:
df8 = df7[df7.bath<df7.bhk+2]
df8.shape

In [None]:
df8.head(3)

In [None]:
df9 = df8.drop(['size','price_per_sqft'],axis='columns')
df9.head(3)

<h2 style='color:blue'>Use One Hot Encoding For Location</h2>

In [None]:
dummies = pd.get_dummies(df9.location)
dummies.head(3)

In [None]:
df10 = pd.concat([df9,dummies.drop('other',axis='columns')],axis='columns')
df10.head()

In [None]:
df11 = df10.drop('location',axis='columns')
df11.head(2)

<h2 style='color:blue'>Build a Model Now...</h2>

In [None]:
df11.shape

In [None]:
X = df11.drop(['price'],axis='columns')
X.head(3)

In [None]:
X.shape

In [None]:
y = df11.price
y.head(3)

In [None]:
len(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)


In [None]:
model.score(X_test,y_test)

<h2 style='color:blue'>Use ShuffleSplit cross validation (5 random 80/20 splits) to measure accuracy of our LinearRegression model</h2>

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

**We can see that in 5 iterations we get a score above 80% all the time. This is pretty good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose**

<h2 style='color:blue'>Find best model using GridSearchCV</h2>

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, ShuffleSplit
import pandas as pd

def find_best_model_using_gridsearchcv(X, y, random_state=0):
    """Grid-search several regressors and report the best configuration of each.

    Every candidate is tuned with ``GridSearchCV`` over the same
    ``ShuffleSplit`` (5 splits, 20% held out), so the scores are comparable.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    random_state : int, default 0
        Seed used for the CV splitter and for the estimators that have a
        random component (lasso with random selection, decision tree, random
        forest, gradient boosting), so repeated runs give the same table.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with columns ``model``, ``best_score`` and
        ``best_params``.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'positive': [True, False]
            }
        },
        'lasso': {
            # Seed matters only for selection='random'.
            'model': Lasso(random_state=random_state),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'ridge': {
            'model': Ridge(),
            'params': {
                'alpha': [1, 2, 5, 10]
            }
        },
        'elasticnet': {
            'model': ElasticNet(),
            'params': {
                'alpha': [0.1, 1, 5],
                'l1_ratio': [0.1, 0.5, 0.9]
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': [50, 100],
                'max_depth': [None, 10, 20]
            }
        },
        'gradient_boosting': {
            'model': GradientBoostingRegressor(random_state=random_state),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5]
            }
        },
        'svr': {
            'model': SVR(),
            'params': {
                'kernel': ['rbf', 'linear'],
                'C': [1, 10],
                'epsilon': [0.1, 0.2]
            }
        },
        'knn': {
            'model': KNeighborsRegressor(),
            'params': {
                'n_neighbors': [3, 5, 7]
            }
        }
    }

    scores = []
    # One splitter shared by all candidates so every model sees the same folds.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])


find_best_model_using_gridsearchcv(X,y)

**Based on above results we can say that LinearRegression gives the best score. Hence we will use that.**

## Evaluation Metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


## Actual vs Predicted Plot

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='royalblue')
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted House Prices")
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
plt.grid(True)
plt.show()


## Residual Plot:

In [None]:
residuals = y_test - y_pred

plt.figure(figsize=(8, 5))
plt.hist(residuals, bins=40, color='mediumseagreen', edgecolor='black')
plt.title("Residual Distribution")
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()


In [None]:
X.columns

In [None]:
np.where(X.columns=='2nd Phase Judicial Layout')[0][0]

<h2 style='color:blue'>Test the model for few properties</h2>

In [None]:
def predict_price(location,sqft,bath,bhk):
    """Predict the price of a home with the fitted notebook ``model``.

    Relies on the notebook-level ``X`` (training feature frame) and ``model``.
    The first three feature columns are filled with ``sqft``, ``bath`` and
    ``bhk``, and the one-hot column whose name equals ``location`` is set to 1.

    Parameters
    ----------
    location : str
        Location name. If no feature column has this name, all one-hot columns
        stay 0 instead of raising ``IndexError``.
    sqft, bath, bhk : numbers
        Values for the first, second and third feature columns.

    Returns
    -------
    The single predicted value returned by ``model.predict``.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # np.where yields an empty array when the location has no dummy column;
    # indexing [0][0] unconditionally would raise IndexError in that case.
    matches = np.where(X.columns==location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    # Pass a frame with the training column names so the feature names match
    # what the model was fitted on.
    features = pd.DataFrame([x], columns=X.columns)
    return model.predict(features)[0]

In [None]:
predict_price('1st Phase JP Nagar',1000, 2, 2)

In [None]:
predict_price('1st Phase JP Nagar',1000, 3, 3)

In [None]:
predict_price('Indira Nagar',1000, 2, 2)

In [None]:
predict_price('Indira Nagar',1000, 3, 3)

<h2 style='color:blue'>Export the tested model to a pickle file</h2>

In [None]:
import pickle
with open('bengaluru_home_prices_model.pickle','wb') as f:
    pickle.dump(model, f)



<h2 style='color:blue'>Export location and column information to a file that will be useful later on in our prediction application</h2>

In [None]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))