# Housing Price Prediction

### Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the raw housing dataset from the local CSV file into a DataFrame.
data = pd.read_csv("Data/bengaluru_house_prices.csv")

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

In [None]:
# Remove the area_type, society, balcony and availability columns,
# then display the resulting frame.
data = data.drop(["area_type", "society", "balcony", "availability"], axis="columns")
data

### Data Cleaning

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Drop every row that has at least one missing value, then re-count the
# missing values per column to confirm none remain.
data = data.dropna()
data.isna().sum()

In [None]:
# List the distinct raw values of the 'size' column.
data['size'].unique()

In [None]:
# Work on an explicit copy so the new column is not written into a slice of
# an earlier frame.
data = data.copy()
# BHK is the integer that precedes the first space in the 'size' text.
data.loc[:, "BHK"] = data['size'].apply(lambda x: int(x.split(' ')[0]))

In [None]:
# List the distinct BHK values that were extracted.
data["BHK"].unique()

In [None]:
# List the distinct raw values of total_sqft.
data.total_sqft.unique()

In [None]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Only the exceptions that ``float()`` itself raises for bad input are
    treated as "not a float"; anything else (e.g. KeyboardInterrupt) is
    allowed to propagate instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
# Show the rows whose total_sqft value is rejected by is_float, i.e. cannot
# be passed to float() directly.
data[~data["total_sqft"].apply(is_float)]

In [None]:
def convert_sqft_to_num(x):
    """Convert a total_sqft entry to a float.

    Accepted forms:
      * anything ``float()`` can parse directly, e.g. ``"1200"``;
      * a range ``"low - high"``, which is converted to the midpoint.

    Returns:
        The numeric value, or ``None`` when the entry cannot be interpreted
        (for example ``"34.46Sq. Meter"``, a malformed range, or a
        non-string, non-numeric value).
    """
    # Plain numbers are the common case; try them first.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can hold a "low - high" range.
    if not isinstance(x, str):
        return None

    bounds = x.split('-')
    if len(bounds) != 2:
        return None
    try:
        low, high = float(bounds[0]), float(bounds[1])
    except ValueError:
        # A dash was present but one side is not numeric.
        return None
    return (low + high) / 2

In [None]:
# Copy first, then replace every total_sqft entry with the value returned by
# convert_sqft_to_num.
data = data.copy()
data['total_sqft'] = data['total_sqft'].apply(convert_sqft_to_num)

In [None]:
data

### Feature Engineering

In [None]:
# Take a fresh copy of the frame before adding engineered columns.
data = data.copy()

In [None]:
# Price per square foot = (price * 100000) / total_sqft.
# NOTE(review): the factor 100000 presumably converts a price quoted in
# lakhs into rupees — confirm against the dataset's units.
data['price_per_sqft'] = (data['price'] * 100000) / data['total_sqft']
data

In [None]:
# Number of distinct location values.
len(data['location'].unique())

In [None]:
# Strip leading/trailing whitespace so that differently padded spellings of
# the same location are grouped together.
data.location = data.location.apply(lambda x: x.strip())
# Number of rows per location, largest first (indexed by location name).
location_statistics = data.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_statistics

In [None]:
# How many locations have 10 or fewer rows.
len(location_statistics[location_statistics <= 10])

In [None]:
# Locations with 10 or fewer rows. Despite the "less_than_10" name, the
# filter is inclusive (<= 10).
location_statistics_less_than_10 = location_statistics[location_statistics <= 10]

In [None]:
# Relabel every rare location as 'Other'. `x in <Series>` tests membership in
# the Series index, which here holds the location names.
data.location = data.location.apply(lambda x: 'Other' if x in location_statistics_less_than_10 else x)

In [None]:
data

### Outlier Removal

In [None]:
# Show the rows with fewer than 300 square feet per BHK.
data[data.total_sqft/data.BHK < 300]

In [None]:
# Remove the rows with fewer than 300 square feet per BHK. Because the mask
# is negated, rows where the ratio is NaN are kept.
data = data[~(data.total_sqft/data.BHK < 300)]

In [None]:
data

In [None]:
# Summary statistics of price_per_sqft before outlier removal.
data.price_per_sqft.describe()

In [None]:
def remove_outliers(dataFrame):
    """Drop per-location price_per_sqft outliers.

    For every ``location`` group, only the rows whose ``price_per_sqft`` lies
    in the half-open band ``(mean - std, mean + std]`` of that group are kept
    (``mean``/``std`` as computed by ``np.mean``/``np.std`` on the group).

    Args:
        dataFrame: frame with at least ``location`` and ``price_per_sqft``
            columns.

    Returns:
        A new DataFrame with the surviving rows, ordered by location group
        and re-indexed from 0. If the input has no groups, an empty frame
        with the input's columns is returned.
    """
    kept_groups = []
    for _, group in dataFrame.groupby('location'):
        prices = group.price_per_sqft
        mean = np.mean(prices)
        std = np.std(prices)
        within_band = (prices > (mean - std)) & (prices <= (mean + std))
        kept_groups.append(group[within_band])

    # pd.concat rejects an empty list, so handle "no groups" explicitly.
    if not kept_groups:
        return dataFrame.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

In [None]:
# Apply the per-location price_per_sqft filter implemented by remove_outliers.
data = remove_outliers(data)

In [None]:
data

In [None]:
def plot_scatter_graph(data, location):
    """Scatter-plot price against total_sqft for 2 and 3 BHK rows of a location.

    Args:
        data: frame with ``location``, ``BHK``, ``total_sqft`` and ``price``
            columns.
        location: location value whose rows are plotted.

    Side effects:
        Sets the global matplotlib default figure size to (10, 6) and draws
        on the current pyplot figure; ``plt.show()`` is not called here.
    """
    in_location = data.location == location
    bhk_2 = data[in_location & (data.BHK == 2)]
    bhk_3 = data[in_location & (data.BHK == 3)]
    plt.rcParams['figure.figsize'] = (10, 6)
    plt.scatter(bhk_2.total_sqft, bhk_2.price, color='blue', label='2 BHK', s=50)
    plt.scatter(bhk_3.total_sqft, bhk_3.price, marker='+', color='green', label='3 BHK', s=50)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price")
    plt.title(location)
    plt.legend()

In [None]:
# Plot the 2 BHK vs 3 BHK listings of one example location.
plot_scatter_graph(data, "Yeshwanthpur")

In [None]:
def remove_bhk_outliers(data):
    """Drop listings priced below the average of the next-smaller BHK group.

    Within each ``location``, a row with ``BHK == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``BHK == n - 1`` rows of the same location, provided that smaller group
    has more than 5 rows.

    Args:
        data: frame with ``location``, ``BHK`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame without the removed rows; the original index labels
        of the remaining rows are preserved.
    """
    # Collect labels in a plain list so they keep their original type (a
    # float NumPy accumulator would coerce integer labels to float).
    labels_to_drop = []
    for _, location_frame in data.groupby('location'):
        # Materialise the BHK groups once; they are needed for both passes.
        bhk_groups = list(location_frame.groupby('BHK'))
        bhk_statistics = {
            bhk: {
                'mean': np.mean(bhk_frame.price_per_sqft),
                'count': bhk_frame.shape[0],
            }
            for bhk, bhk_frame in bhk_groups
        }
        for bhk, bhk_frame in bhk_groups:
            smaller = bhk_statistics.get(bhk - 1)
            if smaller is not None and smaller['count'] > 5:
                below_mean = (bhk_frame.price_per_sqft < smaller['mean']).to_numpy()
                labels_to_drop.extend(bhk_frame.index[below_mean])
    return data.drop(index=labels_to_drop)

In [None]:
# Apply the BHK-based price_per_sqft filter implemented by remove_bhk_outliers.
data = remove_bhk_outliers(data)

In [None]:
data

In [None]:
# Re-plot the same location to compare against the plot made before the
# BHK-based filtering.
plot_scatter_graph(data, "Yeshwanthpur")

In [None]:
# Histogram of price_per_sqft over the remaining rows.
plt.rcParams['figure.figsize'] = (10, 6)
plt.hist(data.price_per_sqft, rwidth=0.75)
plt.xlabel('Price Per Square Feet')
plt.ylabel('Count')
plt.show()

In [None]:
# List the distinct values of the bath column.
data.bath.unique()

In [None]:
# Histogram of the bath column. The x-axis is labelled for the plotted
# variable (it previously carried the price-per-square-foot label).
plt.rcParams['figure.figsize'] = (10, 6)
plt.hist(data.bath, rwidth=0.75)
plt.xlabel('Number of Bathrooms')
plt.ylabel('Count')
plt.show()

In [None]:
# Keep only the rows where bath is less than BHK + 2, then display the
# filtered frame.
data = data[data.bath < data.BHK + 2]
data

### Model Building

In [None]:
# Drop the 'size' and 'price_per_sqft' columns before building the model
# inputs ('BHK' was derived from 'size' earlier in the notebook).
data = data.drop(['size', 'price_per_sqft'], axis='columns')

In [None]:
data

In [None]:
# One-hot encode the location column (one indicator column per location).
dummies = pd.get_dummies(data.location)

In [None]:
# Append the location indicators, leaving out the 'Other' column; a row in
# 'Other' is therefore encoded as all remaining indicators being 0.
data = pd.concat([data, dummies.drop('Other', axis='columns')], axis='columns')

In [None]:
# Drop the original text 'location' column.
data = data.drop('location', axis='columns')

In [None]:
data

In [None]:
# Features are every column except 'price'; the target is 'price'.
X = data.drop('price', axis='columns')
y = data.price

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [None]:
# Linear regression estimator with default settings.
model_linear_regression = LinearRegression()

In [None]:
# Fit the model on the training split.
model_linear_regression.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split (for scikit-learn regressors, score() is R^2).
model_linear_regression.score(X_test, y_test)

In [None]:
# Cross-validation scheme: 5 random shuffles, each holding out 20% of the
# rows, with a fixed seed.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

In [None]:
# Score a fresh LinearRegression on each of the cv splits over the full X, y.
cross_val_score(LinearRegression(), X, y, cv=cv)

In [None]:
def find_best_model(X, y):
    """Compare several regressors by cross-validated score.

    Each candidate (linear regression, lasso, decision tree) is wrapped in a
    GridSearchCV over a 5-split ShuffleSplit with 20% test size. Every
    parameter grid is empty, so each estimator is evaluated with its default
    settings only.

    Args:
        X: feature matrix passed to ``GridSearchCV.fit``.
        y: target values passed to ``GridSearchCV.fit``.

    Returns:
        DataFrame with one row per candidate and the columns ``model`` and
        ``best_score``.
    """
    # (name, estimator, parameter grid) for every candidate, in evaluation order.
    candidates = [
        ('linear_regression', LinearRegression(), {}),
        ('lasso', Lasso(), {}),
        ('decision_tree', DecisionTreeRegressor(), {}),
    ]
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    rows = []
    for name, estimator, param_grid in candidates:
        search = GridSearchCV(estimator, param_grid, cv=splitter, return_train_score=False)
        search.fit(X, y)
        rows.append({'model': name, 'best_score': search.best_score_})

    return pd.DataFrame(rows, columns=['model', 'best_score'])

In [None]:
# Compare the candidate models on the full feature matrix and target.
find_best_model(X, y)