In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv('Bengaluru_House_Data.csv')
data.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'Bengaluru_House_Data.csv'

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.columns

In [None]:
data['area_type'].unique()

In [None]:
data['area_type'].value_counts()

### Removing the columns that are not required

In [None]:
# Keep only the columns used downstream. Take an explicit copy because later
# cells add and overwrite columns on data1; it should be an independent frame,
# not a selection of `data`.
data1 = data[['location','size','total_sqft','bath','balcony','price']].copy()
data1

## Data Cleaning

In [None]:
data1.isnull().sum()

In [None]:
# Drop every row that has a missing value in any of the kept columns.
# Rebind rather than mutate in place: data1 was derived from `data`, and
# in-place modification of a derived frame is what pandas warns about.
data1 = data1.dropna()
data1.shape

In [None]:
data1.isna().sum()

### Deriving a 'bhk' column from the 'size' column

In [None]:
def get_bhk(value):
    """Return the text before the first space of `value` (e.g. '2' for '2 BHK').

    The result is still a string; if `value` contains no space it is
    returned unchanged.
    """
    leading_token, _, _ = value.partition(' ')
    return leading_token
#get_bhk('2 BHK')

data1['bhk'] = data1['size'].apply(get_bhk)
data1

In [None]:
def isfloat(x):
    """Return True if `x` can be converted with float(), False otherwise.

    Only conversion failures are treated as "not a float"; unrelated
    exceptions such as KeyboardInterrupt are no longer swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text;
        # OverflowError: an int too large to represent as a float.
        return False
    return True

In [None]:
data1[data1['total_sqft'].apply(isfloat)]

In [None]:
data1[~data1['total_sqft'].apply(isfloat)]

In [None]:
def convert_sqft_to_num(x):
    """Convert a total_sqft entry to a float, or None if it cannot be parsed.

    Accepted forms:
      * a plain number ('2100', 2100, '-5')   -> that number as a float
      * a range 'low - high' ('1000 - 1200')  -> the midpoint of the range
    Anything else (e.g. '34.46Sq. Meter', 'abc-def', None) yields None
    instead of raising.
    """
    # Try the plain-number form first so a leading minus sign or an exponent
    # ('1e-5') is not mistaken for a range separator.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can carry the 'low - high' range form.
    if not isinstance(x, str):
        return None

    nums = x.split('-')
    if len(nums) != 2:
        return None
    try:
        return (float(nums[0]) + float(nums[1])) / 2
    except ValueError:
        # One side of the '-' was not numeric.
        return None

In [None]:
data1['total_sqft']=data1['total_sqft'].apply(convert_sqft_to_num)
data1

In [None]:
data1.loc[30]

In [None]:
data1['price_per_sqft'] = data1['price']*100000/data1['total_sqft']
data1

In [None]:
# Strip surrounding whitespace and store the result back, so that names
# differing only by leading/trailing spaces are counted as one location.
# (Previously the stripped Series was computed and then discarded.)
data1['location'] = data1['location'].str.strip()
loc_stats = data1['location'].value_counts()
loc_stats

## Dimensionality Reduction
Any location with 10 or fewer data points is tagged as the "Other" location. This greatly reduces the number of categories, so that when we one-hot encode the location later on we end up with far fewer dummy columns.

In [None]:
loc_stats_lt10 = loc_stats[loc_stats<=10]
loc_stats_lt10

In [None]:
def write_other(x):
    """Map a location name to 'Other' if it is listed in loc_stats_lt10.

    `in` on a pandas Series tests membership in its index, and
    loc_stats_lt10 is a slice of the value_counts() result, whose index
    holds the location names.
    """
    return 'Other' if x in loc_stats_lt10 else x
data1['location'] = data1['location'].apply(write_other)
data1.head(30)

In [None]:
len(data1.location.unique())

In [None]:
data1['bhk'] = data1['bhk'].astype(float)
data1

In [None]:
data1[data1.total_sqft/data1.bhk<300].head()

In [None]:
data1.shape

In [None]:
data2 = data1[~(data1.total_sqft/data1.bhk<300)]
data2.head()

In [None]:
data2.shape

In [None]:
data2['price_per_sqft'].describe()

In [None]:
def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately within each location.

    For every group of rows sharing a `location`, only rows whose
    `price_per_sqft` lies strictly between (mean - std) and (mean + std)
    of that group are kept. Mean and std come from np.mean / np.std applied
    to the group's `price_per_sqft` column. Rows where the comparison is
    False (including NaN values) are dropped.

    Returns a new DataFrame with a fresh 0..n-1 index; `df` is not modified.
    If `df` has no groups, an empty frame with the same columns is returned.
    """
    kept_frames = []
    for _, subdf in df.groupby('location'):
        pps = subdf.price_per_sqft
        m = np.mean(pps)
        st = np.std(pps)
        kept_frames.append(subdf[(pps > (m - st)) & (pps < (m + st))])

    if not kept_frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end instead of growing a frame inside the loop,
    # which copies all previously collected rows on every iteration.
    return pd.concat(kept_frames, ignore_index=True)
data3=remove_pps_outliers(data2)
data3.shape

### Removing Outliers in 'bhk'

In [None]:
import matplotlib.pyplot as plt
def plot_scatter_chart(df,location):
    """Scatter-plot price against total_sqft for 2 and 3 BHK rows of one location.

    df       : frame with `location`, `bhk`, `total_sqft` and `price` columns.
    location : value of `location` to plot; also used as the plot title.

    The figure size is set on this figure only, rather than by changing the
    global matplotlib rcParams, so other plots in the notebook are unaffected.
    """
    in_location = df[df.location == location]
    bhk2 = in_location[in_location.bhk == 2]
    bhk3 = in_location[in_location.bhk == 3]

    fig, ax = plt.subplots(figsize=(15, 10))
    ax.scatter(bhk2.total_sqft, bhk2.price, color='Blue', label='2 BHK', s=50)
    ax.scatter(bhk3.total_sqft, bhk3.price, color='green', marker='+', label='3 BHK', s=50)
    ax.set_xlabel('Total Square Foot')
    ax.set_ylabel('Price')
    ax.set_title(location)
    ax.legend()
plot_scatter_chart(data3,"Rajaji Nagar")

In [None]:
def remove_bhk_outliers(df):
    """Drop rows priced per sqft below the mean of the next-smaller bhk group.

    Within each `location`, a row with N bhk is removed when its
    `price_per_sqft` is lower than the mean `price_per_sqft` of the (N-1) bhk
    rows of the same location, provided that (N-1) group has more than 5 rows.

    Returns a new DataFrame with the offending index labels dropped; the
    remaining rows keep their original index labels. `df` is not modified.
    """
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        # Materialise the per-bhk groups once and reuse them for both passes.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                # Collect labels in a plain list so they keep their own dtype
                # (np.append onto an empty float array coerced them to float).
                exclude_labels.extend(
                    bhk_df[bhk_df.price_per_sqft < stats['mean']].index
                )
    return df.drop(exclude_labels, axis='index')

data4=remove_bhk_outliers(data3)
data4.shape

In [None]:
plot_scatter_chart(data4,"Rajaji Nagar")


## Outlier removal using bathroom feature

In [None]:
data2.bath.unique()

In [None]:
plt.rcParams['figure.figsize']=(20,15)
plt.hist(data4.bath,rwidth=0.6)
plt.xlabel("Number Of Bathroom")
plt.ylabel("Count")

In [None]:
data4[data4.bath>data4.bhk+2]

In [None]:
# Keep only rows that have fewer than (bhk + 2) bathrooms.
# NOTE(review): the preceding cell previews rows with bath > bhk + 2, but the
# strict '<' here also drops rows where bath == bhk + 2 — confirm whether
# '<=' was intended.
data5=data4[data4.bath<data4.bhk+2]
data5.shape

In [None]:
data6=data5.drop(['size','price_per_sqft'],axis='columns')
data6

In [None]:
dummies=pd.get_dummies(data6.location)
dummies.head(10)

In [None]:
data7=pd.concat([data6,dummies.drop('Other',axis='columns')],axis='columns')
data7.head()

In [None]:
data8=data7.drop('location',axis='columns')
data8.head()

In [None]:
data8.shape

In [None]:
X=data8.drop('price',axis='columns')
X.head()

In [None]:
y=data8.price

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for evaluation. The seed is fixed (same value as
# the ShuffleSplit used for cross-validation below) so the split, and hence
# the reported R^2 score, is reproducible on Restart & Run All.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

model=LinearRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

## Find best model using GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search three regressors and report the best score of each.

    Runs GridSearchCV over LinearRegression, Lasso and DecisionTreeRegressor
    with a 5-split ShuffleSplit (20% test size, random_state=0).

    Returns a DataFrame with one row per algorithm and the columns
    'model', 'best_score' and 'best_params'.
    """
    # The name of the squared-error criterion differs between scikit-learn
    # releases ('mse' in older ones, 'squared_error' in newer ones). Reading
    # the estimator's own default picks the spelling the installed version
    # accepts.
    squared_error_criterion = DecisionTreeRegressor().criterion

    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                # `normalize` was removed from LinearRegression in
                # scikit-learn 1.2; search over `fit_intercept` instead,
                # which every version supports.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : [squared_error_criterion,'friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

In [None]:
X

In [None]:
def price_predict(location,sqft,bath,balcony,bhk):
    """Predict a price with the fitted global `model` for a single property.

    Builds one feature row in the column order of the global `X`:
    positions 0-3 receive sqft, bath, balcony and bhk, and the one-hot
    column whose name equals `location` is set to 1.

    A `location` that has no column in `X` (for example one that was grouped
    into 'Other', whose dummy column was dropped) leaves every location
    column at 0 instead of raising IndexError.
    """
    x=np.zeros(len(X.columns))
    x[0]=sqft
    x[1]=bath
    x[2]=balcony
    x[3]=bhk

    # np.where returns an empty array when the location is unknown, so check
    # the size before indexing into it.
    matches=np.where(X.columns==location)[0]
    if matches.size > 0:
        x[matches[0]]=1

    # Predict from a DataFrame carrying the training column names, matching
    # how the model was fitted.
    features=pd.DataFrame([x],columns=X.columns)
    return model.predict(features)[0]

In [None]:
price_predict('1st Phase JP Nagar',1000,2,2,2)

In [None]:
price_predict('1st Phase JP Nagar',1000,2,3,3)

In [None]:
price_predict('Indira Nagar',1000,2,2,2)

## Exporting the tested model to a pickle file

In [None]:
import pickle
with open('banglore_house_prices_model.pickle','wb') as f:
    pickle.dump(model,f)

### Export location and column information to a file that will be useful later on in our prediction application

In [None]:
import json
# Persist the lower-cased feature column names, in model input order, as JSON.
lowercase_names = [name.lower() for name in X.columns]
columns = {'data_columns': lowercase_names}
with open("columns.json","w") as f:
    json.dump(columns, f)