# <center> Machine Hack Challenge </center>
# <center> Predicting House Prices In Bengaluru </center>

## Load libraries

In [43]:
import re
import pickle
import numpy as np
import pandas as pd

import sklearn
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

import xgboost
import lightgbm

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Path to the datasets

In [2]:
# Dataset locations; every file lives directly under the PATH directory.
PATH = 'Dataset/'
PATH_TO_train_data = '{}train.csv'.format(PATH)
PATH_TO_test_data = '{}test.csv'.format(PATH)
PATH_TO_sample_submission = '{}sample_submission.xlsx'.format(PATH)

## Preprocessing

In [3]:
# Multiplier that converts one unit of each supported area unit to square feet.
# NOTE(review): a cent is conventionally 1/100 acre (435.6 sq ft); the factor
# below is kept exactly as it was so existing results do not change -- confirm.
_SQFT_PER_UNIT = {
    'Sq. Meter': 10.7639,
    'Sq. Yards': 9.0,
    'Perch': 272.25,
    'Acres': 43560.0,
    'Cents': 435.61545,
    'Guntha': 1089.0,
    'Grounds': 2400.0,
}

# Group 1: everything up to and including the last digit (the number).
# Group 2: the trailing non-digit text (the unit name).
_AREA_WITH_UNIT = re.compile(r'(.*\d)(\D*)$')


def _convert_area_with_unit(text):
    """Convert a string such as '34.46Sq. Meter' to square feet."""
    match = _AREA_WITH_UNIT.match(text)
    if match is None:
        raise ValueError('cannot parse an area from {!r}'.format(text))

    number_text = match.group(1)
    unit = match.group(2).strip()
    try:
        factor = _SQFT_PER_UNIT[unit]
    except KeyError:
        raise ValueError(
            'unknown area unit {!r} in {!r}'.format(unit, text)) from None
    return float(number_text) * factor


def preprocess_total_sqft(my_list):
    """Convert one split 'total_sqft' entry to a float number of square feet.

    Parameters
    ----------
    my_list : list of str
        The raw cell value after ``str.split('-')``. One element means a
        single value (plain number, or number followed by a unit name);
        two or more elements mean a range, of which the first two are used.
        A float NaN (what ``Series.str.split`` yields for a missing cell)
        is passed through unchanged.

    Returns
    -------
    float
        The area in square feet; the midpoint for a range; NaN for a
        missing cell.

    Raises
    ------
    ValueError
        If the list is empty, the text cannot be parsed, or the unit is not
        one of the supported units.
    """
    # Missing cells arrive as float NaN rather than a list (NaN != NaN).
    if isinstance(my_list, float) and my_list != my_list:
        return float('nan')
    if not my_list:
        raise ValueError('empty total_sqft entry')

    # A range such as '1133 - 1384' is represented by its midpoint.
    if len(my_list) > 1:
        return (float(my_list[0]) + float(my_list[1])) / 2.0

    text = my_list[0]
    try:
        return float(text)
    except ValueError:
        # Not a plain number: expect '<number><unit>' and convert the unit.
        return _convert_area_with_unit(text)

In [4]:
# Load train.csv into a DataFrame.
train_data = pd.read_csv(PATH_TO_train_data)

In [5]:
# Load test.csv now: the label encoders below are fitted on the train and
# test values together, so the test frame must exist before test preprocessing.
test_data = pd.read_csv(PATH_TO_test_data)

In [6]:
train_data.shape

(13320, 9)

In [7]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [8]:
train_data.area_type.value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

### Convert the categorical values in the column 'area_type' into numerical data.
- There are 4 unique categories, so replace them with the integer codes 0-3.

In [9]:
# Area-type labels in code order (0..3). Each label has two spaces before
# 'Area', matching the spelling shown by value_counts() on the raw column.
area_type_labels = ['Super built-up  Area', 'Built-up  Area', 'Plot  Area', 'Carpet  Area']
replace_area_type = {label: code for code, label in enumerate(area_type_labels)}
train_data['area_type'] = train_data['area_type'].map(replace_area_type)

In [10]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,0,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


### Convert the categorical values in the column 'availability' into 3 categories.
- Ready to move
- Immediate Possession
- Others

In [11]:
def replace_availabilty(my_string):
    """Map an availability label to a category code.

    Returns 0 for 'Ready To Move', 1 for 'Immediate Possession' and 2 for
    any other value (for example a date such as '19-Dec').
    """
    known_labels = ('Ready To Move', 'Immediate Possession')
    for code, label in enumerate(known_labels):
        if my_string == label:
            return code
    return 2

In [12]:
# Collapse the raw availability labels into the three category codes.
availability_codes = train_data['availability'].apply(replace_availabilty)
train_data['availability'] = availability_codes

In [13]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,1,0,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,0,0,Kothanur,2 BHK,,1200,2.0,1.0,51.0


### Preprocess the column 'location'

In [14]:
train_data[~train_data.location.notnull()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
568,0,0,,3 BHK,Grare S,1600,3.0,2.0,86.0


In [15]:
# Give rows without a location an explicit placeholder label.
location_is_missing = train_data['location'].isnull()
train_data.loc[location_is_missing, 'location'] = 'Location not provided'

### Preprocess the column 'size'
- Convert all the categories into numeric data using LabelEncoder.

In [16]:
# Encode 'size' (e.g. '2 BHK', '4 Bedroom') as integer labels.
# The encoder is fitted on the train and test values together so that
# transforming the test set later never meets an unseen label. astype('str')
# turns missing values into the literal string 'nan', which gets its own class.
# pd.concat is used because Series.append was removed in pandas 2.0.
size_encoder = LabelEncoder()
all_sizes = pd.concat([train_data['size'], test_data['size']]).astype('str')
size_encoder.fit(all_sizes)
train_data['size'] = size_encoder.transform(train_data['size'].astype('str'))

In [17]:
size_encoder.classes_

array(['1 BHK', '1 Bedroom', '1 RK', '10 BHK', '10 Bedroom', '11 BHK',
       '11 Bedroom', '12 Bedroom', '13 BHK', '14 BHK', '16 BHK',
       '16 Bedroom', '18 Bedroom', '19 BHK', '2 BHK', '2 Bedroom',
       '27 BHK', '3 BHK', '3 Bedroom', '4 BHK', '4 Bedroom', '43 Bedroom',
       '5 BHK', '5 Bedroom', '6 BHK', '6 Bedroom', '7 BHK', '7 Bedroom',
       '8 BHK', '8 Bedroom', '9 BHK', '9 Bedroom', 'nan'], dtype=object)

In [18]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,14,Coomee,1056,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,20,Theanmp,2600,5.0,3.0,120.0
2,1,0,Uttarahalli,17,,1440,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,17,Soiewre,1521,3.0,1.0,95.0
4,0,0,Kothanur,14,,1200,2.0,1.0,51.0


### Remove the column "society"

In [19]:
# 'society' is not used as a feature; discard the column.
train_data = train_data.drop(columns=['society'])

In [20]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,14,1056,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,20,2600,5.0,3.0,120.0
2,1,0,Uttarahalli,17,1440,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,17,1521,3.0,1.0,95.0
4,0,0,Kothanur,14,1200,2.0,1.0,51.0


### Preprocess the column "total_sqft"
- Not all of the given values are measured in square feet, so some of the values in the column have to be converted.
- Some of them are also measured in square meters, square yards, perch, acres, cents, guntha and grounds. 
- So, convert all of them into square feet to make the data more consistent.

In [21]:
# Split each entry on '-' (ranges yield two parts), then convert every
# entry to a single float number of square feet.
sqft_parts = train_data['total_sqft'].str.split('-')
train_data['total_sqft'] = sqft_parts.apply(preprocess_total_sqft)

### Preprocess the column 'bath'
- There are many missing values (73) in the column 'bath'.
- So, the missing values are filled by grouping the rows based on location and taking the mean of the column 'bath' in that location.
- Even after doing this, one value is still missing. The row's location occurs only once in the data and its 'bath' value is NaN, so there are no other values in that location from which to compute a mean. That remaining value is filled with the mean of the whole column.

In [22]:
train_data['bath'].isna().sum()

73

In [23]:
def _fill_bath_with_location_mean(bath_values):
    """Fill NaNs in one location's 'bath' values with that location's mean."""
    return bath_values.fillna(bath_values.mean())


bath_by_location = train_data.groupby('location')['bath']
column_bath = bath_by_location.transform(_fill_bath_with_location_mean)

In [24]:
column_bath[~column_bath.notnull()]

1775   NaN
Name: bath, dtype: float64

In [25]:
# Anything the per-location fill could not resolve falls back to the
# overall column mean; the final expression shows the remaining NaN count.
overall_bath_mean = column_bath.mean()
column_bath = column_bath.fillna(overall_bath_mean)
column_bath.isna().sum()

0

- All the missing values are filled successfully.

In [26]:
# Write the fully imputed 'bath' values back into the training frame.
train_data['bath'] = column_bath

### Preprocess the column 'balcony'
- There are many missing values (609) in the column 'balcony'.
- So, the missing values are filled by grouping the rows based on location and taking the mean of the column 'balcony' in that location.
- Even after doing this, some rows still have missing values. Their locations occur only once in the data and the 'balcony' value is NaN, so there are no other values in those locations from which to compute a mean. These remaining values are filled with the mean of the whole column.

In [27]:
train_data.balcony.isna().sum()

609

In [28]:
train_data.balcony.value_counts()

2.0    5113
1.0    4897
3.0    1672
0.0    1029
Name: balcony, dtype: int64

In [29]:
# First pass: fill each missing 'balcony' with the mean of its location.
balcony_by_location = train_data.groupby('location')['balcony']
column_balcony = balcony_by_location.transform(
    lambda balconies: balconies.fillna(balconies.mean())
)
# Second pass: whatever is still missing gets the overall column mean.
overall_balcony_mean = column_balcony.mean()
column_balcony = column_balcony.fillna(overall_balcony_mean)

In [30]:
column_balcony.isna().sum()

0

In [31]:
# Write the fully imputed 'balcony' values back into the training frame.
train_data['balcony'] = column_balcony

In [32]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,0,2,Electronic City Phase II,14,1056.0,2.0,1.0,39.07
1,2,0,Chikka Tirupathi,20,2600.0,5.0,3.0,120.0
2,1,0,Uttarahalli,17,1440.0,2.0,3.0,62.0
3,0,0,Lingadheeranahalli,17,1521.0,3.0,1.0,95.0
4,0,0,Kothanur,14,1200.0,2.0,1.0,51.0


### Preprocess the column 'location'
- Use LabelEncoder to convert different locations into numericals.

In [33]:
# Encode 'location' as integer labels. The encoder is fitted on the train
# and test locations together so that transforming the test set later never
# meets an unseen label.
# pd.concat is used because Series.append was removed in pandas 2.0.
location_encoder = LabelEncoder()
all_locations = pd.concat([train_data['location'], test_data['location']])
location_encoder.fit(all_locations)
train_data['location'] = location_encoder.transform(train_data['location'])

In [34]:
location_encoder.classes_

array([' Anekal', ' Banaswadi', ' Basavangudi', ..., 'whitefiled',
       'yelahanka, north', 'yettagodi Road'], dtype=object)

### Preprocessed train data
- train data
- X_train
- y_train

In [35]:
train_data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,0,2,430,14,1056.0,2.0,1.0,39.07
1,2,0,325,20,2600.0,5.0,3.0,120.0
2,1,0,1220,17,1440.0,2.0,3.0,62.0
3,0,0,778,17,1521.0,3.0,1.0,95.0
4,0,0,736,14,1200.0,2.0,1.0,51.0


In [36]:
# The last column ('price') is the target; every column before it is a feature.
columns = train_data.columns
*feature_names, target_name = columns
X_train = train_data[feature_names]
y_train = train_data[target_name]

## Preprocess test data

In [37]:
# Re-read the raw test CSV before preprocessing it.
# NOTE(review): test_data was already loaded earlier and, in the cells shown,
# only read (used to fit the encoders), so this reload looks redundant -- confirm.
test_data = pd.read_csv(PATH_TO_test_data)

- We have to handle missing values in the test data, as it has a lot of them.

In [38]:
test_data.isna().sum()

area_type          0
availability       0
location           0
size               2
society          626
total_sqft         0
bath               7
balcony           69
price           1480
dtype: int64

In [39]:
test_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,Ready To Move,Brookefield,2 BHK,Roeekbl,1225,2.0,2.0,
1,Plot Area,Ready To Move,Akshaya Nagar,9 Bedroom,,2400,9.0,2.0,
2,Plot Area,18-Apr,Hennur Road,4 Bedroom,Saandtt,1650,5.0,2.0,
3,Super built-up Area,Ready To Move,Kodichikkanahalli,3 BHK,Winerri,1322,3.0,1.0,
4,Super built-up Area,Ready To Move,Konanakunte,2 BHK,AmageSa,1161,2.0,1.0,


In [40]:
# Apply to the test frame the same encodings that were used for training.
test_data['area_type'] = test_data['area_type'].map(replace_area_type)
test_data['availability'] = test_data['availability'].apply(replace_availabilty)

# Both label encoders were fitted on train + test values, so every test
# label is known to them.
for column, encoder in (('location', location_encoder), ('size', size_encoder)):
    test_data[column] = encoder.transform(test_data[column].astype('str'))

# Ranges and unit-suffixed areas -> a single float number of square feet.
sqft_parts = test_data['total_sqft'].str.split('-')
test_data['total_sqft'] = sqft_parts.apply(preprocess_total_sqft)

# Missing counts are filled with the mean of the (already imputed) training column.
for column in ('bath', 'balcony'):
    test_data[column] = test_data[column].fillna(train_data[column].mean())

# 'society' is unused and 'price' is the target to be predicted.
test_data = test_data.drop(columns=['society', 'price'])

In [41]:
# The preprocessed test frame is the feature matrix used for prediction
# (X_test is another name for the same object, not a copy).
X_test = test_data
X_test.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony
0,0,0,284,14,1225.0,2.0,2.0
1,2,0,103,31,2400.0,9.0,2.0
2,2,2,534,20,1650.0,5.0,2.0
3,0,0,721,17,1322.0,3.0,1.0
4,0,0,727,14,1161.0,2.0,1.0


## Modelling

In [42]:
# rfRegressor = RandomForestRegressor()
# model = rfRegressor.fit(X_train, y_train)

In [44]:
# lreg = LinearRegression(normalize=True)
# model = lreg.fit(X_train, y_train)

In [45]:
# dtReg = DecisionTreeRegressor()
# model = dtReg.fit(X_train, y_train)

- n_estimators: 500, max_depth: 6 - Score: 86189
- n_estimators: 1000, max_depth: 8 - Score: 86694247
- n_estimators: 1300, max_depth: 8 - Score: 86775
- Highest - 867978
- 86309

In [58]:
# Hyper-parameters for the XGBoost regressor used for the final predictions.
xgb_settings = {
    'n_estimators': 2000,
    'learning_rate': 0.1,
    'gamma': 0,
    'subsample': 0.60,
    'colsample_bytree': 1,
    'max_depth': 8,
}
xgb = xgboost.XGBRegressor(**xgb_settings)
model = xgb.fit(X_train, y_train)

In [59]:
model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=2000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.6)

In [60]:
# Predict prices for the preprocessed test rows with the fitted model.
y_pred = model.predict(X_test)

In [61]:
y_pred

array([ 66.802895, 441.86896 , 289.47552 , ...,  44.056988,  60.741074,
        70.68911 ], dtype=float32)

In [62]:
# Wrap the predictions in a one-column frame named 'price'.
out_df = pd.DataFrame({'price': y_pred})

In [63]:
# Write the predictions to predictions.xlsx without the index column.
out_df.to_excel('predictions.xlsx', index=False)