## Importing the libraries

In [8]:
import pandas as pd
import numpy as np
import re
# from sklearn.impute import SimpleImputer

## Reading the data

In [9]:
test=pd.read_csv("Predicting-House-Prices-In-Bengaluru-Test-Data.csv")
train=pd.read_csv("Predicting-House-Prices-In-Bengaluru-Train-Data.csv")

## Checking for missing values

In [10]:
train.isnull().sum()/len(train)*100

area_type        0.000000
availability     0.000000
location         0.007508
size             0.120120
society         41.306306
total_sqft       0.000000
bath             0.548048
balcony          4.572072
price            0.000000
dtype: float64

## Keeping the relevant features

In [11]:
X_features = ['area_type', 'availability', 'location', 'size', 'total_sqft', 'bath', 'balcony', 'price']
train = train[X_features]

## Fixing the Availability column

In [12]:
train['availability'] = train.availability.str.replace('\d+-', '')

## Converting the 'Total Sqft' into a numerical variable

In [13]:
train['total_sqft'] = train['total_sqft'].apply(lambda x:  np.mean(list(map(int, re.findall('\d+', x)))))

## Extracting numerical values from the 'size' feature

In [14]:
train['size'] = train['size'].loc[train['size'].notnull()].apply(lambda x: re.search(r'\d+', x).group(0)).astype(int)

## Imputing missing values for 'bath' & 'balcony'

In [15]:
train['size'].fillna(train['size'].median(), inplace=True)
train['bath'].fillna(train['bath'].median(), inplace=True)
train['balcony'].fillna(train['balcony'].median(), inplace=True)

## Removing rows with missing values

In [16]:
train.isnull().sum()/len(train)*100

area_type       0.000000
availability    0.000000
location        0.007508
size            0.000000
total_sqft      0.000000
bath            0.000000
balcony         0.000000
price           0.000000
dtype: float64

In [17]:
train = train.dropna()

## Using label encoding for categorical values

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X2 = train.apply(le.fit_transform)
X2.dtypes
X2.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,3,2,419,1,478,1,1,392
1,2,12,317,3,1574,4,3,1599
2,0,12,1179,2,877,1,3,1047
3,3,12,757,2,947,2,1,1520
4,3,12,716,1,632,1,1,767


# Trying One hot encoding

In [173]:
# import sys
# !{sys.executable} -m pip install category_encoders

In [174]:
import category_encoders as ce
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_train_ohe = ohe.fit_transform(train)
X_train_ohe.head()

Unnamed: 0,area_type_Super built-up Area,area_type_Plot Area,area_type_Built-up Area,area_type_Carpet Area,availability_Dec,availability_Ready To Move,availability_May,availability_Feb,availability_Nov,availability_Oct,...,location_Pattegarhpalya,location_Tilak Nagar,location_12th cross srinivas nagar banshankari 3rd stage,location_Havanur extension,location_Abshot Layout,size,total_sqft,bath,balcony,price
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,2.0,1056.0,2.0,1.0,39.07
1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,4.0,2600.0,5.0,3.0,120.0
2,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,3.0,1440.0,2.0,3.0,62.0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,3.0,1521.0,3.0,1.0,95.0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,2.0,1200.0,2.0,1.0,51.0


## Splitiing Train data to build model

In [19]:
from sklearn.model_selection import train_test_split
X = X2.drop(['price'], axis = 1)
y = X2[['price']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Preparing Model

In [153]:
#import sys

In [71]:
#!{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/1d/e7/5258cb787dc036f419ec57491decf8bfa89ab52c401b08b4b9228e43dc4c/xgboost-0.81-py2.py3-none-win_amd64.whl (7.4MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.81


In [13]:
from xgboost import XGBRegressor
regressor  = XGBRegressor()
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [14]:
Y_pred_train = regressor.predict(X_train)

In [15]:
y_pred = regressor.predict(X_test)

In [16]:
def rmse(y_pred,y_test) :
    error = np.square(np.log10(y_pred +1) - np.log10(y_test +1)).mean() ** 0.5
    Acc = 1 - error
    return Acc

In [17]:
print("Accuracy attained on Training Set = ",rmse(Y_pred_train, np.array(y_train).reshape(-1)))
print("Accuracy attained on Test Set = ",rmse(y_pred, np.array(y_test).reshape(-1)))

Accuracy attained on Training Set =  nan
Accuracy attained on Test Set =  nan


  


## Using XGBoost with Cross Validation

In [20]:
# k-fold cross validation evaluation of xgboost model
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [21]:
model = xgboost.XGBClassifier()
kfold = KFold(n_splits=10, random_state=7)

In [None]:
results = cross_val_score(model, X_train, y_train, cv=kfold)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

## Using decision Tree Classifier

In [181]:
from sklearn.tree import DecisionTreeRegressor

In [182]:
X_dtr = X2.drop(['price'], axis = 1)
y_dtr = X2[['price']]
X_train_dtr, X_test_dtr, y_train_dtr, y_test_dtr = train_test_split(X_dtr, y_dtr, test_size = 0.2)

In [183]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train_dtr,y_train_dtr)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [184]:
Y_pred_dtr = dtr.predict(X_test_dtr)
Y_pred_dtr_train = dtr.predict(X_train_dtr)

In [185]:
print("Accuracy attained on Training Set = ",rmse(Y_pred_dtr_train, np.array(y_train).reshape(-1)))
print("Accuracy attained on Test Set = ",rmse(Y_pred_dtr,np.array(y_test).reshape(-1)))

Accuracy attained on Training Set =  -0.15149818300147944
Accuracy attained on Test Set =  -0.15671604709096298


In [42]:
feature_importances = pd.DataFrame(dtr.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance'])
impt_features = [ feature for feature in feature_importances.index if feature_importances.loc[feature].values > 0 ]
impt_features

## Predicting on the actual test data

### Preparing Test dataset

In [88]:
test = test[['area_type', 'location', 'size', 'total_sqft', 'bath', 'balcony']]
test['size'] = test['size'].loc[test['size'].notnull()].apply(lambda x: re.search(r'\d+', x).group(0)).astype(int)
test['size'].fillna(test['size'].median(), inplace=True)
test['bath'].fillna(test['bath'].mean(), inplace=True)
test['balcony'].fillna(test['balcony'].mean(), inplace=True)

In [89]:
test['total_sqft'] = test['total_sqft'].apply(lambda x:  np.mean(list(map(int, re.findall('\d+', x)))))

In [90]:
test.isnull().sum()/len(test)*100

area_type     0.0
location      0.0
size          0.0
total_sqft    0.0
bath          0.0
balcony       0.0
dtype: float64

In [91]:
X_test_ohe = ohe.transform(test)

In [98]:
data_new = pd.DataFrame()
for column in X_train.columns:
    if column in X_test_ohe:
        data_new[column] = X_test_ohe[column]
    else:
        data_new[column] = 0

In [31]:
# X_test = test.apply(le.fit_transform)

In [100]:
Y_pred_test = regressor.predict(data_new)

In [101]:
pd.DataFrame(Y_pred_test, columns = ['price']).to_excel("submission7.xlsx", sheet_name='house_price_sample', index = False)