In [1]:
import pandas as pd
import numpy as np

In [83]:
df = pd.read_csv('outlier_free_data.csv')
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Devarachikkanahalli,1250.0,2.0,40.0,2
1,Devarachikkanahalli,1200.0,2.0,83.0,2
2,Devarachikkanahalli,1170.0,2.0,40.0,2
3,Devarachikkanahalli,1425.0,2.0,65.0,3
4,Devarachikkanahalli,947.0,2.0,43.0,2


In [84]:
dummies = pd.get_dummies(df.location)
df1 = pd.concat([df, dummies.drop('other', axis = 1)], axis = 1) 
df1.drop('location', inplace = True, axis = 1)
df1.head()

Unnamed: 0,total_sqft,bath,price,bhk,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1250.0,2.0,40.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1200.0,2.0,83.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1170.0,2.0,40.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1425.0,2.0,65.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,947.0,2.0,43.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
df1.shape

(7253, 244)

In [86]:
X = df1.drop('price', axis = 1)
X.head()

Unnamed: 0,total_sqft,bath,bhk,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1250.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1200.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1170.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1425.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,947.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
y = df1.price
y.head()

0    40.0
1    83.0
2    40.0
3    65.0
4    43.0
Name: price, dtype: float64

In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [89]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.888527158409591

In [92]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits = 10, test_size = 0.1, random_state = 1)

np.mean(cross_val_score(LinearRegression(), X, y, cv = cv))

0.8530624310704917

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge

In [94]:
algos = {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {
            'normalize': [True, False]
        }
    },
    'lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [1e-4, 1e-3, 0.01],
            'normalize': [True, False],
            'selection': ['cyclic', 'random']
        }
    },
    'ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10, 100],
            'normalize': [True, False],
        }
    }
}

scores = []

for algo, config in algos.items():
    gs = GridSearchCV(estimator = config['model'], param_grid = config['params'], cv = cv, verbose = 1, n_jobs = -1)
    gs.fit(X, y)
    scores.append({
        'model': algo,
        'best_score': gs.best_score_,
        'best_params': gs.best_params_
    })
    
scores

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.9s finished


Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    8.1s finished


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.0s finished


[{'model': 'linear_regression',
  'best_score': 0.8530624310705335,
  'best_params': {'normalize': True}},
 {'model': 'lasso',
  'best_score': 0.8530769462151182,
  'best_params': {'alpha': 0.0001, 'normalize': True, 'selection': 'random'}},
 {'model': 'ridge',
  'best_score': 0.8547950716436581,
  'best_params': {'alpha': 0.1, 'normalize': True}}]

In [95]:
model = gs.best_estimator_
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9041548599295959

In [102]:
model.coef_[:3]

array([0.07087904, 5.35630296, 3.17429349])

In [96]:
def predict_price(location, sqft, bhk, bath):
    loc_index = np.where(X.columns == location)[0][0]
    
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1
        
    return model.predict([x])[0]

In [119]:
predict_price('Yelahanka', 1200, 2, 2)

64.07678716501948

In [117]:
predict_price('HSR Layout', 1200, 2, 2)

57.09421112492048

In [118]:
predict_price('Indira Nagar', 1200, 2, 2)

192.89458675300398

In [123]:
predict_price('1st Phase JP Nagar', 1200, 2, 2)

97.35691701399652

In [124]:
predict_price('Electronic City', 1200, 2, 2)

65.11760046327787

In [125]:
predict_price('Whitefield', 1200, 2, 2)

71.71173835164856

In [129]:
predict_price('Marathahalli', 1200, 2, 2)

67.92709311642635

In [130]:
predict_price('Malleshpalya', 1200, 2, 2)

57.352816745921544

In [133]:
predict_price('Magadi Road', 1200, 2, 2)

54.341629493417486

In [131]:
predict_price('Kogilu', 1200, 2, 2)

56.46442915689997

In [132]:
predict_price('Koramangala', 1200, 2, 2)

147.74249489173718

In [128]:
X.columns[150:170]

Index(['Kogilu', 'Konanakunte', 'Koramangala', 'Kothannur', 'Kothanur',
       'Kudlu', 'Kudlu Gate', 'Kumaraswami Layout', 'Kundalahalli',
       'LB Shastri Nagar', 'Laggere', 'Lakshminarayana Pura',
       'Lingadheeranahalli', 'Magadi Road', 'Mahadevpura',
       'Mahalakshmi Layout', 'Mallasandra', 'Malleshpalya', 'Malleshwaram',
       'Marathahalli'],
      dtype='object')

In [134]:
import pickle
with open('final_model.pickle', 'wb') as f:
    pickle.dump(model, f)

In [139]:
import json
columns = {
    'data_columns': [col.lower() for col in X.columns]
}

with open('columns.json', 'w') as f:
    f.write(json.dumps(columns))