In [39]:
import pandas as pd
import numpy as np
import pickle as pk
import json as js
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,\
ShuffleSplit, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Lasso

In [40]:
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../DataCleaning/States/final_state')

In [3]:
df.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850,4.0,1.0,428,4,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1630,3.0,2.0,194,3,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1875,2.0,3.0,235,3,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1200,2.0,0.0,130,3,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1235,2.0,2.0,148,2,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
X = df.drop('price', axis='columns')
y = df.price

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20
)

In [6]:
lr = LinearRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.7981335833849785

In [7]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cv)

array([0.82431653, 0.77184843, 0.85123787, 0.80889708, 0.83601791])

In [22]:
models = {
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'positive': [False, True],
            'n_jobs': [1, 3, 6, 9, 12]
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'splitter': ['best', 'random'],
            'criterion': ['gini', 'log_loss', 'entropy']
        }
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [1.0, 2.0],
            'selection': ['random', 'cyclic']
        }
    }
}

In [23]:
cv = KFold(n_splits=5, shuffle=True, random_state=0)

In [24]:
scores = []
for model_name, mp in models.items():
    G = GridSearchCV(estimator=mp['model'], param_grid=mp['params'],
                     cv=cv, return_train_score=False, error_score='raise')
    G.fit(X, y)
    scores.append({
        'model_name': model_name,
        'best_score': G.best_score_,
        'best_params': G.best_params_
    })

In [25]:
GRResult = pd.DataFrame(scores, columns=['model_name','best_score','best_params'])

In [26]:
GRResult

Unnamed: 0,model_name,best_score,best_params
0,LinearRegression,0.837054,"{'n_jobs': 1, 'positive': False}"
1,DecisionTreeClassifier,0.130466,"{'criterion': 'gini', 'splitter': 'random'}"
2,Lasso,0.702445,"{'alpha': 1.0, 'selection': 'cyclic'}"


In [27]:
model = LinearRegression(n_jobs=1, positive=False)

In [28]:
model.fit(X, y)
model.score(X, y)

0.8542639564372384

In [41]:
def predict_price(location, total_sqft, bath, balcony, bhk):
    location_column_index = np.where(X.columns==location)[0][0]
    x = np.zeros(len(X.columns))
    x[0] = total_sqft
    x[1] = bath
    x[2] = balcony
    x[3] = bhk
    x[location_column_index] = 1
    return model.predict([x])[0]

In [46]:
print(predict_price('1st Phase JP Nagar', 1000, 2, 1, 2))
print(predict_price('1st Phase JP Nagar', 1000, 3, 1, 3))
print(predict_price('Indira Nagar', 1000, 2, 1, 2))
print(predict_price('Indira Nagar', 1200, 3, 2, 4))

86.26814261633932
88.43795630169133
180.21861541204845
195.9008750405153


In [48]:
with open('banglore_home_prices_model.pickle', 'wb') as f:
    pk.dump(model, f)

In [50]:
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(js.dumps(columns))