In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.metrics import r2_score, mean_squared_error
import pickle
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw training listings; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='houses_train.csv')

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,price,condition,district,max_floor,street,num_rooms,region,area,url,num_bathrooms,building_type,floor,ceiling_height
0,4598,100000.0,newly repaired,Arabkir,6,Kievyan St,3,Yerevan,96.0,http://www.myrealty.am/en/item/26229/3-senyaka...,1,stone,4,3.0
1,5940,52000.0,good,Arabkir,14,Mamikoniants St,3,Yerevan,78.0,http://www.myrealty.am/en/item/32897/3-senyaka...,1,panel,10,2.8
2,2302,52000.0,newly repaired,Qanaqer-Zeytun,9,M. Melikyan St,3,Yerevan,97.0,http://www.myrealty.am/en/item/1459/apartment-...,1,panel,1,2.8
3,5628,130000.0,good,Center,4,Spendiaryan St,3,Yerevan,80.0,http://www.myrealty.am/en/item/2099/3-senyakan...,1,stone,2,3.2
4,760,81600.0,zero condition,Center,9,Ler. Kamsar St,3,Yerevan,107.0,http://www.myrealty.am/en/item/22722/3-senyaka...,1,monolit,9,3.0


In [4]:
# Target: price, kept as a one-column DataFrame (list selector) rather than a Series.
Y = df.loc[:, ['price']]
# Features: every remaining column except the target and the three columns dropped here.
X = df.drop(columns=['Unnamed: 0', 'price', 'region', 'url'])

In [5]:
# Hold out 10% of the rows for the final evaluation. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [6]:
# Replace the absolute floor with its position relative to the building height
# (floor / max_floor), then drop max_floor. assign() overwrites 'floor' in place in
# the column order, so later cells see the same layout. Building a new frame instead
# of mutating the train_test_split slice avoids pandas' SettingWithCopyWarning.
X_train = (
    X_train
    .assign(floor=X_train['floor'] / X_train['max_floor'])
    .drop(columns='max_floor')
)
X_train

Unnamed: 0,condition,district,street,num_rooms,area,num_bathrooms,building_type,floor,ceiling_height
447,good,Davtashen,Davtashen 2 district,3,73.0,1,panel,0.777778,2.8
3433,zero condition,Qanaqer-Zeytun,Rubinyants St,3,78.0,1,stone,1.000000,2.8
818,newly repaired,Avan,Tsarav Aghbyur St,2,56.0,1,monolit,0.222222,3.0
3634,good,Shengavit,Sharur St,4,132.0,1,stone,0.750000,3.2
4237,good,Malatia-Sebastia,Z.Andranik St,1,78.0,1,panel,0.666667,2.8
...,...,...,...,...,...,...,...,...,...
4040,newly repaired,Arabkir,Mamikoniants St,4,120.0,1,stone,1.000000,3.0
4647,newly repaired,Avan,Narekatsi district,3,85.0,1,panel,0.928571,2.8
3967,good,Arabkir,Arabkir 19 St,3,91.0,1,panel,0.692308,2.8
222,newly repaired,Malatia-Sebastia,Raffi St,3,63.0,1,panel,0.600000,2.8


In [7]:
# One-hot encode the categorical features. handle_unknown='ignore' lets categories that
# were not seen during fit be encoded as all-zero rows at transform time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` — confirm
# against the installed version before upgrading.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Select the categorical columns by name so the encoding does not depend on column order.
transformed = ohe.fit_transform(X_train[['condition', 'district', 'street', 'building_type']])
# categories_ holds one label array per encoded input column, in output-column order;
# flattening them gives the header for the encoded matrix.
columns = np.concatenate(ohe.categories_)

In [8]:
# Wrap the encoded matrix in a DataFrame labelled with the category names.
transformed = pd.DataFrame(data=transformed, columns=columns)

In [9]:
# Display the one-hot encoded training frame to inspect the generated columns.
transformed

Unnamed: 0,good,newly repaired,zero condition,Achapnyak,Arabkir,Avan,Center,Davtashen,Erebuni,Malatia-Sebastia,...,Yerznkyan St,Z. Sarkavag 3 dead end,Z. Sarkavag St,Z.Andranik St,Zakyan St,Zavaryan St,monolit,other,panel,stone
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4495,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4496,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4497,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4498,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Give X_train a fresh 0..n-1 index named 'index' so its rows line up with the
# default-indexed `transformed` frame when the two are concatenated column-wise.
# assign()/set_index() return new frames, so the train_test_split slice is not
# mutated in place (no SettingWithCopyWarning).
X_train = X_train.assign(index=range(len(transformed))).set_index('index')
X_train

Unnamed: 0_level_0,condition,district,street,num_rooms,area,num_bathrooms,building_type,floor,ceiling_height
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,good,Davtashen,Davtashen 2 district,3,73.0,1,panel,0.777778,2.8
1,zero condition,Qanaqer-Zeytun,Rubinyants St,3,78.0,1,stone,1.000000,2.8
2,newly repaired,Avan,Tsarav Aghbyur St,2,56.0,1,monolit,0.222222,3.0
3,good,Shengavit,Sharur St,4,132.0,1,stone,0.750000,3.2
4,good,Malatia-Sebastia,Z.Andranik St,1,78.0,1,panel,0.666667,2.8
...,...,...,...,...,...,...,...,...,...
4495,newly repaired,Arabkir,Mamikoniants St,4,120.0,1,stone,1.000000,3.0
4496,newly repaired,Avan,Narekatsi district,3,85.0,1,panel,0.928571,2.8
4497,good,Arabkir,Arabkir 19 St,3,91.0,1,panel,0.692308,2.8
4498,newly repaired,Malatia-Sebastia,Raffi St,3,63.0,1,panel,0.600000,2.8


In [11]:
# Join the numeric features of X_train with the one-hot encoded block, column-wise.
X_train = pd.concat(
    objs=[
        X_train[["num_rooms", "num_bathrooms", "area", "floor", "ceiling_height"]],
        transformed,
    ],
    axis='columns',
)

In [12]:
# Apply the training-time preprocessing to the held-out split for later testing.
# Relative floor position replaces the absolute floor; the frame is rebuilt rather than
# mutated in place so the train_test_split slice does not trigger SettingWithCopyWarning.
X_test = (
    X_test
    .assign(floor=X_test['floor'] / X_test['max_floor'])
    .drop(columns='max_floor')
)
# Reuse the encoder fitted on the training data (transform only, never refit on test
# rows); the categorical columns are selected by name rather than by position.
pretest = ohe.transform(X_test[['condition', 'district', 'street', 'building_type']])
pretest = pd.DataFrame(pretest, columns=columns)
# Re-index to 0..n-1 so the rows align with the default-indexed `pretest` frame.
X_test = X_test.assign(index=range(len(pretest))).set_index('index')
X_test = pd.concat([X_test[['num_rooms', "num_bathrooms", "area", "floor", "ceiling_height"]], pretest], axis=1)
X_test

Unnamed: 0,num_rooms,num_bathrooms,area,floor,ceiling_height,good,newly repaired,zero condition,Achapnyak,Arabkir,...,Yerznkyan St,Z. Sarkavag 3 dead end,Z. Sarkavag St,Z.Andranik St,Zakyan St,Zavaryan St,monolit,other,panel,stone
0,2,1,70.0,0.187500,2.8,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3,1,91.0,0.666667,2.8,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,3,1,92.0,0.400000,2.8,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,1,83.0,0.333333,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,2,136.0,0.400000,3.2,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,3,1,75.0,1.000000,3.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
497,3,2,115.0,0.785714,3.2,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
498,3,1,86.0,0.400000,2.8,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
499,3,1,85.0,0.555556,2.8,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Lasso Regression without scaling

In [None]:
# Tune the Lasso penalty strength on the unscaled features:
# 10 log-spaced alphas between 1e-5 and 1e3, scored by R^2 with 10-fold CV.
lasso = Lasso()
search = GridSearchCV(
    lasso,
    {'alpha': np.logspace(-5, 3, num=10)},
    scoring='r2',
    cv=10,
    n_jobs=1,
)
search.fit(X_train, y_train)

In [None]:
# Keep the winning alpha (the `search` name is reused by later cells) and report it
# together with the mean cross-validated score of that setting.
alpha_lasso = search.best_params_['alpha']
print(alpha_lasso, search.best_score_, sep='\n')

Ridge Regression without scaling data

In [16]:
# Tune the Ridge penalty strength on the unscaled features:
# 50 log-spaced alphas between 1e-5 and 1e3, scored by R^2 with 10-fold CV.
ridge = Ridge()
r_search = GridSearchCV(
    ridge,
    {'alpha': np.logspace(-5, 3, num=50)},
    scoring='r2',
    cv=10,
    n_jobs=1,
)
r_search.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=Ridge(), n_jobs=1,
             param_grid={'alpha': array([1.00000000e-05, 1.45634848e-05, 2.12095089e-05, 3.08884360e-05,
       4.49843267e-05, 6.55128557e-05, 9.54095476e-05, 1.38949549e-04,
       2.02358965e-04, 2.94705170e-04, 4.29193426e-04, 6.25055193e-04,
       9.10298178e-04, 1.32571137e-03, 1.93069773e-03, 2.81176870e-03,
       4.09491506e-03, 5.96362332e-03, 8.68511...
       8.28642773e-02, 1.20679264e-01, 1.75751062e-01, 2.55954792e-01,
       3.72759372e-01, 5.42867544e-01, 7.90604321e-01, 1.15139540e+00,
       1.67683294e+00, 2.44205309e+00, 3.55648031e+00, 5.17947468e+00,
       7.54312006e+00, 1.09854114e+01, 1.59985872e+01, 2.32995181e+01,
       3.39322177e+01, 4.94171336e+01, 7.19685673e+01, 1.04811313e+02,
       1.52641797e+02, 2.22299648e+02, 3.23745754e+02, 4.71486636e+02,
       6.86648845e+02, 1.00000000e+03])},
             scoring='r2')

In [17]:
# Store the best Ridge alpha for the final test cell and report it with its CV score.
alpha_ridge = r_search.best_params_['alpha']
print(alpha_ridge, r_search.best_score_, sep='\n')

1.6768329368110066
0.7852832633386753


Elastic Net without scaling

In [None]:
# Tune the Elastic Net penalty strength on the unscaled features:
# 10 log-spaced alphas between 1e-5 and 1e4 with 10-fold CV (estimator's default scorer).
elnet = ElasticNet()
elnet_search = GridSearchCV(
    elnet,
    {'alpha': np.logspace(-5, 4, num=10)},
    cv=10,
    n_jobs=1,
)
elnet_search.fit(X_train, y_train)

In [None]:
# Store the best Elastic Net alpha and report it with its cross-validated score.
alpha_elnet = elnet_search.best_params_['alpha']
print(alpha_elnet, elnet_search.best_score_, sep='\n')

#### Scaling

In [None]:
# Fit the scaler on the training features only; the same fitted object is reused
# to transform both the training and the test split.
norm = StandardScaler()
norm = norm.fit(X_train)

In [None]:
# Standardised training features, re-wrapped with the original column labels.
scaled_X_train = pd.DataFrame(norm.transform(X_train), columns=X_train.columns)

In [None]:
# Standardised test features, scaled with the statistics learned from the training split.
scaled_X_test = pd.DataFrame(norm.transform(X_test), columns=X_test.columns)

Lasso Regression with scaled data

In [None]:
# Repeat the Lasso alpha search on the standardised features.
# Note: this rebinds `search`, replacing the unscaled Lasso search object.
scaled_lasso = Lasso()
search = GridSearchCV(
    scaled_lasso,
    {'alpha': np.logspace(-5, 3, num=10)},
    cv=10,
    n_jobs=1,
)
search.fit(scaled_X_train, y_train)

In [None]:
# Best alpha for Lasso on scaled data, followed by its cross-validated score.
alpha_scaled_lasso = search.best_params_['alpha']
print(alpha_scaled_lasso, search.best_score_, sep='\n')

Ridge regression with scaled data

In [None]:
# Repeat the Ridge alpha search on the standardised features
# (50 log-spaced alphas between 1e-5 and 1e4). Rebinds `search` again.
scaled_ridge = Ridge()
search = GridSearchCV(
    scaled_ridge,
    {'alpha': np.logspace(-5, 4, num=50)},
    cv=10,
    n_jobs=1,
)
search.fit(scaled_X_train, y_train)

In [None]:
# Best alpha for Ridge on scaled data, followed by its cross-validated score.
# The print previously referenced the undefined name `scaled_alpha_ridge` (NameError).
alpha_scaled_ridge = search.best_params_['alpha']
print(alpha_scaled_ridge)
print(search.best_score_)

Elastic Net with scaled data

In [None]:
# Repeat the Elastic Net alpha search on the standardised features
# (10 log-spaced alphas between 1e-5 and 1e3, 10-fold CV).
scaled_elnet = ElasticNet()
elnetscaled_search = GridSearchCV(
    scaled_elnet,
    {'alpha': np.logspace(-5, 3, num=10)},
    cv=10,
)
elnetscaled_search.fit(scaled_X_train, y_train)

In [None]:
# Best alpha for Elastic Net on scaled data, followed by its cross-validated score.
alpha_scaled_elnet = elnetscaled_search.best_params_['alpha']
print(alpha_scaled_elnet, elnetscaled_search.best_score_, sep='\n')

Testing 3 best models

In [22]:
# Refit Ridge on the full training split with the tuned alpha and score it on the
# held-out test split: R^2, then RMSE, then the alpha that was used.
ridge = Ridge(alpha=alpha_ridge)
ridge.fit(X_train, y_train)
# Predict once and reuse the result for both metrics.
ridge_pred = ridge.predict(X_test)
print(r2_score(y_test, ridge_pred))
# RMSE as the square root of the MSE; avoids the `squared=False` keyword, which is
# deprecated and later removed in newer scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, ridge_pred)))
print(alpha_ridge)

0.7838937180402944
24561.19286456346
1.6768329368110066


In [None]:
# Refit Elastic Net with the alpha tuned on the unscaled features and report test R^2.
# The tuned value is stored as `alpha_elnet`; `elnet_alpha` was never defined.
elnet = ElasticNet(alpha=alpha_elnet)
elnet.fit(X_train, y_train)
print(r2_score(y_test, elnet.predict(X_test)))

In [None]:
# Refit Lasso with the alpha tuned on the unscaled features and report test R^2.
# The tuned value is stored as `alpha_lasso`; `alpha1` was never defined.
lasso = Lasso(alpha=alpha_lasso)
lasso.fit(X_train, y_train)
print(r2_score(y_test, lasso.predict(X_test)))

Save the best model and the encoder fitted on the training dataset.
As the final model I chose Lasso.

In [None]:
# Persist the fitted Lasso model. The context manager closes (and flushes) the file
# handle even if pickling fails, instead of leaving it to garbage collection.
filename = 'house_pricing_model.sav'
with open(filename, mode='wb') as model_file:
    pickle.dump(lasso, model_file)

In [None]:
# Persist the fitted one-hot encoder so the same encoding can be reapplied later.
# The context manager guarantees the file handle is closed after the dump.
name = 'one_hot_encoding.sav'
with open(name, mode='wb') as encoder_file:
    pickle.dump(ohe, encoder_file)