In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV

In [17]:
train_data = pd.read_csv("train1_clustered.csv", header=0, index_col=0)
train_data.head()

Unnamed: 0,pca_component 0,pca_component 1,pca_component 2,pca_component 3,pca_component 4,pca_component 5,pca_component 6,pca_component 7,pca_component 8,pca_component 9,pca_component 10,price,price_category,cluster
0,111.257356,-19.034748,4.862769,5.177924,4.656782,-1.921391,-0.183095,-0.501683,0.228178,0.155032,-0.04554,42.0,low,2
1,83.044684,-16.618402,-1.045081,-27.007275,1.369359,-1.257543,1.601341,-0.07047,0.348136,0.057967,-0.059666,70.0,low,2
2,-40.735853,-16.486288,8.230134,2.939946,-0.154164,3.420576,-1.148351,0.267577,-0.051632,0.067243,-0.002725,120.0,medium,0
3,52.478668,-20.196244,19.246323,40.140551,6.309285,2.801438,-0.92349,0.266367,-0.188993,0.018491,0.047119,120.0,medium,2
4,-2.510515,-18.811422,-2.803176,16.615282,-20.676484,2.069589,-0.20931,0.216395,-0.523536,-0.190009,0.008872,120.0,medium,1


In [18]:
train_x = train_data.drop(['price', 'cluster', 'price_category'], axis=1)
train_category = train_data['cluster']

In [19]:
test_data = pd.read_csv('test1.csv', header=0, index_col=0)
test_data.head()

Unnamed: 0,pca_component 0,pca_component 1,pca_component 2,pca_component 3,pca_component 4,pca_component 5,pca_component 6,pca_component 7,pca_component 8,pca_component 9,pca_component 10,price
0,108.926041,27.386655,6.421149,-14.615405,-3.498109,0.131333,-0.102821,0.474863,-0.030184,0.034442,-0.020059,185.0
1,-75.76741,29.599789,-18.121252,14.259126,4.108675,-0.400869,-0.484841,-0.533457,0.513539,-0.183396,-0.009645,65.0
2,85.211003,-18.07442,4.772577,-0.946582,5.378793,-1.65301,1.458139,-0.109237,0.222179,0.048921,-0.06103,100.0
3,-10.862716,-15.88015,3.203471,-13.939,1.920309,1.620542,2.286855,0.066728,0.680827,-0.272484,-0.062163,199.0
4,22.114263,27.336957,11.271736,10.756515,-3.451199,-0.873701,-0.72768,0.325945,0.262835,-0.080813,0.005097,170.0


In [20]:
knn = KNeighborsClassifier().fit(train_x, train_category)

In [21]:
test_category = knn.predict(test_data.drop(['price'], axis=1))

In [23]:
test_data['cluster'] = test_category

In [24]:
test_data.head()

Unnamed: 0,pca_component 0,pca_component 1,pca_component 2,pca_component 3,pca_component 4,pca_component 5,pca_component 6,pca_component 7,pca_component 8,pca_component 9,pca_component 10,price,cluster
0,108.926041,27.386655,6.421149,-14.615405,-3.498109,0.131333,-0.102821,0.474863,-0.030184,0.034442,-0.020059,185.0,2
1,-75.76741,29.599789,-18.121252,14.259126,4.108675,-0.400869,-0.484841,-0.533457,0.513539,-0.183396,-0.009645,65.0,0
2,85.211003,-18.07442,4.772577,-0.946582,5.378793,-1.65301,1.458139,-0.109237,0.222179,0.048921,-0.06103,100.0,2
3,-10.862716,-15.88015,3.203471,-13.939,1.920309,1.620542,2.286855,0.066728,0.680827,-0.272484,-0.062163,199.0,1
4,22.114263,27.336957,11.271736,10.756515,-3.451199,-0.873701,-0.72768,0.325945,0.262835,-0.080813,0.005097,170.0,1


In [27]:
original_price = []
predicted_price = []
for i in range(3):
  train = train_data[train_data['cluster'] == i]
  test = test_data[test_data['cluster'] == i]
  train_x = train.drop(['cluster', 'price', 'price_category'], axis=1)
  train_y = train['price']
  original_price.append(test['price'])
  test_x = test.drop(['cluster', 'price'], axis=1)
  rf = RandomForestRegressor()
  rf_params = {'n_estimators':[i for i in range(50, 101, 10)], 'max_depth':[i for i in range(6,11, 2)]}
  grid_rf = GridSearchCV(estimator=rf, param_grid=rf_params, n_jobs=-1).fit(train_x, train_y)
  predicted_price.append(grid_rf.predict(test_x))

In [28]:
original = np.concatenate(original_price)
predicted = np.concatenate(predicted_price)
rmse = np.sqrt(mean_squared_error(predicted, original))
mae = mean_absolute_error(predicted, original)
log_mse = mean_squared_log_error(predicted, original)
print("rmse is: ", rmse)
print("mae is: ", mae)
print("log_mse is: ", log_mse)

rmse is:  35.434480824918225
mae is:  26.89096631217333
log_mse is:  0.12596604233245537


In [29]:
original_price = []
predicted_price = []
for i in range(3):
  train = train_data[train_data['cluster'] == i]
  test = test_data[test_data['cluster'] == i]
  train_x = train.drop(['cluster', 'price', 'price_category'], axis=1)
  train_y = train['price']
  original_price.append(test['price'])
  test_x = test.drop(['cluster', 'price'], axis=1)
  ad = AdaBoostRegressor(DecisionTreeRegressor(max_depth=10))
  ad_params = {'n_estimators':[i for i in range(50, 101, 10)], 'loss':['linear',  'exponential'], 'learning_rate':[0.5, 1]}
  grid_ad = GridSearchCV(estimator=ad, param_grid=ad_params, n_jobs=-1).fit(train_x, train_y)
  predicted_price.append(grid_ad.predict(test_x))

In [30]:
original = np.concatenate(original_price)
predicted = np.concatenate(predicted_price)
rmse = np.sqrt(mean_squared_error(predicted, original))
mae = mean_absolute_error(predicted, original)
log_mse = mean_squared_log_error(predicted, original)
print("rmse is: ", rmse)
print("mae is: ", mae)
print("log_mse is: ", log_mse)

rmse is:  36.612450676084144
mae is:  28.720934080211617
log_mse is:  0.14253645477671623
