In [1]:
import pandas as pd
import numpy as np
import plotly.express as px 

In [2]:
df = pd.read_csv("../../DataSet/rond.ir_full_preprocessed.csv")

In [4]:
#df.drop('phone_number' , axis=1 , inplace=True)

In [None]:
df["price"] = np.log1p(df["price"])

### Add Model-based features

#### Split data

In [4]:
from sklearn.model_selection import train_test_split
X = df.drop('price',axis=1)
y = df.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

### Validate split

In [6]:
import scipy.stats as stats
stats.ttest_ind(a=y_train, b=y_test, equal_var=False)

Ttest_indResult(statistic=0.9445913766067343, pvalue=0.3448719406276619)

In [9]:
from sklearn.neighbors import KNeighborsRegressor
first_three = X_train[['first_three_0', 'first_three_1', 'first_three_2']]
last_four = X_train[['last_four_0', 'last_four_1','last_four_2', 'last_four_3']]
price = y_train

In [18]:
from sklearn.model_selection import validation_curve
k_range = range(8,30)
train_scores, test_scores = validation_curve(KNeighborsRegressor(),
                 X=first_three,
                 y=price,
                 param_name='n_neighbors',
                 param_range=k_range,
                 scoring='neg_mean_squared_error',
                 n_jobs=-1)


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [19]:
fig_k = px.scatter(x=list(k_range),y=-1 * train_scores.mean(axis=1))
fig_k.add_scatter(x=list(k_range),y=-1 * test_scores.mean(axis=1))
fig_k.show()

### Seems like K=27 is ok!

In [21]:
train_scores, test_scores = validation_curve(KNeighborsRegressor(),
                 X=last_four,
                 y=price,
                 param_name='n_neighbors',
                 param_range=k_range,
                 scoring='neg_mean_squared_error',
                 n_jobs=-1)
fig_k = px.scatter(x=list(k_range),y=-1 * train_scores.mean(axis=1))
fig_k.add_scatter(x=list(k_range), y=-1 * test_scores.mean(axis=1))
fig_k.show()

### We choose K=20 for `last_four`

In [23]:
knn_first_three = KNeighborsRegressor(n_neighbors=27).fit(first_three, price)
knn_last_four = KNeighborsRegressor(n_neighbors=20).fit(last_four, price)
X_train['price_first_three_knn'] = knn_first_three.predict(first_three)
X_train['price_last_four_knn'] = knn_last_four.predict(last_four)
X_test['price_first_three_knn'] = knn_first_three.predict(X_test[['first_three_0', 'first_three_1', 'first_three_2']]) 
X_test['price_last_four_knn'] = knn_last_four.predict(X_test[['last_four_0', 'last_four_1','last_four_2', 'last_four_3']]) 

# Modeling

## Import libraries

In [24]:
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

## Base models

### Lasso

In [25]:
alpha_range = [0.0001 , 0.0003 , 0.001 , 0.003 , 0.01 , 0.03 , 0.1 , 0.3 , 1]

lasso = Lasso(random_state=42)
lasso_grid = GridSearchCV(estimator=lasso,
                          param_grid={'alpha':alpha_range},
                          scoring='neg_mean_squared_error',
                          cv=5,
                          n_jobs=-1)

lasso_grid.fit(X_train,y_train)
print(f'Best score of Lasso : {-1 * lasso_grid.best_score_}')

Best score of Lasso : 1.1052229107831102


In [26]:
coefs = lasso_grid.best_estimator_.coef_
px.bar(x=X_train.columns, y=coefs)

## Decision tree

In [27]:
decision_tree = DecisionTreeRegressor(max_depth=100 , max_features = 5 , min_samples_leaf=5 ,
                           min_samples_split = 10, random_state=42)
scores = cross_val_score(decision_tree,X_train,y_train,
                         scoring='neg_mean_squared_error',
                         n_jobs=-1) 
print(f"Desicion Tree Score : mean={-1 * np.mean(scores)} std={np.std(scores)}")

Desicion Tree Score : mean=0.5445981840007759 std=0.046809415900496205


In [28]:
decision_tree.fit(X_train,y_train)
dt_preds = decision_tree.predict(X_test)
print(f"MSE Decision Tree on Test set : {mean_squared_error(y_test,dt_preds)}")

MSE Decision Tree on Test set : 0.5494878622940943


In [29]:
px.bar(x=X_train.columns , y=decision_tree.feature_importances_)

#### Tune Hyperparameters

In [30]:
param_grid = {"min_samples_split": [2, 4 , 7 , 10 , 14 , 18, 21, 25, 30],
              "max_depth": range(50,150,5),
              "min_samples_leaf": range(2,30,2),
              "max_features":range(15,62,5)
              }


random_cv_dtm = RandomizedSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                                   param_distributions=param_grid,
                                   scoring='neg_mean_squared_error',
                                   n_iter=70,
                                   cv=10,
                                   n_jobs=-1)

random_cv_dtm.fit(X_train,y_train)


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [31]:
print(f"Decision tree Best score in random search : {-1 * random_cv_dtm.best_score_}")
print(f"Decision tree best params : {random_cv_dtm.best_params_}")

Decision tree Best score in random search : 0.20950146942331563
Decision tree best params : {'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 40, 'max_depth': 60}


In [32]:
best_tree = random_cv_dtm.best_estimator_
best_tree.fit(X_train, y_train)
best_tree_preds = best_tree.predict(X_test)
print(f"MSE best Decision Tree on Test set : {mean_squared_error(y_test,best_tree_preds)}")

MSE best Decision Tree on Test set : 0.19921608387996742


In [33]:
px.bar(x=X_train.columns , y=best_tree.feature_importances_)

## Random Forest

#### RF with default hyperparameters 

In [17]:
rf_default = RandomForestRegressor(n_jobs=-1)
scores = cross_val_score(rf_default,X_train,y_train,
                         scoring='neg_mean_squared_error',
                         n_jobs=-1) 
print(f"Random Forest Score : mean={-1 * np.mean(scores)} std={np.std(scores)}")

Random Forest Score : mean=0.11235237874427657 std=0.0016712433379419192


In [16]:
rf_default = RandomForestRegressor(n_jobs=-1)
rf_default.fit(X_train, y_train)
rf_def_preds = rf_default.predict(X_test)
print(f"MSE default RF on Test set : {mean_squared_error(y_test,rf_def_preds)}")

MSE default RF on Test set : 0.10665510407884142


#### RF with decision tree best parameters

In [18]:
rf_best_params = RandomForestRegressor(min_samples_split= 21,
                                       min_samples_leaf= 2, 
                                       max_features= 40,
                                       max_depth= 75,
                                       n_jobs=-1)
rf_best_params.fit(X_train, y_train)
rf_best_preds = rf_best_params.predict(X_test)
print(f"MSE best RF on Test set : {mean_squared_error(y_test,rf_best_preds)}")

MSE best RF on Test set : 0.12517494973525817


#### Tune `n_estimators` for RF

In [None]:
from sklearn.model_selection import validation_curve
rf_best_params = RandomForestRegressor(min_samples_split= 21,
                                       min_samples_leaf= 2, 
                                       max_features= 40,
                                       max_depth= 75,
                                       n_jobs=-1)
n_estimators_values = range(150,300,50)
train_scores, test_scores = validation_curve(rf_best_params,
                 X=X_train,
                 y=y_train,
                 param_name='n_estimators',
                 param_range=n_estimators_values,
                 scoring='neg_mean_squared_error',
                 n_jobs=-1)

In [26]:
fig_k = px.bar(x=list(n_estimators_values),y=-1 * train_scores.mean(axis=1))
fig_k.add_bar(x=list(n_estimators_values),y=-1 * test_scores.mean(axis=1),text='validation')
fig_k.show()