In [24]:
import warnings

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from src.modelling import MLWorkflow, Tuner
from src.preparation import Preprocessor

warnings.filterwarnings('ignore')

We will evaluate performance on our training set, starting with two ML algorithms: Linear Regression and DecisionTreeRegressor. I expect DecisionTreeRegressor to perform better, since there were many non-linear relationships between the features and our target variable.

In [25]:
# Read the four pre-split CSV files: features and targets for the train and test sets
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train.csv',
        '../data/X_test.csv',
        '../data/y_train.csv',
        '../data/y_test.csv',
    )
)

In [26]:
# Bundle the four splits in one mapping keyed by split name;
# the loss helper used further down receives this mapping as a whole
components = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [27]:
# Quick look at the training features stored in the components mapping
components["X_train"]

Unnamed: 0,property_type,surface_total_in_m2,borough,lat,lon
0,house,,Álvaro Obregón,19.346749,-99.233710
1,apartment,,Benito Juárez,19.370046,-99.146155
2,apartment,,Iztacalco,19.410307,-99.065399
3,apartment,,Álvaro Obregón,19.366302,-99.193367
4,apartment,49.0,Tláhuac,19.299023,-99.043647
...,...,...,...,...,...
17053,apartment,238.0,Cuajimalpa de Morelos,19.360375,-99.270396
17054,apartment,143.0,Benito Juárez,19.394358,-99.177444
17055,house,60.0,Benito Juárez,19.358803,-99.160426
17056,house,,Álvaro Obregón,19.347357,-99.254654


In [28]:
# Loading the pipeline
# The preprocessing pipeline is restored from a serialized file instead of being rebuilt here.
# NOTE(review): the '.pkl' extension suggests pickle-based loading — only load files from a
# trusted source; confirm the loading mechanism in src.preparation.
preprocessor = Preprocessor()
preprocessor_pipeline = preprocessor.load_preprocessor('../serials/preprocessor_1.pkl')

In [29]:
ml_work_flow = MLWorkflow()
# Build the full pipeline
# Combines the loaded preprocessing pipeline with a LinearRegression estimator;
# 'linear_reg' is presumably the name of the estimator step (TODO confirm in src.modelling).
# NOTE(review): the same `preprocessor_pipeline` object is passed to every pipeline built in
# this notebook — confirm that sharing one instance between pipelines is intended.
regressor_full_pipeline = ml_work_flow.build_modelling_pipeline(preprocessor_pipeline, 'linear_reg',LinearRegression())

In [30]:
# Loss of the linear-regression pipeline with process='train' (value shown in the cell output).
# NOTE(review): whether return_loss fits the pipeline before scoring is not visible here — confirm.
MLWorkflow.return_loss(regressor_full_pipeline, components, process='train')

1.1004493018584096

In [31]:
# Decision-tree pipeline built the same way as the linear one. random_state is pinned so that
# tie-breaking between equally good splits is repeatable across runs (the random forest used
# later in this notebook is seeded with the same value).
decision_trees_full_pipeline = ml_work_flow.build_modelling_pipeline(preprocessor_pipeline,
                                                                     'decision_tree',
                                                                     DecisionTreeRegressor(random_state=42))
# returning training loss
MLWorkflow.return_loss(decision_trees_full_pipeline, components, process='train')

0.08732369937564363

The mean squared error on the training set is very small. That could be a good sign, but it is much more likely that the model has overfit the data. We don't want to touch the test set until we have a model we are confident about, so we need to split our training set into training and validation sets in order to perform hyperparameter tuning (fine-tuning) on the chosen models. We will use several tuning methods: cross-validation scores, grid search and RandomizedSearchCV.

In [32]:
# Seeing the results of cross validation score, to understand more closely the insights.
# Tuner.cross_validation is the same helper that the linear-regression and random-forest
# cells call, so every model is validated through one code path (Tuner is imported in the
# import cell at the top of the notebook).
cross_val_score_decision_trees = Tuner.cross_validation(decision_trees_full_pipeline,
                                                        components['X_train'],
                                                        components['y_train'])

In [33]:
# Summary statistics of the decision tree's cross-validation scores (10 folds)
decision_tree_cv_scores = pd.Series(cross_val_score_decision_trees)
decision_tree_cv_scores.describe()

count    10.000000
mean      0.756770
std       0.023974
min       0.716744
25%       0.738201
50%       0.762332
75%       0.775335
max       0.786310
dtype: float64

The results clearly indicate overfitting: the decision tree has a very low training error of about 0.09, but an average validation error of about 0.76. Decision trees are, in general, models that overfit easily, so let's see what happens with Linear Regression, for which we had a training error of about 1.10.

In [34]:
# Cross-validation scores for the linear-regression pipeline, computed on the training
# split only (the summary in the next cell reports 10 scores).
cross_val_score_linear_regression = Tuner.cross_validation(regressor_full_pipeline,
                                                           components['X_train'],
                                                           components['y_train'])

In [35]:
# Summary statistics of the linear regression's cross-validation scores
linear_regression_cv_scores = pd.Series(cross_val_score_linear_regression)
linear_regression_cv_scores.describe()

count    10.000000
mean      1.052482
std       0.018824
min       1.019067
25%       1.046264
50%       1.053849
75%       1.063719
max       1.082322
dtype: float64

Surprisingly, linear regression generalizes much better: its mean validation score of 1.05 is very close to its training score of 1.10, so it does not overfit, although its error remains higher than the decision tree's mean validation error of 0.76. We will still have to improve the models further by fine-tuning them. We can now also consider Random Forest, an ensemble model made of several decision trees whose final prediction is simply the average of the individual trees' predictions. Ensemble models are less likely to overfit than single decision trees, so let's test this with Random Forest.

In [36]:
# Random-forest pipeline built from the same loaded preprocessing pipeline;
# random_state=42 seeds the forest so repeated runs are comparable.
random_forest_full_pipeline = ml_work_flow.build_modelling_pipeline(preprocessor_pipeline,
                                                                    'random_forest',
                                                                    RandomForestRegressor(random_state=42))

In [37]:
# Loss of the random-forest pipeline with process='train' (value shown in the cell output)
MLWorkflow.return_loss(random_forest_full_pipeline, components, process='train')

0.13127965846746426

In [38]:
# Cross-validation scores for the random-forest pipeline, computed on the training split only
cross_val_score_random_forest = Tuner.cross_validation(random_forest_full_pipeline,
                                                           components['X_train'],
                                                           components['y_train'])

In [39]:
# Summary statistics of the random forest's cross-validation scores
random_forest_cv_scores = pd.Series(cross_val_score_random_forest)
random_forest_cv_scores.describe()

count    10.000000
mean      0.658400
std       0.020185
min       0.629453
25%       0.644441
50%       0.655686
75%       0.668090
max       0.699574
dtype: float64

In [17]:
from src.modelling import RANDOM_FOREST_GS

In [18]:
# Tuner for the random-forest pipeline, configured with RANDOM_FOREST_GS from src.modelling
# (presumably the parameter grid for the grid search — TODO confirm).
tuner = Tuner(random_forest_full_pipeline, RANDOM_FOREST_GS)

In [19]:
# Run the grid search on the training split only; the test split is not passed in here.
results_grid_search = tuner.grid_search_tune(components['X_train'],
                       components['y_train'])

In [20]:
# Display the grid-search result: per the output, a tuple that starts with the selected
# hyperparameters, followed by a table of per-candidate timings.
results_grid_search

({'preprocessing__svd__n_components': 10,
  'preprocessing__transformer__lat_lon__cluster_similarity__n_clusters': 15,
  'random_forest__max_features': 6},
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 42       5.138352      0.050913         0.066325        0.000555   
 43       6.658514      0.113043         0.065792        0.000161   
 44       8.218420      0.245496         0.065808        0.000632   
 36       5.136039      0.064560         0.065526        0.000631   
 38       6.641528      0.122404         0.065433        0.000276   
 37       6.645819      0.110891         0.065757        0.000681   
 32       5.246269      0.100850         0.066622        0.002336   
 31       5.149628      0.142851         0.065728        0.001808   
 30       5.198498      0.148467         0.064928        0.000516   
 39       5.068030      0.070109         0.084922        0.042096   
 25       5.052405      0.046765         0.069105        0.009095   
 26       6.5842

When the hyperparameter space is large, there is another option: RandomizedSearchCV. Instead of trying out all possible combinations, it evaluates a fixed number of combinations, selecting a random value for each hyperparameter at every iteration. We will use RandomizedSearchCV for the final tuning of our random forest model and keep the result as our final model.

In [42]:
from src.modelling import RANDOM_FOREST_RS

In [43]:
# Tuner for the random-forest pipeline, this time configured with RANDOM_FOREST_RS
# (presumably the search space for the randomized search — TODO confirm).
# NOTE(review): rebinding `tuner` replaces the grid-search tuner created earlier.
tuner = Tuner(random_forest_full_pipeline, RANDOM_FOREST_RS)

# Randomized search on the training split only. 'random_forest' and 'transformer' match the
# step names that appear in the grid-search parameter keys; how randomized_search_tune uses
# them is not visible here — confirm in src.modelling.
final_model = tuner.randomized_search_tune(components['X_train'],
                                                         components['y_train'],
                                                         model_name='random_forest',
                                                         transformer='transformer')

Now that we are ready to test the model's performance on the test data, we will use two approaches. First, since truncated SVD yielded very good results, we will see how the model works in inference mode. Then we will also create a preprocessing pipeline that keeps the original features (i.e., without applying dimensionality reduction) and repeat the same steps as before. We do this in order to visualize the most important features for predicting house prices.

In [44]:
# Display the object returned by the randomized search
final_model  # the final pipeline

In [45]:
# test mse score
# NOTE(review): this is the first visible use of the test split in the notebook; any further
# model selection should rely on the training data so the test score stays an honest estimate.
print(MLWorkflow.return_loss(final_model, components, process='test'))

# cross val score for the fine-tuned model
# Computed on the training split, so it is directly comparable with the cross-validation
# summaries of the untuned models above.
cross_val_score_rf_fine_tuned = Tuner.cross_validation(final_model,
                                                           components['X_train'],
                                                           components['y_train'])

0.36510121919687283


In [46]:
# Summary statistics of the fine-tuned random forest's cross-validation scores
rf_fine_tuned_cv_scores = pd.Series(cross_val_score_rf_fine_tuned)
rf_fine_tuned_cv_scores.describe()

count    10.000000
mean      0.623764
std       0.018755
min       0.598829
25%       0.607825
50%       0.622662
75%       0.637922
max       0.655653
dtype: float64

We can see that the fine-tuned model gained only a small improvement in the cross-validation score (a mean of 0.62, versus 0.66 before tuning). We will keep this model for now, do some inference testing, and then repeat the process with the original features, without applying dimensionality reduction.

In [47]:
# Persist the tuned pipeline under ../serials so it can be reloaded outside this notebook
ml_work_flow.save_model(final_model, '../serials/regression_model.pkl')