In [6]:
# auto reload libraries (you do not need to re-import libraries after you make changes)
%load_ext autoreload
%autoreload 2

# base 
import pandas as pd
import numpy as np
from pprint import pprint

# preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer

# models
from sklearn.ensemble import RandomForestRegressor


# own defined classes/functions
from preprocessing.preprocessing import MakeLowerCase
from preprocessing.preprocessing import print_missing
from preprocessing.preprocessing import calculate_perf
from preprocessing.preprocessing import custom_scoring_func

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# notebook-wide display settings: upper bound on the number of columns
# pandas renders before it starts truncating wide frames
MAX_DISPLAY_COLUMNS = 500
pd.set_option('display.max_columns', MAX_DISPLAY_COLUMNS)

## TODO
- Add custom scoring metric: **DONE**
- further integrate preprocessing steps from Arnaud
- Integrate steps from Victor
- fix column touchscreen (there are some mistakes)
- fix discrete_gpu
- Check screen size (some outlier values)
- pixels_y
- Integrate custom scoring metric (also when performing the cross validation/grid search)
- Make a function that checks whether the predictions of the maximum price are >= the predictions of the minimum price
- Implement better missing values imputations methods
- Add some post processing visualizations:
    - feature importance plots
    - look at our predictions visually (do they make sense?)
    - look at the residuals
    - Have a look at this especially section 5 for model interpretability ideas
    https://christophm.github.io/interpretable-ml-book/pdp.html

In [8]:
# Load the training and validation data.
# The train/validation split is the same one that was used in R.
df_train = pd.read_csv("../data/train_train.csv", sep=';')
df_val = pd.read_csv("../data/train_validation.csv", sep=';')
# train + validation together: all labelled data we have
df_train_val = pd.read_csv("../data/train.csv", sep=';')
# note: the test file is comma separated, unlike the training files
df_test = pd.read_csv("../data/test.csv", sep=',')

# report the shape of every frame as a quick sanity check
for label, frame in [
    ('Dimensions training data', df_train),
    ('Dimension validation data', df_val),
    ('Dimension train and validation (all training data) data', df_train_val),
    ('Dimension test data', df_test),
]:
    print(f'{label} {frame.shape}')

Dimensions training data (340, 22)
Dimension validation data (170, 22)
Dimension train and validation (all training data) data (510, 22)
Dimension test data (222, 20)


In [9]:
# overview of missing values (count and %) per column of the training split;
# print_missing is a project helper from preprocessing.preprocessing
print_missing(df_train)

Unnamed: 0,missing count,missing %
screen_surface,8,2.35
cpu_details,5,1.47
weight,1,0.29
gpu,1,0.29
detachable_keyboard,1,0.29


# Define Pipeline
Distinguish between the preprocessing steps for
- numerical features
- categorical features

I still don't use all features, since some extra data cleaning is needed on certain features

In [10]:
# Columns routed through the numerical branch of the preprocessing pipeline
numerical_features = [
    'screen_size', 'pixels_x', 'detachable_keyboard',
    'ram', 'ssd', 'storage', 'weight',
]

# Columns routed through the categorical branch of the preprocessing pipeline
# (the variable name is misspelled, but it is kept because later cells use it)
cateforical_features = [
    'brand', 'screen_surface', 'touchscreen',
    'cpu', 'pixels_y', 'discrete_gpu', 'gpu', 'os',
]

# every feature the model sees: numerical columns first, then categorical
features = [*numerical_features, *cateforical_features]

# the two target columns that are predicted jointly
target = ['min_price', 'max_price']

In [11]:
# training split
X_train, y_train = df_train[features], df_train[target]

# validation split (serves as our own hold-out test set)
X_val, y_val = df_val[features], df_val[target]

# train + validation: all labelled data, used to fit the final model
X_train_val, y_train_val = df_train_val[features], df_train_val[target]

# test data: features only, no targets are selected here
X_test = df_test[features]

In [12]:
# More steps can be added to either branch, including custom transformers
# such as 'MakeLowerCase()'.

# numerical branch: fill gaps with the column median, then standardize
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# categorical branch: lower-case string columns, replace missing values by
# the literal category 'missing', then one-hot encode; categories that were
# not seen during fitting are ignored instead of raising an error
categorical_steps = [
    ('lowercase', MakeLowerCase()),
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# combine both branches; each one only receives its own list of columns
column_branches = [
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, cateforical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Models
## 1) Random Forest

### 1.A) Training and parameter tuning

In [23]:
# Base estimator: a random forest with default hyperparameters (only the seed
# is fixed). Good hyperparameter values are searched for in a later cell.
model_rf = RandomForestRegressor(random_state=1)

# show the parameters the untuned forest currently uses
print('Parameters currently in use:\n')
pprint(model_rf.get_params())

# chain the preprocessing and the forest into a single estimator
rf_steps = [('preprocessor', preprocessor), ('regressor', model_rf)]
pipeline_rf = Pipeline(memory=None, steps=rf_steps)

# Wrap the pipeline so the targets are power transformed (Yeo-Johnson) and
# standardized before fitting; predictions are transformed back to the
# original price scale automatically.
transformer_target = PowerTransformer(method='yeo-johnson', standardize=True)
pipeline_rf = TransformedTargetRegressor(
    regressor=pipeline_rf,
    transformer=transformer_target,
)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [49]:
# Candidate values for the randomized hyperparameter search of the forest.

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=800, stop=4000, num=20)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor;
# 'auto' itself was deprecated in scikit-learn 1.1 and removed in 1.3,
# whereas the float form is accepted by old and new versions alike.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num=20)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8]
# Whether each tree is trained on a bootstrap sample or on the full data
bootstrap = [True, False]

# The double 'regressor__' prefix reaches the forest through both wrappers:
# TransformedTargetRegressor.regressor -> Pipeline step 'regressor'.
random_grid_rf = {
    'regressor__regressor__n_estimators': n_estimators,
    'regressor__regressor__max_features': max_features,
    'regressor__regressor__max_depth': max_depth,
    'regressor__regressor__min_samples_split': min_samples_split,
    'regressor__regressor__min_samples_leaf': min_samples_leaf,
    'regressor__regressor__bootstrap': bootstrap,
}

## Helpful articles:
 - custom scoring metric: https://stackoverflow.com/questions/48468115/how-to-create-a-customized-scoring-function-in-scikit-learn-for-scoring-a-set-of
 - random parameter search: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [50]:
# The custom metric is an error, so lower is better: make_scorer negates it,
# which is why the reported scores are negative.
rf_scorer = make_scorer(custom_scoring_func, greater_is_better=False)

# Randomized search: 20 sampled candidates, each evaluated with 10-fold CV
# (200 fits in total), on all available cores. With refit=True the best
# candidate is refitted on the complete training split afterwards.
rf_random_search = RandomizedSearchCV(
    estimator=pipeline_rf,
    param_distributions=random_grid_rf,
    n_iter=20,
    cv=10,
    scoring=rf_scorer,
    refit=True,
    random_state=1,
    n_jobs=-1,
    verbose=2,
)

# run the search, then report the winning parameters and their mean CV score
rf_random_search.fit(X_train, y_train)
print(rf_random_search.best_params_)
print(rf_random_search.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 27.8min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 31.7min finished


{'regressor__regressor__n_estimators': 2821, 'regressor__regressor__min_samples_split': 2, 'regressor__regressor__min_samples_leaf': 1, 'regressor__regressor__max_features': 'sqrt', 'regressor__regressor__max_depth': 31, 'regressor__regressor__bootstrap': False}
-303.99585255292095


- **TODO make a plot that visualizes hyperparameters (maybe kind of heatmap)**
- Once we have found good tuning parameters write them down so we don't need to redo this step over and over

In [51]:
# have look at the best hyperparameters and their respective performance (maybe also look at the sd)
pd.DataFrame(rf_random_search.cv_results_).sort_values(
    by=['mean_test_score'],ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__regressor__n_estimators,param_regressor__regressor__min_samples_split,param_regressor__regressor__min_samples_leaf,param_regressor__regressor__max_features,param_regressor__regressor__max_depth,param_regressor__regressor__bootstrap,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
10,31.66566,1.603202,1.118415,0.380194,2821,2,1,sqrt,31,False,"{'regressor__regressor__n_estimators': 2821, '...",-222.88103,-428.844346,-238.473387,-318.520556,-346.451989,-284.93303,-282.585692,-296.525171,-320.687415,-300.055909,-303.995853,54.447381,1
7,8.789808,0.625888,0.352214,0.075838,800,2,1,sqrt,73,False,"{'regressor__regressor__n_estimators': 800, 'r...",-222.548653,-429.397604,-240.887915,-317.111389,-352.580297,-289.100281,-281.296716,-298.086278,-321.930554,-300.31617,-305.325586,54.739279,2
19,8.250233,0.332612,0.386171,0.068979,800,5,1,sqrt,52,False,"{'regressor__regressor__n_estimators': 800, 'r...",-236.95124,-435.986898,-242.990144,-324.842966,-346.281604,-268.149599,-282.966164,-298.607383,-324.16185,-304.004172,-306.494202,54.737087,3
1,61.781412,2.791092,1.107418,0.157955,2821,2,1,auto,110,True,"{'regressor__regressor__n_estimators': 2821, '...",-253.15149,-447.608041,-259.633286,-306.649343,-343.408493,-276.215412,-292.793195,-381.305741,-331.976059,-284.347724,-317.708878,57.359032,4
12,53.481692,4.943461,1.348041,0.221395,2315,5,2,auto,99,True,"{'regressor__regressor__n_estimators': 2315, '...",-259.07909,-456.684287,-270.28834,-320.987164,-353.322704,-270.601239,-300.436563,-374.524592,-345.965954,-289.298349,-324.118828,57.636489,5


### 1.B) Performance on validation data

In [52]:
# score the refitted best model on the held-out validation split
rf_pred_val = rf_random_search.predict(X_val)
calculate_perf(y_val, rf_pred_val)

{'minimum price': 139.54931571067183,
 'maximum price': 145.00396316174164,
 'total error': 284.55327887241344}

### 1.C) Post processing

 - inspect predictions/residuals (make visualisations)
 - feature importance

### 1.D) Predictions test data

Refit on all training data (using the parameters found by the random search) and submit the predictions

In [43]:
# Train the final model on all labelled data with the best hyperparameters
# found by the random search. They are written down here so the search does
# not have to be re-run.
model_rf_final = RandomForestRegressor(
     n_estimators=2821,
     max_depth=31,
     max_features='sqrt',
     min_samples_split=2,
     min_samples_leaf=1,
     # bootstrap=False is part of the best parameter set; leaving it out
     # falls back to the default (True) and trains a different model than
     # the one that was selected and validated
     bootstrap=False,
     # same seed as the tuned forest, so the submission is reproducible
     random_state=1,
     n_jobs=-1
)
# chain the preprocessing and the final forest
pipeline_rf_final = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_rf_final)],
              verbose=True)

# again wrap the pipeline so the targets are power transformed before
# fitting and the predictions are transformed back to the price scale
pipeline_rf_final = TransformedTargetRegressor(regressor=pipeline_rf_final,
                                         transformer=transformer_target)

# fit the final model on all training data we have at hand
pipeline_rf_final = pipeline_rf_final.fit(X_train_val, y_train_val)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   9.3s


In [44]:
# make predictions on test data with the model fitted on all training data;
# the submission cell reads column 0 as MIN and column 1 as MAX
rf_pred_test = pipeline_rf_final.predict(X_test)

In [45]:
# Assemble the submission table: one row per test laptop, indexed by its id,
# with the predicted minimum and maximum price as columns.
submission_columns = {
    'ID': df_test['id'].values,
    'MIN': rf_pred_test[:, 0],
    'MAX': rf_pred_test[:, 1],
}
rf_submission_format = pd.DataFrame(submission_columns).set_index('ID')

In [46]:
# store the submission as a comma separated file, including the header row
# and the ID index column
submission_path = '../output/predictions_test/rf_python.csv'
rf_submission_format.to_csv(submission_path, sep=',', header=True, index=True)