In [1]:
# auto reload libraries (you do need to re-import libraries if you make changes)
%load_ext autoreload
%autoreload 2

# base 
import pandas as pd
import numpy as np
from pprint import pprint

# preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import TruncatedSVD


# models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# own defined functions/classes 
from preprocessing.preprocess_pipe import MakeLowerCase
from preprocessing.preprocess_pipe import HdResolutionCategorizer
from preprocessing.preprocess_pipe import StorageCategorizer
from preprocessing.preprocess_pipe import SsdCategorizer

from preprocessing.preprocess_pipe import print_missing
from preprocessing.preprocess_pipe import calculate_perf
from preprocessing.preprocess_pipe import custom_scoring_func


In [2]:
# global parameters
pd.set_option('display.max_columns', 500)

## TODO
- Add custum scoring metric: **DONE**
- further integrate preprocessing steps from Arnaud **DONE**
- Integerate steps from Victor **Partly DONE**
- Integrate steps from Bram 
- Integrate custum scoring metric (also when performing the cross validation/grid search) **DONE**
- Make function that checkts whether prediction of the maximum price are >= predictions of the minimum price
- Investigate why there is such a huge difference between the performance on 'our test set' and the performance of submission test set
- Implement better missing values imputations methods
- Add some post processing visualizations:
    - feature importance plots
    - look at our predictions visually (do the make since?)
    - look at the residuals
    - Have a look at this especially section 5 for model interpretability ideas
    https://christophm.github.io/interpretable-ml-book/pdp.html

# Step 1: import data and split in train and validation set
The validation set is more our own kind of test set

In [4]:
# read in trainig and validation data
# use the same data split as we did in R
df_all_train = pd.read_csv("../data/train.csv", sep=',')
df_test = pd.read_csv("../data/test.csv", sep=',')

print(f'Dimensions of all training data {df_all_train.shape}')
print(f'Dimension test data {df_test.shape}')

Dimensions of all training data (510, 22)
Dimension test data (222, 20)


In [5]:
# split in training and validation set
df_train = df_all_train.sample(frac=0.75, random_state=0, replace=False)
df_val = df_all_train.drop(df_train.index)

# reset index, if you don't resit missing rows get inserted in the pipeline
# see: https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
df_train = df_train.reset_index().drop('index',axis=1)
df_val = df_val.reset_index().drop('index',axis=1)

# Step 2: define pipeline

The pipeline should handle all steps performed on the data from data cleaning till the acutal model prediction. 
It should also be fairly easily to apply this to the test data. Therefore we make use of the scikit learn pipeline which allows all of this functionality. You can also implement your own `custom` transformations.

For more information on how to use and why using pipelines see:
 - https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf 
 - scikit learn api on pipelines: https://scikit-learn.org/stable/modules/compose.html

We make a difference between the pre processing steps for
- numerical features
- categorical features

It's important to note that if a feature is in the numerical feature pipeline it can't be in the categorical feature pipeline. Also if some preprocessing steps in the numerical feature pipeline need features from the categorical pipeline you will get an error. All preprocessing steps in each pipeline should be able to perform the transformation only with the categorical or numerical features. 

You can implement some custom transformation steps like 'MakeLowerCase' ==> see preprocessing.preprocess_pipe for more details or check this article  these two article (also gives some explanation on how to implement pipelines)

- https://gist.github.com/amberjrivera/8c5c145516f5a2e894681e16a8095b5c

In [6]:
#Numerical features to pass down the numerical pipeline 
numerical_features = ['screen_size' ,'pixels_x','pixels_y',
                      'ram', 'weight','ssd','storage']

#Categrical features to pass down the categorical pipeline 
categorical_features = ['brand','base_name', 'screen_surface','touchscreen',
                        'cpu','cpu_details','detachable_keyboard',
                        'discrete_gpu','gpu', 'os','os_details']

# define all unique features
features = np.unique(numerical_features).tolist() + np.unique(categorical_features).tolist()

# target variables
target = ['min_price','max_price']

In [7]:
# train
X_train = df_train[features]
y_train = df_train[target]

# validation (this is kind our own test set)
X_val  = df_val[features]
y_val = df_val[target]

# train_validation (this is all training data we have) for fitting the model
X_all_train = df_all_train[features]
y_all_train = df_all_train[target]

# test
X_test = df_test[features]

In [8]:
# you can add many more and 
# you can even define custom preprocessing steps like 'MakeLowerCase()'

# pipeline  uses only numerical features,
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    #('imputer', KNNImputer(n_neighbors=5, weights='uniform',metric='nan_euclidean')),
    ('imputer', IterativeImputer(missing_values=np.nan, random_state=0, n_nearest_features=None, sample_posterior=True)),
    #('transformation', PowerTransformer(method='yeo-johnson',standardize=False)),
     #Scale features using statistics that are robust to outliers.
    ('scaler', RobustScaler(with_centering=True, with_scaling=True, quantile_range=(10.0, 90.0)))]) 

# pipeline use only categorical features
categorical_transformer = Pipeline(steps=[ 
    ('lowercase', MakeLowerCase()), # lower cases all columns containing strings
    #('sd_category' ,SsdCategorizer(drop_original_feature=True)),
    #('storage_category', StorageCategorizer(drop_original_feature=True)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# add both preprocessing pipelines in one pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
# this is how the preporcessed pipeline looks like (just to have an idea)
pd.DataFrame(preprocessor.fit_transform(X_train)).shape

(382, 1)

# Step 3: add models to pipeline
## 1) Random Forest

### A) Training and parameter tuning

In [10]:
# define model: I just add some default parameters but you could
# also just write: RandomForestRegressor() since we will perform a grid search 
# to find good hyperparameter values
model_rf = RandomForestRegressor(random_state=1, criterion='mae', bootstrap=True)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model_rf.get_params())

# add to pipeline
pipeline_rf = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_rf)])


# add transformation on the target variable, by default power transformation 
# also performs standardization after performing the power transformation
# and back transform to the original space when outputting predictions 
transformer_target = PowerTransformer(method='yeo-johnson',standardize=True)
pipeline_rf_update = TransformedTargetRegressor(regressor=pipeline_rf, 
                                         transformer=transformer_target)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mae',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [26]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 4000, num = 20)]

# The number of features to consider when looking for the best split:
# - If “auto”, then max_features=n_features.
# - If “sqrt”, then max_features=sqrt(n_features).
# - If “log2”, then max_features=log2(n_features).
# - If None, then max_features=n_features
max_features = ['auto', 'sqrt','log2',None]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(20, 200, num = 20)]
max_depth.append(None)

# The function to measure the quality of a split
criterion = ['mse','mae']

# Minimum number of samples required to split a node
# - If int, then consider min_samples_split as the minimum number.
# - If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
min_samples_split = [0.001, 0.01, 0.025, 0.05]
# Minimum number of samples required at each leaf node

# - If int, then consider min_samples_leaf as the minimum number.
# - If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.
min_samples_leaf = [0.001, 0.01, 0.025, 0.05,]

# Create the random grid
random_grid_rf = {
   'regressor__regressor__n_estimators': n_estimators,
   'regressor__regressor__max_features': max_features,
   'regressor__regressor__max_depth': max_depth,
   'regressor__regressor__criterion': criterion,
   'regressor__regressor__min_samples_split': min_samples_split,
   'regressor__regressor__min_samples_leaf': min_samples_leaf}

#### Helpful articles:
 - custom scoring metric: https://stackoverflow.com/questions/48468115/how-to-create-a-customized-scoring-function-in-scikit-learn-for-scoring-a-set-of
 - random parameter search: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
you can also use the traditional grid search in pyhton, but I prefer the randomizedSearch
 - Once the optimal parameters are found for the model, you don't need to run this again

In [21]:
# define random search (and narrow down time grid search)
rf_random_search = RandomizedSearchCV(
   estimator = pipeline_rf_update, 
   param_distributions = random_grid_rf, n_iter = 20,  
   cv = 10, verbose=2, random_state=1, n_jobs = -1, refit=True,
   scoring=make_scorer(custom_scoring_func, greater_is_better=False)
)


# run grid search and refit with best hyper parameters
rf_random_search.fit(X_train, y_train)  
print(rf_random_search.best_params_)    
print(rf_random_search.best_score_)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter n_estimators for estimator TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
                           regressor=Pipeline(memory=None,
                                              steps=[('preprocessor',
                                                      ColumnTransformer(n_jobs=None,
                                                                        remainder='drop',
                                                                        sparse_threshold=0.3,
                                                                        transformer_weights=None,
                                                                        transformers=[('num',
                                                                                       Pipeline(memory=None,
                                                                                                steps=[('imputer',
                                                                                                        IterativeImputer(add_indicator=False,
                                                                                                                         estimator=None,
                                                                                                                         imputation_order=...
                                                                            max_features='auto',
                                                                            max_leaf_nodes=None,
                                                                            max_samples=None,
                                                                            min_impurity_decrease=0.0,
                                                                            min_impurity_split=None,
                                                                            min_samples_leaf=1,
                                                                            min_samples_split=2,
                                                                            min_weight_fraction_leaf=0.0,
                                                                            n_estimators=100,
                                                                            n_jobs=None,
                                                                            oob_score=False,
                                                                            random_state=1,
                                                                            verbose=0,
                                                                            warm_start=False))],
                                              verbose=False),
                           transformer=PowerTransformer(copy=True,
                                                        method='yeo-johnson',
                                                        standardize=True)). Check the list of available parameters with `estimator.get_params().keys()`.

- **TODO make a plot that visualizes hyperparameters (maybe kind of heatmap)**
- Once we have found good tuning parameters write them down so we don't need to redo this step over and over

In [15]:
# have look at the best hyperparameters and their respective performance (maybe also look at the sd)
pd.DataFrame(rf_random_search.cv_results_).sort_values(
    by=['mean_test_score'],ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__regressor__n_estimators,param_regressor__regressor__min_samples_split,param_regressor__regressor__min_samples_leaf,param_regressor__regressor__max_features,param_regressor__regressor__max_depth,param_regressor__regressor__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,31.905521,2.291826,0.673621,0.081532,2710,0.01,0.01,,29,mse,"{'regressor__regressor__n_estimators': 2710, '...",-271.807497,-491.544084,-300.191495,-259.885802,-346.331515,-437.885004,-310.70021,-404.912016,-326.86104,-224.964401,-337.508306,79.78482,1
3,160.234444,2.397188,0.498756,0.125669,1789,0.001,0.01,auto,143,mae,"{'regressor__regressor__n_estimators': 1789, '...",-265.614472,-514.263427,-297.168201,-285.780397,-344.010039,-450.697282,-312.054909,-403.668284,-334.922721,-212.709695,-342.088943,86.002784,2
7,59.629014,2.309821,0.264329,0.039237,1052,0.2,0.01,,200,mae,"{'regressor__regressor__n_estimators': 1052, '...",-347.097384,-601.049926,-326.20448,-377.856443,-394.21307,-501.337968,-397.639881,-435.187274,-380.328694,-281.407223,-404.232234,86.587846,3
5,80.65795,4.263683,1.195949,0.426722,4000,0.025,0.1,,171,mae,"{'regressor__regressor__n_estimators': 4000, '...",-404.774388,-614.720666,-378.318576,-454.197086,-485.439579,-511.058218,-371.93277,-492.081397,-439.081769,-361.057415,-451.266186,74.294994,4
9,43.493094,3.449362,0.639099,0.147347,1973,0.025,0.1,auto,143,mae,"{'regressor__regressor__n_estimators': 1973, '...",-405.08796,-615.673623,-378.203569,-453.525028,-484.163255,-510.721119,-372.306905,-492.854036,-441.099973,-362.349818,-451.598529,74.226056,5


### B) Performance on validation data

In [16]:
calculate_perf(y_val, rf_random_search.predict(X_val))

{'minimum price': 122.62542422255532,
 'maximum price': 125.07706505859072,
 'total error': 247.70248928114603}

### C) Post processing

 - inspect predictions/residuals (make visualisations) (See Bram)
 - feature importance (see Bram)

### D) Predictions test data

Refit on all training data (using the parameters found on the random search) and submit prediction

In [19]:
# train your final model on all data with best parameters 
model_rf_final = RandomForestRegressor(
     **rf_random_search.best_params_,
     bootstrap=True,
     n_jobs=-1
)
# add to pipeline
pipeline_rf_final = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_rf_final)],
              verbose=True)

# again add transformer for target variable
pipeline_rf_final = TransformedTargetRegressor(regressor=pipeline_rf_final, 
                                         transformer=transformer_target)

# fit final model on all training data we have at hand
pipeline_rf_final = pipeline_rf_final.fit(X_all_train, y_all_train)

TypeError: __init__() got an unexpected keyword argument 'regressor__regressor__n_estimators'

In [131]:
# performance on all data where the model was fit one (should be very low)
calculate_perf(pipeline_rf_final.predict(X_all_train), y_all_train)

{'minimum price': 4.037629680842831,
 'maximum price': 4.20369025192588,
 'total error': 8.241319932768711}

In [19]:
# make predictions on test data
rf_pred_test = pipeline_rf_final.predict(X_test)

# submission format
rf_submission_format = pd.DataFrame.from_dict(
 {'ID':df_test['id'].values,
 'MIN':rf_pred_test[:,0],
 'MAX':rf_pred_test[:,1]}).set_index('ID')

# write to csv
rf_submission_format.to_csv('../output/predictions_test/rf_python.csv' ,
                            header=True, index=True, sep=',')

## 2) Gradient Boosting

## 3) Support Vector Machine

## 4) Penalized Linear regression (Lasso, Ridge, Elastic net)