In [1]:
# auto reload libraries (you do need to re-import libraries if you make changes)
%load_ext autoreload
%autoreload 2

# base 
import pandas as pd
import numpy as np
from pprint import pprint

# preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer

# models
from sklearn.ensemble import RandomForestRegressor

# own defined functions/classes 
from  preprocessing.preprocess_pipe import MakeLowerCase
from  preprocessing.preprocess_pipe import print_missing
from  preprocessing.preprocess_pipe import calculate_perf
from  preprocessing.preprocess_pipe import custom_scoring_func

In [2]:
# notebook-wide display settings: render up to 500 columns of wide frames
pd.options.display.max_columns = 500

## TODO
- Add custom scoring metric: **DONE**
- further integrate preprocessing steps from Arnaud
- Integrate steps from Victor
- fix column touchscreen (there are some mistakes)
- fix discrete_gpu
- Check screen size (some outlier values)
- pixels_y
- Integrate custom scoring metric (also when performing the cross validation/grid search)
- Make a function that checks whether the predictions of the maximum price are >= the predictions of the minimum price
- Implement better missing-value imputation methods
- Add some post processing visualizations:
    - feature importance plots
    - look at our predictions visually (do they make sense?)
    - look at the residuals
    - Have a look at this especially section 5 for model interpretability ideas
    https://christophm.github.io/interpretable-ml-book/pdp.html

In [34]:
# read in the training and test data
# (the same data split as was used in R)
df_all_train, df_test = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

# sanity check: report (rows, columns) of both frames
print(f'Dimensions of all training data {df_all_train.shape}')
print(f'Dimension test data {df_test.shape}')

Dimensions of all training data (510, 22)
Dimension test data (222, 20)


In [39]:
# split the labelled data into a training set (75%) and a validation set (25%);
# the fixed random_state keeps the split reproducible across runs
df_train = df_all_train.sample(frac=0.75, random_state=0)
df_val = df_all_train.drop(df_train.index)

# Reset both frames to a clean 0..n-1 index. Per the original author's note,
# without the reset missing rows get inserted in the pipeline.
# drop=True discards the old row labels instead of adding them to the frame
# as an extra 'index' column.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [41]:
# report which columns of the training split contain missing values
# (print_missing is a project helper imported from preprocessing.preprocess_pipe)
print_missing(df_train)

Unnamed: 0,missing count,missing %
screen_surface,10,2.62
cpu_details,6,1.57
detachable_keyboard,4,1.05
weight,2,0.52
os_details,2,0.52
os,2,0.52
gpu,1,0.26


# Define Pipeline
Distinguish between the preprocessing steps for
- numerical features
- categorical features

I still don't use all features, since some extra data cleaning is needed on certain features

In [42]:
# columns passed down the numerical preprocessing pipeline
numerical_features = [
    'screen_size',
    'pixels_x',
    'detachable_keyboard',
    'ram',
    'ssd',
    'storage',
    'weight',
]

# columns passed down the categorical preprocessing pipeline
# (the variable name is misspelled, but other cells reference it, so it is kept)
cateforical_features = [
    'brand',
    'screen_surface',
    'touchscreen',
    'cpu',
    'pixels_y',
    'discrete_gpu',
    'gpu',
    'os',
]

# full model input: numerical columns first, then the categorical ones
features = [*numerical_features, *cateforical_features]

# the two price bounds that are predicted
target = ['min_price', 'max_price']

In [44]:
# training split
X_train, y_train = df_train.loc[:, features], df_train.loc[:, target]

# validation split (this is kind of our own test set)
X_val, y_val = df_val.loc[:, features], df_val.loc[:, target]

# training + validation together (all training data we have) for fitting the model
X_all_train, y_all_train = df_all_train.loc[:, features], df_all_train.loc[:, target]

# test data: only the feature columns are selected, no target
X_test = df_test.loc[:, features]

In [49]:
# More steps can be added to either pipeline, including custom
# preprocessing steps such as 'MakeLowerCase()'.

# numerical columns: median imputation followed by standardisation
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# categorical columns: lower-case the string columns, impute with the
# most frequent value, then one-hot encode (categories not seen during
# fitting are ignored at transform time)
categorical_transformer = Pipeline([
    ('lowercase', MakeLowerCase()),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# route each group of columns through its own pipeline
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, cateforical_features),
])

# Models
## 1) Random Forest

### 1.A) Training and parameter tuning

In [50]:
# Baseline forest: only the seed is fixed here, because the hyperparameters
# are tuned by the search in the cells below.
model_rf = RandomForestRegressor(random_state=1)

# show the parameters the untuned forest currently uses
print('Parameters currently in use:\n')
pprint(model_rf.get_params())

# chain preprocessing and the regressor into one estimator
pipeline_rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', model_rf),
    ],
    memory=None,
)

# Transform the target with a Yeo-Johnson power transformation
# (standardize=True also standardises the transformed target).
# TransformedTargetRegressor maps predictions back to the original space.
transformer_target = PowerTransformer(method='yeo-johnson', standardize=True)
pipeline_rf_update = TransformedTargetRegressor(
    transformer=transformer_target,
    regressor=pipeline_rf,
)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [51]:
# Candidate values for the randomized hyperparameter search.

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=800, stop=4000, num=20)]

# Number of features to consider at every split.
# 1.0 (all features) replaces the former 'auto' option: for a regressor they
# select the same feature set, but 'auto' was deprecated in scikit-learn 1.1
# and removed in 1.3, while the float form works on old and new versions.
max_features = [1.0, 'sqrt']

# Maximum number of levels in a tree (None = grow until the leaves are pure
# or the min_samples_* constraints stop the split)
max_depth = [int(x) for x in np.linspace(10, 110, num=20)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid. The double 'regressor__' prefix first addresses the
# `regressor` argument of the TransformedTargetRegressor and then the
# 'regressor' step of the pipeline wrapped inside it.
random_grid_rf = {
    'regressor__regressor__n_estimators': n_estimators,
    'regressor__regressor__max_features': max_features,
    'regressor__regressor__max_depth': max_depth,
    'regressor__regressor__min_samples_split': min_samples_split,
    'regressor__regressor__min_samples_leaf': min_samples_leaf,
    'regressor__regressor__bootstrap': bootstrap,
}

## Helpful articles:
 - custom scoring metric: https://stackoverflow.com/questions/48468115/how-to-create-a-customized-scoring-function-in-scikit-learn-for-scoring-a-set-of
 - random parameter search: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [None]:
# Scorer built from the project metric. greater_is_better=False marks it as a
# loss, so scikit-learn negates it and "higher is better" still holds.
rf_scorer = make_scorer(custom_scoring_func, greater_is_better=False)

# randomized search: 20 sampled candidates, each evaluated with 10-fold CV;
# refit=True retrains the best candidate on the full training split
rf_random_search = RandomizedSearchCV(
    estimator=pipeline_rf_update,
    param_distributions=random_grid_rf,
    scoring=rf_scorer,
    n_iter=20,
    cv=10,
    refit=True,
    random_state=1,
    n_jobs=-1,
    verbose=2,
)

# run the random search and report the winning hyperparameters and their score
rf_random_search.fit(X_train, y_train)
print(rf_random_search.best_params_)
print(rf_random_search.best_score_)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


- **TODO make a plot that visualizes hyperparameters (maybe kind of heatmap)**
- Once we have found good tuning parameters write them down so we don't need to redo this step over and over

In [None]:
# Top 5 candidates by mean cross-validated score. Scores are negated losses,
# so the largest value is the best. The std_test_score column shows the
# spread across folds.
rf_cv_results = pd.DataFrame(rf_random_search.cv_results_)
rf_cv_results.sort_values(by='mean_test_score', ascending=False).head(5)

### 1.B) Performance on validation data

In [None]:
# evaluate the refitted best estimator on the held-out validation split
rf_pred_val = rf_random_search.predict(X_val)
calculate_perf(y_val, rf_pred_val)

### 1.C) Post processing

 - inspect predictions/residuals (make visualisations)
 - feature importance

### 1.D) Predictions test data

Refit on all training data (using the parameters found on the random search) and submit prediction

In [None]:
# Final model: random forest with the best hyperparameters written down from
# the random search, refitted on ALL labelled data (training + validation).
model_rf_final = RandomForestRegressor(
    n_estimators=2821,
    max_depth=31,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=1,  # same seed as model_rf, so the submission is reproducible
)

# preprocessing + regressor; verbose=True prints the time spent in each step
rf_final_steps = Pipeline(
    memory=None,
    steps=[('preprocessor', preprocessor),
           ('regressor', model_rf_final)],
    verbose=True,
)

# again wrap the pipeline so the target is power-transformed for fitting
# and predictions are mapped back to the original price scale
pipeline_rf_final = TransformedTargetRegressor(
    regressor=rf_final_steps,
    transformer=transformer_target,
)

# Fit on all training data at hand. The targets matching X_all_train are
# y_all_train; the previously used `y_train_val` was never defined and
# raised a NameError.
pipeline_rf_final = pipeline_rf_final.fit(X_all_train, y_all_train)

In [None]:
# make predictions on test data
# (the result is indexed below as a 2-D array: column 0 -> MIN, column 1 -> MAX,
#  presumably following the order of `target` -- confirm)
rf_pred_test = pipeline_rf_final.predict(X_test)

In [None]:
# submission table: one row per test record, indexed by its ID, with the
# first prediction column as MIN and the second as MAX
rf_submission_format = pd.DataFrame(
    {
        'ID': df_test['id'].values,
        'MIN': rf_pred_test[:, 0],
        'MAX': rf_pred_test[:, 1],
    }
).set_index('ID')

In [None]:
# persist the submission as a comma-separated file with a header row;
# the ID index is written as the first column
rf_submission_format.to_csv(
    '../output/predictions_test/rf_python.csv',
    sep=',',
    index=True,
    header=True,
)