In [1]:
# auto reload libraries (you do need to re-import libraries if you make changes)
%load_ext autoreload
%autoreload 2

# base 
import pandas as pd
import numpy as np
from pprint import pprint
import os

# preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import QuantileTransformer
# variable importance
from sklearn.inspection import permutation_importance
# visualize
import matplotlib.pyplot as plt


# models
from sklearn.ensemble import RandomForestRegressor

# own defined functions/classes 
from preprocessing.preprocess_pipe import MakeLowerCase
from preprocessing.preprocess_pipe import HdResolutionCategorizer
from preprocessing.preprocess_pipe import StorageCategorizer
from preprocessing.preprocess_pipe import SsdCategorizer

from preprocessing.preprocess_pipe import print_missing
from preprocessing.preprocess_pipe import calculate_perf
from preprocessing.preprocess_pipe import custom_scoring_func

from modelling.weight_samples import weights_samples

from postprocessing.postprocessing import plot_predictions_results
from postprocessing.postprocessing import plot_residuals

In [2]:
# global parameters
pd.set_option('display.max_columns', 500)
# style for plotting
plt.style.use('ggplot')
# interactive plotting
%matplotlib widget
# run grid search
RUN_GRID_SEARCH = True
# set working directory
uppath = lambda _path, n: os.sep.join(_path.split(os.sep)[:-n])
__file__ = 'C:\\Users\\Pieter-Jan\\Documents\\KuLeuven\\Semester2\\AA\\AAForB_Assignment2\\code\\PieterJan'
out = uppath(__file__, 2)
os.chdir(out)
print(os.getcwd())

C:\Users\Pieter-Jan\Documents\KuLeuven\Semester2\AA\AAForB_Assignment2


## TODO

### PLEASE DEFINE FUNCTIONS IN SEPERATE FILES TO KEEP THE NOTEBOOK CLEAN

- Add custum scoring metric: **DONE**
- further integrate preprocessing steps from Arnaud **DONE**
- Integerate steps from Victor **Partly DONE**
- Look at comments of Victor for next steps
- Look at feature engineering ideas
- Integrate steps from Bram 
- Integrate custum scoring metric (also when performing the cross validation/grid search) **DONE**
- Make function that checkts whether prediction of the maximum price are >= predictions of the minimum price
- Investigate why there is such a huge difference between the performance on 'our test set' and the performance of submission test set
- Make function that saves trained models
- Try out other models (Boosting, Support Vector Machines, Penalized Linear models like lasso, stacked models ...)
- Implement better missing values imputations methods **DONE**
- Add some post processing visualizations:
    - feature importance plots
    - look at our predictions visually (do they make sense?)
    - look at the residuals
    - Have a look at this especially section 5 for model interpretability ideas
    https://christophm.github.io/interpretable-ml-book/pdp.html


# Step 1: import data and split in train and validation set
The validation set is more our own kind of test set

In [3]:
# read in trainig and validation data
# use the same data split as we did in R
df_all_train = pd.read_csv("data\\train.csv", sep=',')
df_test = pd.read_csv("data\\test.csv", sep=',')

print(f'Dimensions of all training data {df_all_train.shape}')
print(f'Dimension test data {df_test.shape}')

Dimensions of all training data (510, 22)
Dimension test data (222, 20)


In [4]:
# split in training and validation set
df_train = df_all_train.sample(frac=0.75, random_state=0, replace=False)
df_val = df_all_train.drop(df_train.index)

# reset index, if you don't resit missing rows get inserted in the pipeline
# see: https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
df_train = df_train.reset_index().drop('index',axis=1)
df_val = df_val.reset_index().drop('index',axis=1)

# Step 2: define pipeline

The pipeline should handle all steps performed on the data from data cleaning till the acutal model prediction. 
It should also be fairly easily to apply this to the test data. Therefore we make use of the scikit learn pipeline which allows all of this functionality. You can also implement your own `custom` transformations.

For more information on how to use and why using pipelines see:
 - https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf 
 - scikit learn api on pipelines: https://scikit-learn.org/stable/modules/compose.html
 - Scikit learn api for more preprocessing steps: https://scikit-learn.org/stable/modules/preprocessing.html#non-linear-transformation

We make a difference between the pre processing steps for
- numerical features
- categorical features

It's important to note that if a feature is in the numerical feature pipeline it can't be in the categorical feature pipeline. Also if some preprocessing steps in the numerical feature pipeline need features from the categorical pipeline you will get an error. All preprocessing steps in each pipeline should be able to perform the transformation only with the categorical or numerical features. 

You can implement some custom transformation steps like 'MakeLowerCase' ==> see preprocessing.preprocess_pipe for more details or check this article  these two article (also gives some explanation on how to implement pipelines)

- https://gist.github.com/amberjrivera/8c5c145516f5a2e894681e16a8095b5c

In [85]:
#Numerical features to pass down the numerical pipeline 
numerical_features = ['screen_size' ,'pixels_x','pixels_y',
                      'ram', 'weight','ssd','storage']

#Categrical features to pass down the categorical pipeline 
categorical_features = ['brand','base_name', 'screen_surface','touchscreen',
                        'cpu','cpu_details','detachable_keyboard',
                        'discrete_gpu','gpu', 'os','os_details']

# define all unique features
features = np.unique(numerical_features).tolist() + np.unique(categorical_features).tolist()

# target variables
target = ['min_price','max_price']

In [86]:
# train
X_train = df_train[features]
y_train = df_train[target]

# validation (this is kind our own test set)
X_val  = df_val[features]
y_val = df_val[target]

# train_validation (this is all training data we have) for fitting the model
X_all_train = df_all_train[features]
y_all_train = df_all_train[target]

# test
X_test = df_test[features]

In [87]:
# you can add many more and 
# you can even define custom preprocessing steps like 'MakeLowerCase()'

# pipeline  uses only numerical features,
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('imputer', KNNImputer(n_neighbors=5, weights='uniform',metric='nan_euclidean')),
    #('imputer', IterativeImputer(missing_values=np.nan, random_state=0, n_nearest_features=None, sample_posterior=True)),
    #('transformation', PowerTransformer(method='yeo-johnson',standardize=False)),
     #Scale features using statistics that are robust to outliers.
    ('scaler', RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25, 75.0)))]) 

# pipeline use only categorical features
categorical_transformer = Pipeline(steps=[ 
    ('lowercase', MakeLowerCase()), # lower cases all columns containing strings
    #('sd_category' ,SsdCategorizer(drop_original_feature=False)),
    #('storage_category', StorageCategorizer(drop_original_feature=True)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# add both preprocessing pipelines in one pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [88]:
# this is how the preporcessed pipeline looks like (just to have an idea)
pd.DataFrame(preprocessor.fit_transform(X_train)).shape

(382, 1)

### Give more weights to higher priced computers

# Step 3: add models to pipeline

Possible to fit multiple target variabels, so you **don't** need to fit a different models for min. price and max. price

### A) Training and parameter tuning

##### 1) Automatic tuning via grid search

In [89]:
# define model: I just add some default parameters but you could
# also just write: RandomForestRegressor() since we will perform a grid search 
# to find good hyperparameter values
model = RandomForestRegressor(random_state=1, criterion='mse')

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model.get_params())

# add to pipeline
pipeline = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model)])


# 1) min price
transformer_target = PowerTransformer(method='yeo-johnson',standardize=False)
scale_target = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(10.0, 90.0))
pipeline_y = Pipeline(memory=None,
              steps=[('transformer', transformer_target)])

pipeline_update = TransformedTargetRegressor(regressor=pipeline, 
                                         transformer=pipeline_y)

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [12]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 4000, num = 20)]

# The number of features to consider when looking for the best split:
# - If “auto”, then max_features=n_features.
# - If “sqrt”, then max_features=sqrt(n_features).
# - If “log2”, then max_features=log2(n_features).
# - If None, then max_features=n_features
max_features = ['auto', 'sqrt','log2',None]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(20, 200, num = 20)]
max_depth.append(None)

# The function to measure the quality of a split
criterion = ['mse','mae']

# Minimum number of samples required to split a node
# - If int, then consider min_samples_split as the minimum number.
# - If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
min_samples_split = [0.001, 0.01, 0.025, 0.05]
# Minimum number of samples required at each leaf node

# - If int, then consider min_samples_leaf as the minimum number.
# - If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.
min_samples_leaf = [0.001, 0.01, 0.025, 0.05,]

# Create the random grid
random_grid = {
   'regressor__regressor__n_estimators': n_estimators,
   'regressor__regressor__max_features': max_features,
   'regressor__regressor__max_depth': max_depth,
   'regressor__regressor__criterion': criterion,
   'regressor__regressor__min_samples_split': min_samples_split,
   'regressor__regressor__min_samples_leaf': min_samples_leaf}

#### Helpful articles:
 - custom scoring metric: https://stackoverflow.com/questions/48468115/how-to-create-a-customized-scoring-function-in-scikit-learn-for-scoring-a-set-of
 - random parameter search: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
you can also use the traditional grid search in pyhton, but I prefer the randomizedSearch
 - Once the optimal parameters are found for the model, you don't need to run this again

In [58]:
# define random search (and narrow down time grid search)
if RUN_GRID_SEARCH:
    random_search = RandomizedSearchCV(
       estimator = pipeline_update, 
       param_distributions = random_grid, n_iter = 10,
       cv = 5, verbose=2, random_state=1, n_jobs = -1, refit=True,
       scoring=make_scorer(custom_scoring_func, greater_is_better=False)
    )


    # run grid search and refit with best hyper parameters
    weights_train_min_p =  weights_samples(df=y_train.iloc[:,0], order=0, plot_weights=False)
    random_search.fit(X_train, y_train)  
    print(random_search.best_params_)    
    print(random_search.best_score_)
    
# have look at the best hyperparameters and their respective performance (maybe also look at the sd)
pd.DataFrame(random_search.cv_results_).sort_values(
    by=['mean_test_score'],ascending=False).head(5)

Sum weights: 1.0


NameError: name 'regressor__sample_weight' is not defined

##### Performance on validation data

##### 2) Manual parameters tuning

In [115]:
# train your final model on all data with best parameters 
model = RandomForestRegressor(
     criterion='mse',
     n_estimators=2710, 
     max_depth=105,
     max_features='log2',
     min_samples_split=0.001,
     min_samples_leaf=0.001,
     bootstrap=False,
     n_jobs=-1,
     random_state=1
)
# add to pipeline
pipeline = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model)],
              verbose=True)

# again add transformer for target variable
pipeline_update = TransformedTargetRegressor(regressor=pipeline, 
                                         transformer=pipeline_y)

# fit final model on all training data we have at hand
weights_train = weights_samples(y_train.iloc[:,1], order=2)
pipeline_update.fit(X_train, y_train)

Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.0s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   4.6s


TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
                           regressor=Pipeline(memory=None,
                                              steps=[('preprocessor',
                                                      ColumnTransformer(n_jobs=None,
                                                                        remainder='drop',
                                                                        sparse_threshold=0.3,
                                                                        transformer_weights=None,
                                                                        transformers=[('num',
                                                                                       Pipeline(memory=None,
                                                                                                steps=[('imputer',
                                                                                                        KNNIm

In [91]:
# performance on data where the model was fit one (should be very low)
# performance on data where the model was fit one (should be very low)
pred_train = pipeline.predict(X_train)

# calculate performance
calculate_perf(y_train, pred_train)

{'minimum price': 0.19519067453172378,
 'maximum price': 0.31723201841345444,
 'total error': 0.5124226929451783}

##### Performance on validation data

In [116]:
# performance on validation data
pred_val = pipeline_update.predict(X_val)

# calculate performance
calculate_perf(y_val, pred_val)

{'minimum price': 114.05916893459869,
 'maximum price': 116.0190068324625,
 'total error': 230.07817576706117}

In [117]:
# save predictions on validation data
# submission format
submission_format_validation = pd.DataFrame.from_dict(
 {'ID':df_val['id'].values,
 'MIN':pred_val[:,0],
 'MAX':pred_val[:,1]}).set_index('ID')

# write to csv
submission_format_validation.to_csv('code\\PieterJan\\python\\output\\validation\\random_forest.csv' ,
                            header=True, index=True, sep=',')

### C) Post processing

In [118]:
fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

# fitted against true predictions minimum price
plot_predictions_results(ax=axs[0], 
                        y_true=y_val.iloc[:,0], 
                        y_pred=pred_val[:,0], 
                        title="RF Min. Price Test Set", 
                        log_scale=True)

# fitted against true predictions maximum price
plot_predictions_results(ax=axs[1], 
                        y_true=y_val.iloc[:,1], 
                        y_pred=pred_val[:,1], 
                        title="RF Max. Price Test Set", 
                        log_scale=True)

# residuals minimum price
plot_residuals(ax=axs[2], 
               y_true=y_val.iloc[:,0], 
               y_pred=pred_val[:,0], 
               title="", 
               log_scale=False,
               order=1)

# residuals maximum price
plot_residuals(ax=axs[3], 
               y_true=y_val.iloc[:,1], 
               y_pred=pred_val[:,1], 
               title="", 
               log_scale=False,
               order=1)
fig.tight_layout()
plt.savefig('code\\PieterJan\\python\\output\\figures\\random_forest\\fig1.png')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [119]:
plt.figure(figsize=(8,4))
plt.plot(pred_val[:,1] - pred_val[:,0], label="Predictions Test", linestyle="--")
plt.plot(y_val.iloc[:,1] - y_val.iloc[:,0], label="Truth Test", linestyle=':')
plt.xlabel("Observations")
plt.ylabel("Max. Price - Min. Price")
plt.legend(loc="upper right");
plt.savefig('code\\PieterJan\\python\\output\\figures\\random_forest\\fig2.png')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [120]:
vip_val = permutation_importance(pipeline_update, X_val, y_val, n_repeats=10,
                                random_state=1, n_jobs=3)

sorted_idx = vip_val.importances_mean.argsort()
fig, ax = plt.subplots()
ax.boxplot(vip_val.importances[sorted_idx].T,
           vert=False, labels=X_val.columns[sorted_idx])
ax.set_title("Permutation Importances (Test set)")
fig.tight_layout()
plt.show()
plt.savefig('code\\PieterJan\\python\\output\\figures\\random_forest\\fig3.png')



Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### D) Predictions test data

Refit on all training data (using the parameters found on the random search) and submit prediction

In [23]:
# train your final model on all data with best parameters 
model_final = RandomForestRegressor(
     criterion='mse',
     n_estimators=2710, 
     max_depth=105,
     max_features='log2',
     min_samples_split=0.001,
     min_samples_leaf=0.001,
     bootstrap=False,
     n_jobs=-1,
     random_state=1
)
# add to pipeline
pipeline_final = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_final)],
              verbose=True)

# again add transformer for target variable
pipeline_final = TransformedTargetRegressor(regressor=pipeline_final, 
                                         transformer=pipeline_y)

# fit final model on all training data we have at hand and give more weight to certain samples
weights_all_train = weights_samples(y_all_train.iloc[:,1], order=4)
pipeline_final = pipeline_final.fit(X_all_train, y_all_train, regressor__sample_weight=weights_all_train)

Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   5.7s


In [24]:
# performance on all data where the model was fit one (should be very low)
calculate_perf(pipeline_final.predict(X_all_train), y_all_train)

{'minimum price': 0.891189468886219,
 'maximum price': 0.9839949863788819,
 'total error': 1.875184455265101}

In [25]:
# make predictions on test data
pred_test = pipeline_final.predict(X_test)

# submission format
submission_format = pd.DataFrame.from_dict(
 {'ID':df_test['id'].values,
 'MIN':pred_test[:,0],
 'MAX':pred_test[:,1]}).set_index('ID')

# write to csv
submission_format.to_csv('code\\PieterJan\\python\\output\\submission\\random_forest.csv' ,
                            header=True, index=True, sep=',')