In [1]:
# auto reload libraries (you do need to re-import libraries if you make changes)
%load_ext autoreload
%autoreload 2

# base 
import pandas as pd
import numpy as np
from pprint import pprint
import os

# preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import TruncatedSVD

# models
from sklearn.ensemble import GradientBoostingRegressor

# own defined functions/classes 
from preprocessing.preprocess_pipe import MakeLowerCase
from preprocessing.preprocess_pipe import HdResolutionCategorizer
from preprocessing.preprocess_pipe import StorageCategorizer
from preprocessing.preprocess_pipe import SsdCategorizer

from preprocessing.preprocess_pipe import print_missing
from preprocessing.preprocess_pipe import calculate_perf
from preprocessing.preprocess_pipe import custom_scoring_func_single_p

from modelling.weight_samples import weights_samples

In [2]:
# global parameters
pd.set_option('display.max_columns', 500)

# set working directory
uppath = lambda _path, n: os.sep.join(_path.split(os.sep)[:-n])
__file__ = 'C:\\Users\\Pieter-Jan\\Documents\\KuLeuven\\Semester2\\AA\\AAForB_Assignment2\\code\\PieterJan'
out = uppath(__file__, 2)
os.chdir(out)
print(os.getcwd())

C:\Users\Pieter-Jan\Documents\KuLeuven\Semester2\AA\AAForB_Assignment2


# Step 1: import data and split in train and validation set
The validation set is more our own kind of test set

In [3]:
# read in trainig and validation data
# use the same data split as we did in R
df_all_train = pd.read_csv("data\\train.csv", sep=',')
df_test = pd.read_csv("data\\test.csv", sep=',')

print(f'Dimensions of all training data {df_all_train.shape}')
print(f'Dimension test data {df_test.shape}')

Dimensions of all training data (510, 22)
Dimension test data (222, 20)


In [4]:
# split in training and validation set
df_train = df_all_train.sample(frac=0.75, random_state=0, replace=False)
df_val = df_all_train.drop(df_train.index)

# reset index, if you don't resit missing rows get inserted in the pipeline
# see: https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
df_train = df_train.reset_index().drop('index',axis=1)
df_val = df_val.reset_index().drop('index',axis=1)

# Step 2: define pipeline

In [5]:
#Numerical features to pass down the numerical pipeline 
numerical_features = ['screen_size' ,'pixels_x','pixels_y',
                      'ram', 'weight','ssd','storage']

#Categrical features to pass down the categorical pipeline 
categorical_features = ['brand','base_name', 'screen_surface','touchscreen',
                        'cpu','cpu_details','detachable_keyboard',
                        'discrete_gpu','gpu', 'os','os_details']

# define all unique features
features = np.unique(numerical_features).tolist() + np.unique(categorical_features).tolist()

# target variables
target = ['min_price','max_price']

In [6]:
# train
X_train = df_train[features]
y_train = df_train[target]

# validation (this is kind our own test set)
X_val  = df_val[features]
y_val = df_val[target]

# train_validation (this is all training data we have) for fitting the model
X_all_train = df_all_train[features]
y_all_train = df_all_train[target]

# test
X_test = df_test[features]

In [7]:
# you can add many more and 
# you can even define custom preprocessing steps like 'MakeLowerCase()'

# pipeline  uses only numerical features,
numeric_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    #('imputer', KNNImputer(n_neighbors=5, weights='uniform',metric='nan_euclidean')),
    ('imputer', IterativeImputer(missing_values=np.nan, random_state=0, n_nearest_features=None, sample_posterior=True)),
    #('transformation', PowerTransformer(method='yeo-johnson',standardize=False)),
     #Scale features using statistics that are robust to outliers.
    ('scaler', RobustScaler(with_centering=True, with_scaling=True, quantile_range=(10.0, 90.0)))]) 

# pipeline use only categorical features
categorical_transformer = Pipeline(steps=[ 
    ('lowercase', MakeLowerCase()), # lower cases all columns containing strings
    #('sd_category' ,SsdCategorizer(drop_original_feature=True)),
    #('storage_category', StorageCategorizer(drop_original_feature=True)),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# add both preprocessing pipelines in one pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [8]:
# this is how the preporcessed pipeline looks like (just to have an idea)
pd.DataFrame(preprocessor.fit_transform(X_train)).shape

(382, 1)

# Step 3: add models to pipeline

Possible to fit multiple target variabels, so you **don't** need to fit a different models for min. price and max. price

### A) Training and parameter tuning

##### 1) Automatic tuning via grid search

I will only **do the tuning for the minimum price** and use the found parameters also for the maximum price

In [9]:
# define model: I just add some default parameters but you could
# also just write: RandomForestRegressor() since we will perform a grid search 
# to find good hyperparameter values
model_min_p = GradientBoostingRegressor(random_state=1)
model_max_p = GradientBoostingRegressor(random_state=1)

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model_min_p.get_params())

# add to pipeline
# 1) min price
pipeline_min_p = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_min_p)])


# add transformation on the target variable, by default power transformation 
# also performs standardization after performing the power transformation
# and back transform to the original space when outputting predictions 

# 1) min price
transformer_target = PowerTransformer(method='yeo-johnson',standardize=True)
pipeline_min_p_update = TransformedTargetRegressor(regressor=pipeline_min_p, 
                                         transformer=transformer_target)

Parameters currently in use:

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 1,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [19]:
# loss function
loss_function = ['ls', 'lad']

learning_rate = [0.1, 0.01, 0.005, 0.001]

# The number of boosting stages to perform. 
# Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.
n_estimators = [int(x) for x in np.linspace(start = 2000, stop = 5000, num = 10)]

# The number of features to consider when looking for the best split:
# - If “auto”, then max_features=n_features.
# - If “sqrt”, then max_features=sqrt(n_features).
# - If “log2”, then max_features=log2(n_features).
# - If None, then max_features=n_features
max_features = ['auto', 'sqrt','log2']

# maximum depth of the individual regression estimators
max_depth = [int(x) for x in np.linspace(20, 80, num = 10)]

n_iter_no_change = [10,20,50]

# The function to measure the quality of a split
criterion = ['friedman_mse']

# Minimum number of samples required to split a node
# - If int, then consider min_samples_split as the minimum number.
# - If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
min_samples_split = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7 , 0.9]
# Minimum number of samples required at each leaf node

# - If int, then consider min_samples_leaf as the minimum number.
# - If float, then min_samples_leaf is a fraction and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.
min_samples_leaf = [0.001, 0.01, 0.1, 0.3, 0.5]

# Create the random grid
random_grid = {
   'regressor__regressor__n_estimators': n_estimators,
   'regressor__regressor__learning_rate': learning_rate,
   'regressor__regressor__max_features': max_features,
   'regressor__regressor__max_depth': max_depth,
   'regressor__regressor__criterion': criterion,
   'regressor__regressor__min_samples_split': min_samples_split,
   'regressor__regressor__min_samples_leaf': min_samples_leaf,
   'regressor__regressor__n_iter_no_change': n_iter_no_change,
   'regressor__regressor__loss': loss_function 
}

In [51]:
# define random search (and narrow down time grid search)
min_p_random_search = RandomizedSearchCV(
   estimator = pipeline_min_p_update, 
   param_distributions = random_grid, n_iter = 20,
   cv = 5, verbose=2, random_state=1, n_jobs = -1, refit=True,
   scoring=make_scorer(custom_scoring_func_single_p, greater_is_better=False)
)

# run grid search and refit with best hyper parameters
weights_train_min_p =  weights_samples(df=y_train.iloc[:,0], order=0, plot_weights=False)
min_p_random_search.fit(X_train, y_train.iloc[:,0])  
print(min_p_random_search.best_params_)    
print(min_p_random_search.best_score_)

Sum weights: 1.0
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.5min finished


[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.4s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=  50.5s
{'regressor__regressor__n_iter_no_change': 50, 'regressor__regressor__n_estimators': 5000, 'regressor__regressor__min_samples_split': 0.01, 'regressor__regressor__min_samples_leaf': 0.001, 'regressor__regressor__max_features': 'log2', 'regressor__regressor__max_depth': 46, 'regressor__regressor__loss': 'lad', 'regressor__regressor__learning_rate': 0.01, 'regressor__regressor__criterion': 'friedman_mse'}
-160.0080915995132


In [52]:
# have look at the best hyperparameters and their respective performance (maybe also look at the sd)
pd.DataFrame(min_p_random_search.cv_results_).sort_values(
    by=['mean_test_score'],ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__regressor__n_iter_no_change,param_regressor__regressor__n_estimators,param_regressor__regressor__min_samples_split,param_regressor__regressor__min_samples_leaf,param_regressor__regressor__max_features,param_regressor__regressor__max_depth,param_regressor__regressor__loss,param_regressor__regressor__learning_rate,param_regressor__regressor__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,42.46807,15.166462,0.097424,0.034483,50,5000,0.01,0.001,log2,46,lad,0.01,friedman_mse,"{'regressor__regressor__n_iter_no_change': 50,...",-184.336493,-144.072509,-181.548667,-167.570625,-122.512164,-160.008092,23.549721,1
16,5.357716,1.781079,0.047893,0.012286,20,4333,0.5,0.01,auto,40,lad,0.005,friedman_mse,"{'regressor__regressor__n_iter_no_change': 20,...",-201.646441,-164.601983,-174.201127,-167.351437,-146.89578,-170.939354,17.802771,2
11,3.945026,0.843686,0.05548,0.024793,20,2000,0.1,0.1,auto,73,ls,0.005,friedman_mse,"{'regressor__regressor__n_iter_no_change': 20,...",-189.116588,-153.472549,-208.643365,-173.647745,-137.795585,-172.535166,25.100619,3
13,5.249522,0.672106,0.050315,0.011809,20,3333,0.001,0.01,auto,46,ls,0.005,friedman_mse,"{'regressor__regressor__n_iter_no_change': 20,...",-230.135074,-135.010345,-202.963229,-183.595405,-130.531215,-176.447054,38.63127,4
9,1.515796,0.355725,0.051715,0.017439,50,4000,0.001,0.1,log2,60,ls,0.1,friedman_mse,"{'regressor__regressor__n_iter_no_change': 50,...",-232.980835,-176.754541,-208.912433,-181.249874,-143.960496,-188.771636,30.229557,5


##### 2) Manual parameters tuning

In [53]:
# train your final model on all data with best parameters 

# 1) min price
model_min_p = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=5000,
     learning_rate=0.005,
     loss = 'lad',
     max_depth=56,
     max_features='log2',
     n_iter_no_change = 46,
     min_samples_split=0.001,
     min_samples_leaf=0.01,
     random_state=0
)

# 2)  min price
model_max_p = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=2000, 
     learning_rate=0.005,
     loss = 'lad',
     max_depth=56,
     max_features='log2',
     n_iter_no_change = 46,
     min_samples_split=0.001,
     min_samples_leaf=0.01,
     random_state=0
)


# add to pipeline

# 1) min price
pipeline_min_p = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_min_p)],
              verbose=True)

# 2) min price
pipeline_max_p = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_max_p)],
              verbose=True)

# again add transformer for target variable

# 1) min price
pipeline_min_p_update = TransformedTargetRegressor(regressor=pipeline_min_p, 
                                         transformer=transformer_target)
# 2) max price
pipeline_max_p_update = TransformedTargetRegressor(regressor=pipeline_max_p, 
                                         transformer=transformer_target)

# fit final model on all training data we have at hand

# 1) min price
weights_train_min_p = weights_samples(y_train.iloc[:,0], order=0)
pipeline_min_p = pipeline_min_p_update.fit(X_train, y_train.iloc[:,0])

# 2) max price
weights_train_max_p = weights_samples(y_train.iloc[:,1], order=0)
pipeline_max_p = pipeline_max_p_update.fit(X_train, y_train.iloc[:,1])

Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.5s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=  15.6s
Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.5s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   6.8s


In [54]:
# performance on data where the model was fit one (should be very low)
pred_train_min_p = pipeline_min_p.predict(X_train)
pred_train_max_p = pipeline_max_p.predict(X_train)

# calculate performance
pred_train = pd.DataFrame([pred_train_min_p,pred_train_max_p]).T
calculate_perf(y_train, pred_train)

{'minimum price': 125.62047511214338,
 'maximum price': 151.68247344955546,
 'total error': 277.3029485616988}

##### Performance on validation data

In [55]:
# performance on validation data
pred_val_min_p = pipeline_min_p.predict(X_val)
pred_val_max_p = pipeline_max_p.predict(X_val)

# calculate performance 
pred_val = pd.DataFrame([pred_val_min_p,pred_val_max_p]).T
calculate_perf(y_val, pred_val)

{'minimum price': 126.50532149437015,
 'maximum price': 131.12928454823998,
 'total error': 257.63460604261013}

### C) Post processing

 - inspect predictions/residuals (make visualisations) (See Bram)
 - feature importance (see Bram)

### D) Predictions test data

In [48]:
# train your final model on all data with best parameters 

# 1) min price
model_min_p_final = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=2000,
     learning_rate=0.005,
     loss = 'lad',
     max_depth=56,
     max_features='sqrt',
     n_iter_no_change = 50,
     min_samples_split=0.3,
     min_samples_leaf=0.01,
     random_state=0
)

# 2)  min price
model_max_p_final = GradientBoostingRegressor(
     criterion='friedman_mse',
     n_estimators=2000, 
     learning_rate=0.005,
     loss = 'lad',
     max_depth=56,
     max_features='sqrt',
     n_iter_no_change = 50,
     min_samples_split=0.3,
     min_samples_leaf=0.01,
     random_state=0
)


# add to pipeline

# 1) min price
pipeline_min_p_final = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_min_p_final)],
              verbose=True)

# 2) min price
pipeline_max_p_final = Pipeline(memory=None,
              steps=[('preprocessor', preprocessor),
                     ('regressor', model_max_p_final)],
              verbose=True)

# again add transformer for target variable

# 1) min price
pipeline_min_p = TransformedTargetRegressor(regressor=pipeline_min_p_final, 
                                         transformer=transformer_target)
# 2) max price
pipeline_max_p = TransformedTargetRegressor(regressor=pipeline_max_p_final, 
                                         transformer=transformer_target)

# fit final model on all training data we have at hand

# 1) min price
weights_all_train_min_p = weights_samples(y_all_train.iloc[:,0], order=2)
pipeline_min_p_final = pipeline_min_p.fit(X_all_train, y_all_train.iloc[:,0])

# 2) max price
weights_all_train_max_p = weights_samples(y_all_train.iloc[:,1], order=2)
pipeline_max_p_final = pipeline_max_p.fit(X_all_train, y_all_train.iloc[:,1])

Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.2s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   3.8s
Sum weights: 1.0
[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.1s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   3.7s


In [49]:
# performance on all data where the model was fit one (should be very low)
pred_all_train_min_p = pipeline_min_p_final.predict(X_all_train)
pred_all_train_max_p = pipeline_max_p_final.predict(X_all_train)

# calculate performance
pred_all_train = pd.DataFrame([pred_all_train_min_p, pred_all_train_max_p]).T
calculate_perf(y_all_train, pred_all_train)

{'minimum price': 121.02430493986304,
 'maximum price': 126.12226313119426,
 'total error': 247.1465680710573}

In [50]:
# make predictions on test data
pred_test_min_p = pipeline_min_p_final.predict(X_test)
pred_test_max_p = pipeline_max_p_final.predict(X_test)

# submission format
submission_format = pd.DataFrame.from_dict(
 {'ID':df_test['id'].values,
 'MIN':pred_test_min_p,
 'MAX':pred_test_max_p}).set_index('ID')

# write to csv
submission_format.to_csv('code\\PieterJan\\python\\output\\submission\\boosting_python.csv' ,
                            header=True, index=True, sep=',')