# Big Data Intelligence: Methods and Technologies
# Assignment 2: Hyper-Parameter Tuning
## Authors:  Liana Mehrabyan and Elsa Scola Martín
### Objective:
Given a dataset from the Kaggle competition ”House Prices: Advanced Regression Techniques” the goal of this assignment is to apply different algorithms and optimization techniques to predict House Prices based on the attributes.

### What is done in the Notebook: 
- Load the data and standardize it.
- Evaluate DecisionTreeRegressor and KNeighborsRegressor with default paramters.
- Random Search for DecisionTreeRegressor hyper parameter tuning.
- Random search for KNeighbors hyper parameter tuning.
- Apply Scikit-optimize.
- Best Model and Competition Submission.


# Import the libraries

In [1]:
import pandas as pd
from sklearn import preprocessing, tree, metrics
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, KFold,RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import sem 
from sklearn.decomposition import PCA
import time

# Data Loading, Split and Standardization

In [21]:
data = pd.read_csv("kaggleCompetition.csv") 
data = data.values
X = data[0:1460,:-1]
y = data[0:1460,-1]
X_comp = data[1460:,:-1]
y_comp = data[1460:,-1]
scaler = preprocessing.StandardScaler().fit(X) 
X = scaler.transform(X)
X_comp = scaler.transform(X_comp)
# random_state=0 for reproducibility purposes
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=0)
print(X_train.shape, y_train.shape)

(978, 79) (978,)


# 3.2 Evaluation with Default Hyper Parameters

## DecisionTreeRegressor 
### With Holdout

In [3]:
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print('Training Error:', metrics.mean_squared_error(y_train, y_train_pred))
print('Validation Error:', metrics.mean_squared_error(y_test, y_test_pred))

Training Error: 2.2001910189371395e-10
Validation Error: 0.043841432285095856


### With 2-fold cross-validation

In [4]:
np.random.seed(0)
cv = KFold(n_splits=2, shuffle=True, random_state=0)
scores = -cross_val_score(tree.DecisionTreeRegressor(), 
                         X, y, 
                         scoring='neg_mean_squared_error', 
                         cv = cv)              

print(scores)
print("Mean score: {0:.3f} (+/-{1:.3f})".format(scores.mean(), sem(scores)))

[0.04920897 0.04746221]
Mean score: 0.048 (+/-0.001)


## KNeighborsRegressor
### With Holdout

In [5]:
neigh = KNeighborsRegressor()
neigh.fit(X_train, y_train) 
y_train_pred = neigh.predict(X_train)
y_test_pred = neigh.predict(X_test)
print('Training Error:', metrics.mean_squared_error(y_train, y_train_pred))
print('Validation Error:', metrics.mean_squared_error(y_test, y_test_pred))

Training Error: 0.025037524793595778
Validation Error: 0.030053956149153162


### With 2-fold cross-validation

In [6]:
scores = -cross_val_score(KNeighborsRegressor(), 
                         X, y, 
                         scoring='neg_mean_squared_error', 
                         cv = cv)              
print(scores)
print("Mean score: {0:.3f} (+/-{1:.3f})".format(scores.mean(), sem(scores)))

[0.0350818  0.03709147]
Mean score: 0.036 (+/-0.001)


# 3.3 Random Search for DecisionTreeRegressor hyper parameter tuning
- **min_samples_split**
- **criterion**

In [7]:
start = time.time()

cv_grid = KFold(n_splits=2, shuffle=True, random_state=0)
param_grid = {'min_samples_split': list(np.linspace(0.1,1,10)),
              'criterion': list(['mse','friedman_mse'])}
budget = 20
np.random.seed(0)
clf = RandomizedSearchCV(tree.DecisionTreeRegressor(), 
                         param_grid,
                         scoring='neg_mean_squared_error',
                         cv=cv_grid, 
                         n_jobs=1, verbose=1,
                         n_iter=budget
                        )
clf.fit(X=X_train, y=y_train)
y_test_pred = clf.predict(X_test)
print(metrics.mean_squared_error(y_test, y_test_pred))
print('Best Parameters \n')
clf.best_params_, -clf.best_score_

end = time.time()
print("Execution time: "+str(end - start))


Fitting 2 folds for each of 20 candidates, totalling 40 fits
0.03750209460443526
Best Parameters 

Execution time: 0.1253671646118164


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished


# 3.4 Random search for KNeighbors hyper parameter tuning
- **n_neigbors**
- **weights**
- **p**


In [8]:
start = time.time()


cv_grid = KFold(n_splits=2, shuffle=True, random_state=0)

param_grid = {'n_neighbors': list(range(2,16,2)),
              'weights': list(['uniform','distance']),
              'p': list([1,2])
              
             }
budget = 20
np.random.seed(0)
clf = RandomizedSearchCV(KNeighborsRegressor(), 
                         param_grid,
                         scoring='neg_mean_squared_error',
                         cv=cv_grid, 
                         n_jobs=1, verbose=1,
                         n_iter=budget
                        )

clf.fit(X=X_train, y=y_train)
y_test_pred = clf.predict(X_test)
print(metrics.mean_squared_error(y_test, y_test_pred))
clf.best_params_, -clf.best_score_

end = time.time()
print("Execution time: "+str(end - start))


Fitting 2 folds for each of 20 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.024676127896818324
Execution time: 1.2477571964263916


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    1.1s finished


In [9]:
y_subm = clf.predict(X_comp)
subm= np.exp(y_subm)
subm=pd.DataFrame({'id': np.arange(1461,1461+subm.shape[0]), 'SalePrice':subm})
subm.to_csv('submission_knn.csv', index=False)

# 3.5 Scikit-optimize 

## DecisionTreeRegressor()

In [10]:
start = time.time()

from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from scipy.stats import uniform, expon
from scipy.stats import randint as sp_randint


param_grid = {'min_samples_split': (0.1,1),
              'criterion': list(['mse','friedman_mse'])}
budget = 20
np.random.seed(0)

clf = BayesSearchCV(tree.DecisionTreeRegressor(), 
                    param_grid,
                    scoring='neg_mean_squared_error',
                    cv=3,    
                    n_jobs=1, verbose=1,
                    n_iter=budget
                    )
clf.fit(X=X_train, y=y_train)

y_test_pred = clf.predict(X_test)
print(metrics.mean_squared_error(y_test, y_test_pred))
print('Best Parameters \n')
print(clf.best_params_, -clf.best_score_)



end = time.time()
print("Execution time: "+str(end - start))



Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


0.037502094604435175
Best Parameters 

{'criterion': 'mse', 'min_samples_split': 0.1} 0.04668401064043889
Execution time: 7.077333211898804


## KNeighborsRegressor()

In [11]:
start = time.time()

cv_grid = KFold(n_splits=2, shuffle=True, random_state=0)

param_grid = {'n_neighbors': list(range(2,16,2)),
              'weights': list(['uniform','distance']),
              'p': list([1,2])
              
             }
budget = 20
np.random.seed(0)
clf = BayesSearchCV(KNeighborsRegressor(), 
                    param_grid,
                    scoring='neg_mean_squared_error',
                    cv=3,    
                    n_jobs=1, verbose=1,
                    n_iter=budget
                    )


clf.fit(X=X_train, y=y_train)
y_test_pred = clf.predict(X_test)
print(metrics.mean_squared_error(y_test, y_test_pred))
print(clf.best_params_, -clf.best_score_)



end = time.time()
print("Execution time: "+str(end - start))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


0.024897561454502615
{'n_neighbors': 6, 'p': 1, 'weights': 'distance'} 0.03125142327559783
Execution time: 18.76736044883728


# 3.6 Best Model and Competition Submission
The best result for the algorithms proposed by the assignment were achieved by KNN Regressor Hyperparameter Tuning and ´submission_knn.csv´ file with predictions was generated in section 3.4.

# Model Improvement

## Tuning other parameters of DecisionTreeRegressor()

In [12]:
cv_grid = KFold(n_splits=2, shuffle=True, random_state=0)
param_grid = {'min_samples_split': list(np.linspace(0.1,1,10)),
              'criterion': list(['mse','friedman_mse']),
              'max_depth': [5,10,15,20,30,40,50,60,70]
             }
budget = 20
np.random.seed(0)
clf = RandomizedSearchCV(tree.DecisionTreeRegressor(), 
                         param_grid,
                         scoring='neg_mean_squared_error',
                         cv=cv_grid, 
                         n_jobs=1, verbose=1,
                         n_iter=budget
                        )
clf.fit(X=X_train, y=y_train)
y_test_pred = clf.predict(X_test)
print(metrics.mean_squared_error(y_test, y_test_pred))
print('Best Parameters \n')
clf.best_params_, -clf.best_score_

Fitting 2 folds for each of 20 candidates, totalling 40 fits
0.03750209460443533
Best Parameters 



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished


({'min_samples_split': 0.1, 'max_depth': 60, 'criterion': 'friedman_mse'},
 0.04528607308130875)

In [13]:
y_subm = clf.predict(X_comp)
subm= np.exp(y_subm)
subm=pd.DataFrame({'id': np.arange(1461,1461+subm.shape[0]), 'SalePrice':subm})
subm.to_csv('submission_max_depth.csv', index=False)

## Dimensionality Reduction: PCA

In [14]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
X_comp_pca = pca.transform(X_comp)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y,test_size=0.33, random_state=0)

In [15]:
pca.explained_variance_ratio_

array([0.13117973, 0.05127897])

In [16]:
neigh = KNeighborsRegressor(weights='distance', p=1,n_neighbors=8)
neigh.fit(X_train, y_train) 
y_train_pred = neigh.predict(X_train)
y_test_pred = neigh.predict(X_test)
print('Training Error:', metrics.mean_squared_error(y_train, y_train_pred))
print('Validation Error:', metrics.mean_squared_error(y_test, y_test_pred))
pred = neigh.predict(X_comp_pca)

Training Error: 0.0
Validation Error: 0.03087807394682239


In [17]:
y_subm = neigh.predict(X_comp_pca)
subm= np.exp(y_subm)
subm=pd.DataFrame({'id': np.arange(1461,1461+subm.shape[0]), 'SalePrice':subm})
subm.to_csv('submission_pca.csv', index=False)

## Gradient Boost

In [22]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error


params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print('Training Error:', metrics.mean_squared_error(y_train, y_train_pred))
print('Validation Error:', metrics.mean_squared_error(y_test, y_test_pred))

Training Error: 0.004859807148711824
Validation Error: 0.015365551677876017


In [23]:
y_subm = clf.predict(X_comp)
subm= np.exp(y_subm)
subm=pd.DataFrame({'id': np.arange(1461,1461+subm.shape[0]), 'SalePrice':subm})
subm.to_csv('submission_GB.csv', index=False)