In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import numpy as np
from sklearn import datasets

In [17]:
# load data
cal = datasets.fetch_california_housing()
data = cal['data']
targets = cal['target']

# Task 1

In [20]:
# 1a
clf = LinearRegression()
scores = cross_val_score(clf, data, targets, cv=5)
print('Linear Regression Scores', scores)


clf = ensemble.GradientBoostingRegressor()
scores = cross_val_score(clf, data, targets, cv=5)
print('Boosting Scores', scores)

Linear Regression [0.54866323 0.46820691 0.55078434 0.53698703 0.66051406]
Boosting [0.60265411 0.69877396 0.7180226  0.65023363 0.67975317]


In [35]:
# 1b
tuned_parameters = [{'max_depth': [3, 10],
                     'n_estimators': [50, 100],
                     'learning_rate': [0.01, 0.1]}]
clf = ensemble.GradientBoostingRegressor()
clf = GridSearchCV(clf, tuned_parameters)
clf.fit(data, targets)

print("Scores for parameter grid search:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))



Scores for parameter grid search:

0.334 (+/-0.055) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.476 (+/-0.060) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
0.408 (+/-0.081) for {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 50}
0.562 (+/-0.098) for {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100}
0.644 (+/-0.082) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
0.680 (+/-0.051) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.646 (+/-0.112) for {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 50}
0.648 (+/-0.108) for {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100}


1c) Briefly discuss the performance and summarize your findings.





# Part 2

In [36]:
new_targets = np.array([x>2 for x in targets])

In [46]:
# 2a
clf = LogisticRegression()
scores = cross_val_score(clf, data, new_targets, cv=5)
print('Logistic Regression Scores', scores)

clf = ensemble.GradientBoostingClassifier()
scores = cross_val_score(clf, data, new_targets, cv=5)
print('Boosting Classifier Scores', scores)



Logistic Regression Scores [0.80988133 0.79796512 0.77616279 0.74612403 0.82481221]
Boosting Classifier Scores [0.79099055 0.75436047 0.80741279 0.75339147 0.82650836]


In [None]:
# 2b
tuned_parameters = [{'max_depth': [3, 5],
                     'n_estimators': [100, 200],
                     'learning_rate': [0.1, 0.5]}]
clf = ensemble.GradientBoostingClassifier()
clf = GridSearchCV(clf, tuned_parameters)
clf.fit(data, new_targets)

print("Scores for parameter grid search:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))



In [69]:
# 2c
clf = LogisticRegression()
scores = cross_val_score(clf, data, new_targets, cv=5, scoring='roc_auc')
print('Logistic Regression ROC AUC scores', scores)


clf = ensemble.GradientBoostingClassifier()
scores = cross_val_score(clf, data, new_targets, cv=5, scoring='roc_auc')
print('Boosting Classifier ROC AUC scores', scores)



Logistic Regression ROC AUC scores [0.88418515 0.88251405 0.86366318 0.8193502  0.90110991]
Boosting Classifier ROC AUC scores [0.88485119 0.84740781 0.90563496 0.89795923 0.91313523]


2d)
