In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

## Day 35 Lecture 1 Assignment

In this assignment, we will learn about gradient boosting. We will use a dataset describing survival rates after breast cancer surgery loaded below and analyze the model generated for this dataset.

In [2]:
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
)
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# p much in practice:
# *if you want to use GradientBoostingClassifier
#     * use XGBClassifier instead
# *if you want to use GradientBoostingRegressor
#     * use XGBRegressor instead
from xgboost import XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
# Attributes:
# Age of patient at time of operation (numerical)
# Patient's year of operation (year - 1900, numerical)
# Number of positive axillary nodes detected (numerical)
# Survival status (class attribute)
#  -- 1 = the patient survived 5 years or longer
#  -- 2 = the patient died within 5 year

cols = ["age", "op_year", "nodes", "survival"]
cancer = pd.read_csv(
    "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/haberman.data",
    names=cols,
)

<IPython.core.display.Javascript object>

In [4]:
cancer.head()

Unnamed: 0,age,op_year,nodes,survival
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


<IPython.core.display.Javascript object>

Check for missing data and remove all rows containing missing data

In [5]:
# answer below:
cancer.isna().mean()


age         0.0
op_year     0.0
nodes       0.0
survival    0.0
dtype: float64

<IPython.core.display.Javascript object>

In [6]:
cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       306 non-null    int64
 1   op_year   306 non-null    int64
 2   nodes     306 non-null    int64
 3   survival  306 non-null    int64
dtypes: int64(4)
memory usage: 9.7 KB


<IPython.core.display.Javascript object>

Adjust the target variable so that it has values of either 0 or 1

In [8]:
cancer["survival"] = cancer["survival"] - 1

<IPython.core.display.Javascript object>

In [15]:
# answer below:
cancer['survival'].value_counts(normalize=True)


0    0.735294
1    0.264706
Name: survival, dtype: float64

<IPython.core.display.Javascript object>

Split the data into train and test (20% in test)

In [10]:
X = cancer.drop("survival", 1)
y = cancer["survival"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y
)

<IPython.core.display.Javascript object>

Create a gradient boosted classification algorithm with a learning rate of 0.01 and max depth of 5. Report the accuracy.

In [11]:
model = XGBClassifier(learning_rate=0.1, max_depth=5)
model.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

<IPython.core.display.Javascript object>

In [None]:
model.s

Print the confusion matrix for the test data. What do you notice about our predictions?

In [14]:
# answer below:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}\n")

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Train score: 0.9180327868852459
Test score: 0.6290322580645161

[[34 12]
 [11  5]]
              precision    recall  f1-score   support

           0       0.76      0.74      0.75        46
           1       0.29      0.31      0.30        16

    accuracy                           0.63        62
   macro avg       0.52      0.53      0.53        62
weighted avg       0.64      0.63      0.63        62



<IPython.core.display.Javascript object>

In [16]:

cancer['survival'].value_counts(normalize=True)

0    0.735294
1    0.264706
Name: survival, dtype: float64

<IPython.core.display.Javascript object>

Print the confusion matrix for a learning rate of 1 and a learning rate of 0.5. What do you see now that stands out to you in the confusion matrix?

In [17]:
model = XGBClassifier(learning_rate=1, max_depth=5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

<IPython.core.display.Javascript object>

In [18]:
# answer below:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}\n")

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Train score: 0.9795081967213115
Test score: 0.6129032258064516

[[33 13]
 [11  5]]
              precision    recall  f1-score   support

           0       0.75      0.72      0.73        46
           1       0.28      0.31      0.29        16

    accuracy                           0.61        62
   macro avg       0.51      0.51      0.51        62
weighted avg       0.63      0.61      0.62        62



<IPython.core.display.Javascript object>

In [19]:
model = XGBClassifier(learning_rate=0.5, max_depth=5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

<IPython.core.display.Javascript object>

In [20]:
# answer below:
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}\n")

y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Train score: 0.9836065573770492
Test score: 0.5806451612903226

[[32 14]
 [12  4]]
              precision    recall  f1-score   support

           0       0.73      0.70      0.71        46
           1       0.22      0.25      0.24        16

    accuracy                           0.58        62
   macro avg       0.47      0.47      0.47        62
weighted avg       0.60      0.58      0.59        62



<IPython.core.display.Javascript object>

Perform a grid search for the optimal learning rate. Instead of accuracy, use a metric that will help your model predict the positive class.

In [27]:
# answer below:
n_trees = 100
grid = {
    "learning_rate": [2 / n_trees, 4 / n_trees, 6 / n_trees, 8 / n_trees],
    "max_depth": [3, 5],
}

# learning_rate = 2 / n_trees

model_cv = GridSearchCV(
    XGBClassifier(n_estimators=n_trees),
    grid,
    verbose=1,
    n_jobs=-1,
    scoring=make_scorer(fbeta_score, beta=2),
)
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.1s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, 

<IPython.core.display.Javascript object>

In [28]:
# answer below:
train_score = model_cv.score(X_train, y_train)
test_score = model_cv.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}\n")

y_pred = model_cv.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Train score: 0.46075085324232085
Test score: 0.4605263157894737

[[41  5]
 [ 9  7]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.85        46
           1       0.58      0.44      0.50        16

    accuracy                           0.77        62
   macro avg       0.70      0.66      0.68        62
weighted avg       0.76      0.77      0.76        62



<IPython.core.display.Javascript object>

List the feature importances for the model with the optimal learning rate.

In [29]:
# answer below:
model_cv.best_params_


{'learning_rate': 0.06, 'max_depth': 3}

<IPython.core.display.Javascript object>