Data importing and setup


In [1]:

'''imports and setting up our plot as well as dataset setup'''
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn import model_selection

# Plot appearance shared by every figure in the notebook.
plt.style.use('ggplot')
rcParams.update({'figure.figsize': (12, 6)})

# Load the raw table and swap the '?' placeholders for a numeric sentinel
# so that every cell holds a value the model can consume.
dataset = pd.read_csv("breast_cancer.csv").replace('?', -99999)

# Feature columns: everything except the label ("Class") and the row
# identifier ("ID").
columns = [name for name in dataset.columns.tolist()
           if name not in ("Class", "ID")]
# Name of the column we want to predict.
target = 'Class'

X = dataset[columns]
# Re-code the label values 2 -> 0 and 4 -> 1.
y = dataset[target].map({2: 0, 4: 1})

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.7, random_state=0)

In [2]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosting model; each hyper-parameter is annotated inline.
gradient_boosting_model = GradientBoostingClassifier(
    learning_rate=0.05,    # contribution of each tree; lower values need more trees
    n_estimators=72,       # number of sequential trees; balance against learning_rate
    max_depth=5,           # depth limit per tree; deeper trees over-fit more easily
    min_samples_split=50,  # samples a node must hold before it may be split
    min_samples_leaf=5,    # samples every leaf must keep
    subsample=0.8,         # fraction of rows drawn for each tree
    random_state=10,       # fixed seed so repeated runs give the same result
    max_features=3,        # features examined when searching for the best split
    warm_start=True,       # allow adding trees to an already fitted model
)

# Metrics reported for both the training folds and the validation folds.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation over the full feature matrix and label vector.
results = model_selection.cross_validate(
    gradient_boosting_model, X, y,
    scoring=scoring, cv=5, return_train_score=True)

# Emit one "Mean <split> <metric>:  value" line per split/metric pair in a
# single print call. Accuracy is shown as a percentage, the other metrics as
# fractions.
print(*(
    token
    for prefix, split_label in (('train', 'Training'), ('test', 'Validation'))
    for metric, metric_label, factor in (
        ('accuracy', 'Accuracy', 100),
        ('precision', 'Precision', 1),
        ('recall', 'Recall', 1),
        ('f1', 'F1 Score', 1),
    )
    for token in (
        "%sMean %s %s: " % (
            "" if (prefix, metric) == ('train', 'accuracy') else "\n",
            split_label, metric_label),
        results['%s_%s' % (prefix, metric)].mean() * factor,
    )
))
    


Mean Training Accuracy:  98.31919243547149 
Mean Training Precision:  0.9609596423109823 
Mean Training Recall:  0.9916990500863558 
Mean Training F1 Score:  0.9760415668173797 
Mean Validation Accuracy:  95.42651593011306 
Mean Validation Precision:  0.9425372866127584 
Mean Validation Recall:  0.9252551020408163 
Mean Validation F1 Score:  0.9323327327819282


Function to test the separate parameters and their results on the train time and precision

In [None]:
def plot_results(model, param = 'n_estimators', name = 'Num Trees',
                 score_label = None, score_ylim = None, time_ylim = None):
    """Plot mean CV score and mean fit time against one searched parameter.

    Draws a two-panel figure: the left panel shows ``mean_test_score`` and
    the right panel shows ``mean_fit_time``, both taken from
    ``model.cv_results_`` and plotted against the values tried for ``param``.

    Parameters
    ----------
    model : fitted search object exposing ``cv_results_`` (e.g. GridSearchCV)
    param : str
        Name of the searched parameter; its values are read from
        ``cv_results_['param_<param>']``.
    name : str
        Human-readable parameter name used for axis labels and titles.
    score_label : str, optional
        Y-axis label of the score panel. Defaults to ``model.scoring`` when
        that is a string (e.g. 'roc_auc'), otherwise 'Mean CV Score'.
    score_ylim, time_ylim : (low, high), optional
        Explicit y-limits for the score / fit-time panel. ``None`` lets
        matplotlib autoscale, so the curve is always visible regardless of
        the metric's range or how long the fits take.

    Raises
    ------
    KeyError
        If ``param`` was not part of the search.
    """
    param_name = 'param_%s' % param

    # Extract information from the cross validation model
    cv_results = model.cv_results_
    if param_name not in cv_results:
        raise KeyError('%r was not searched; cv_results_ has no %r entry'
                       % (param, param_name))
    test_scores = cv_results['mean_test_score']
    train_time = cv_results['mean_fit_time']
    param_values = list(cv_results[param_name])

    if score_label is None:
        # Label the axis with the metric the search actually optimised.
        scoring = getattr(model, 'scoring', None)
        score_label = scoring if isinstance(scoring, str) else 'Mean CV Score'

    fig, (score_ax, time_ax) = plt.subplots(1, 2, figsize=(10, 6))

    # Left panel: score over the parameter
    score_ax.plot(param_values, test_scores, '-', label = 'test')
    if score_ylim is not None:
        score_ax.set_ylim(*score_ylim)
    score_ax.legend()
    score_ax.set_xlabel(name)
    score_ax.set_ylabel(score_label)
    score_ax.set_title('Score vs %s' % name)

    # Right panel: fit time over the parameter
    time_ax.plot(param_values, train_time, '-')
    if time_ylim is not None:
        time_ax.set_ylim(*time_ylim)
    time_ax.set_xlabel(name)
    time_ax.set_ylabel('Train Time (sec)')
    time_ax.set_title('Training Time vs %s' % name)

    fig.tight_layout(pad = 4)

parameter tests

function to evaluate the models 

In [28]:
# First grid search: tune the number of trees, which is linked to the
# learning rate (the estimator keeps its default learning rate here).

from sklearn.model_selection import GridSearchCV


param_test1 = {'n_estimators': range(10, 81, 10)}
# random_state is fixed, matching the other searches in this notebook, so
# the selected value is reproducible between runs.
grid_search_num_trees = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=10),
    param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=5)
# fit() needs the data to search on; use the training split like the other
# grid searches so the held-out test rows stay unseen.
grid_search_num_trees.fit(x_train, y_train)

print(grid_search_num_trees.best_params_, grid_search_num_trees.best_score_)




{'n_estimators': 50} 0.9951735825449062


In [33]:
# Second grid search: tree depth together with the minimum node size that
# may still be split, using the tree count selected above.
# An integer min_samples_split must be at least 2 in scikit-learn, so the
# smallest candidate is 2 (the previous value 1 made those fits fail); the
# other candidates are unchanged.
param_test2 = {'max_depth': range(1, 10, 2), 'min_samples_split': [2, 201, 401]}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=50, max_features='sqrt',
        subsample=0.8, random_state=10),
    param_grid=param_test2, scoring='roc_auc', n_jobs=4, cv=5)
gsearch2.fit(x_train, y_train)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 3, 'min_samples_split': 201}, 0.9964754730563555)

In [36]:
# Grid search over the number of features examined per split.
# A split cannot consider more features than the data has, so the candidate
# list (7, 9, ..., 19) is capped at the number of feature columns instead of
# submitting values the estimator cannot honour.
param_test4 = {'max_features': range(7, min(19, x_train.shape[1]) + 1, 2)}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1, n_estimators=50, max_depth=3,
        min_samples_split=200, min_samples_leaf=60,
        subsample=0.8, random_state=10),
    param_grid=param_test4, scoring='roc_auc', n_jobs=4, cv=5)
gsearch4.fit(x_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

({'max_features': 9}, 0.9952753068010421)