In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Load libraries and read the data


# 1.1. Load libraries

In [2]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib.pyplot as plt
from sklearn.tree import  DecisionTreeClassifier
from sklearn.model_selection  import  train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix,classification_report,precision_score, recall_score, roc_curve, precision_recall_curve 
import itertools
from sklearn.preprocessing import StandardScaler
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 1.2. Read the data

In [4]:
# Load the breast-cancer CSV shipped in the Kaggle input directory.
data_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(data_path)

# 1.3. Missing Values 

In [5]:
# Count of missing values per column (isnull is an alias of isna).
df.isnull().sum()

# 1.4. Reassign target and drop useless features

In [6]:
# Drop the columns this notebook treats as useless: 'Unnamed: 32' and 'id'.
# The frame is reassigned rather than mutated in place, and errors='ignore'
# lets the cell be re-run after the columns are already gone (the in-place
# version raised KeyError on a second run).
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [7]:
# Encode the target as integers: 'M' -> 1, 'B' -> 0
# (presumably malignant / benign — confirm against the dataset description).
# The result is assigned back to the frame: replace(inplace=True) called on
# `df.diagnosis` acts on an intermediate Series and is not guaranteed to
# update `df` (it does not under pandas Copy-on-Write). astype(int) pins an
# integer dtype so the labels are a valid classification target, and makes
# the cell safe to re-run on an already-encoded column.
df['diagnosis'] = df['diagnosis'].replace({'M': 1, 'B': 0}).astype(int)

# 2 Exploratory Data Analysis (EDA)

# 2.1  Head and Describe

In [8]:
df.head()

In [9]:
df.info()

In [10]:
df.describe()

In [11]:
df['diagnosis'].value_counts()

# 3.   **Functions**

In [12]:
# Confusion matrix 
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Blues) :
    """Draw a confusion matrix as an annotated heat map on the current axes.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are true labels, columns are predicted labels
        (matching the axis labels set below).
    classes : sequence
        Tick labels, one per class, in matrix order.
    normalize : bool, default False
        When True, every row is divided by its sum so each cell shows the
        fraction of that true class; cells are then printed with 2 decimals.
        (The flag was previously accepted but ignored.)
    title : str
        Title drawn above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    The caller is responsible for creating the figure and for calling
    ``plt.show()`` / ``plt.savefig()``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay all-zero instead of turning into NaN.
        cm = np.divide(cm, row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])) :
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Show metrics 
def show_metrics(matrix=None):
    """Print accuracy, precision, recall and F1 for a 2x2 confusion matrix.

    Parameters
    ----------
    matrix : array-like of shape (2, 2), optional
        Confusion matrix laid out as ``[[tn, fp], [fn, tp]]``. When omitted,
        the module-level variable ``cm`` is used, which keeps the existing
        ``show_metrics()`` calls in this notebook working.

    A metric whose denominator is zero is printed as ``nan`` instead of
    raising ``ZeroDivisionError`` or emitting a NumPy runtime warning.
    """
    if matrix is None:
        # Backward-compatible fallback to the notebook-global matrix.
        matrix = cm
    matrix = np.asarray(matrix)

    tn, fp = matrix[0, 0], matrix[0, 1]
    fn, tp = matrix[1, 0], matrix[1, 1]

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as NaN.
        return numerator / denominator if denominator else float('nan')

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)

    print('Accuracy  =     {:.3f}'.format(_ratio(tp + tn, tp + tn + fp + fn)))
    print('Precision =     {:.3f}'.format(precision))
    print('Recall    =     {:.3f}'.format(recall))
    print('F1_score  =     {:.3f}'.format(
        2 * _ratio(precision * recall, precision + recall)))

In [13]:
# Precision-recall curve
def plot_precision_recall(precision_values=None, recall_values=None):
    """Plot a precision-recall curve and show the figure.

    Parameters
    ----------
    precision_values, recall_values : array-like, optional
        Precision and recall at each decision threshold, e.g. the first two
        outputs of ``sklearn.metrics.precision_recall_curve``. When omitted,
        the module-level variables ``precision`` and ``recall`` are used,
        which preserves the original zero-argument call; those globals must
        then have been assigned beforehand.
    """
    if precision_values is None:
        precision_values = precision
    if recall_values is None:
        recall_values = recall

    # Light step outline plus shaded area under the curve.
    plt.step(recall_values, precision_values, color = 'b', alpha = 0.2,
             where = 'post')
    plt.fill_between(recall_values, precision_values, step ='post', alpha = 0.2,
                 color = 'b')

    plt.plot(recall_values, precision_values, linewidth=2)
    plt.xlim([0.0,1])
    plt.ylim([0.0,1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall Curve')
    plt.show()

# 4.  **Prepare Dataset** 

# 4.1. Define (X, y) 

In [14]:
# Separate the feature columns from the target column.
X = df.drop(columns='diagnosis')
y = df['diagnosis']
# Second reference to the feature DataFrame, so its column names stay
# available (they are used by the feature-importance plot).
z = X

# 4.2. Standard scaler (X)

In [15]:
# Normalization
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 4.3. Train_test split

In [16]:
# Train_test split
SEED = 1 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify = y, random_state = SEED)

# 5. **Trees** 

# 5.1. DecisionTreeClassifier 

In [17]:

# Regularised tree: depth capped at 4, min_samples_leaf given as a fraction
# (0.14) of the training samples.
dtc = DecisionTreeClassifier(random_state=SEED, min_samples_leaf=0.14, max_depth=4)
dtc.fit(X_train, y_train)

# Class labels predicted for the held-out test set
y_pred = dtc.predict(X_test)

In [18]:
# accuracy_score(X_train,y_train)

In [19]:
# Fraction of test samples the tree labels correctly
accuracy_score(y_true=y_test, y_pred=y_pred)

In [20]:
# Confusion matrix & metrics for the decision tree
class_names = [0, 1]
cm = confusion_matrix(y_test, y_pred)

plt.figure()
plot_confusion_matrix(cm, class_names, title='Tree Confusion matrix')
plt.savefig('3')
plt.show()

# show_metrics() reads the global `cm` assigned above.
show_metrics()

In [21]:
# Per-class precision / recall / F1 table for the decision tree
report = classification_report(y_test, y_pred)
print(report)

# Cross-Validation

In [22]:
SEED =113
dt = DecisionTreeClassifier( max_depth=4,min_samples_leaf=0.14,random_state=SEED)

In [23]:
cv_C = cross_val_score(dt,X_train, y_train, cv=10,scoring ='accuracy', n_jobs=-1)

In [24]:
# Names accepted by the `scoring=` argument of the model-selection tools.
# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.1 and removed in
# 1.3; get_scorer_names() (available since 1.0) is the public replacement.
sorted(sklearn.metrics.get_scorer_names())

In [25]:
# Fit 'dt' to the training set          
dt.fit(X_train, y_train)

In [26]:
# Predict the labels of training set
y_predict_train = dt.predict(X_train)

In [27]:
# Predict the labels of test set
y_predict_test = dt.predict(X_test)

In [28]:
#CV accuracy
print('CV accuracy:{:.2f}'.format(cv_C.mean()))

In [29]:
#train set accuracy
print('Train sccuracy:{:.2f}'.format(accuracy_score(y_train, y_predict_train)))

In [30]:
#test set accuracy
print('Test accuracy:{:.2f}'.format(accuracy_score(y_test, y_predict_test)))

# 5.2. Bagging

In [31]:
# Base learner: a regularised classification tree
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.14, random_state=1)

# Bagging ensemble of 300 such trees.
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4), and the positional form works with either. random_state makes the
# bootstrap resampling reproducible.
bc = BaggingClassifier(dt, n_estimators=300, random_state=SEED, n_jobs=-1)
# Fit on the training set
bc.fit(X_train, y_train)
# Predict test-set labels
y_pred = bc.predict(X_test)



In [32]:
# Test-set accuracy of the bagging ensemble
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy of bagging Classifier:{:.3f}'.format(accuracy))


In [33]:
# Confusion matrix & metrics for the bagging classifier
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title names the model actually being evaluated (it previously said "Tree").
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Bagging Confusion matrix')
plt.savefig('4')
plt.show()

# show_metrics() reads the global `cm` assigned above.
show_metrics()

# 5.3. Out-of-bag (OOB)

In [34]:
# Bagging ensemble 'oob_c' with out-of-bag scoring enabled.
# The base learner `dt` is passed positionally because the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4). random_state makes the bootstrap samples, and therefore the OOB
# score, reproducible.
oob_c = BaggingClassifier(dt, n_estimators=300, oob_score=True, random_state=SEED, n_jobs=-1)

# Fit oob_c on the training set
oob_c.fit(X_train, y_train)

# Predict test-set labels
y_pred = oob_c.predict(X_test)

In [35]:
# Compare the held-out test accuracy with the out-of-bag estimate that the
# ensemble computed while fitting.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
oob_accuracy = oob_c.oob_score_
for message, value in (('test accuracy :{:.3f}', accuracy),
                       ('oob accuracy :{:.3f}', oob_accuracy)):
    print(message.format(value))

In [36]:
# Confusion matrix & metrics for the OOB-scored bagging classifier
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title names the model actually being evaluated (it previously said "Tree").
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Bagging (OOB) Confusion matrix')
plt.savefig('5')
plt.show()

# show_metrics() reads the global `cm` assigned above.
show_metrics()

# 6.  **RandomForest**

# 6.1. RandomForest

In [37]:
# RandomForest
rf = RandomForestClassifier(n_estimators=400, min_samples_leaf=0.12, random_state=SEED)

In [38]:
#fit bc to Train set
rf.fit(X_train, y_train)
#predict
y_pred = rf.predict(X_test)

In [39]:
# Test-set accuracy of the random forest
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('test accuracy :{:.3f}'.format(accuracy))

In [40]:
# Confusion matrix & metrics for the random forest
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title names the model actually being evaluated (it previously said "Tree").
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Random Forest Confusion matrix')
plt.savefig('6')
plt.show()

# show_metrics() reads the global `cm` assigned above.
show_metrics()

In [41]:
# Feature importances of the fitted forest, labelled with the feature
# column names kept in `z`
importances_rf = pd.Series(data=rf.feature_importances_, index=z.columns)
# Ascending order
sorted_importances_rf = importances_rf.sort_values()
# Horizontal bar chart
sorted_importances_rf.plot.barh(color='lightgreen')
plt.show()

# 6.2. AdaBoost

In [42]:
# Instantiate a classification-tree 'dt'
dt = DecisionTreeClassifier(max_depth=1, random_state=SEED)
# Instantiate an AdaBoost classifier 'adab_clf'
adb_clf = AdaBoostClassifier(base_estimator=dt, n_estimators=100)

In [43]:
adb_clf.fit(X_train, y_train)

In [44]:
# Predict the test set probabilities of positive class
y_pred_proba = adb_clf.predict_proba(X_test)[:,1]
# Evaluate test-set roc_auc_score
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)


In [45]:
print('ROC AUC score: {:.2f}'.format(adb_clf_roc_auc_score))

# 6.3 Gradient Boosting 

In [46]:
gbt = GradientBoostingClassifier(n_estimators=300, max_depth=1, random_state=SEED)

In [47]:
gbt.fit(X_train, y_train)
y_pred = gbt.predict(X_test)

In [48]:
# Test-set accuracy of the gradient-boosting model.
# The result was previously stored in a misspelled variable (`accuray`), so
# the print showed the stale `accuracy` left over from the random forest.
accuracy = accuracy_score(y_test, y_pred)
print('test accuracy: {:.3f}'.format(accuracy))

In [49]:
# Confusion matrix & metrics for the gradient-boosting model
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title names the model actually being evaluated (it previously said "Tree").
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Gradient Boosting Confusion matrix')
plt.savefig('7')
plt.show()

# show_metrics() reads the global `cm` assigned above.
show_metrics()

# Stochastic Gradient Boosting

In [50]:
# Stochastic gradient boosting 'sgbt': 300 depth-1 stages, with row
# subsampling (subsample=0.8) and feature subsampling (max_features=0.2)
sgbt = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    random_state=SEED,
)
# Train on the training set
sgbt.fit(X=X_train, y=y_train)
# Labels predicted for the test set
y_pred = sgbt.predict(X=X_test)


In [51]:
# Test-set accuracy of the stochastic gradient-boosting model.
# The result was previously stored in a misspelled variable (`accuray`), so
# the print showed a stale `accuracy` computed for an earlier model.
accuracy = accuracy_score(y_test, y_pred)
print('test accuracy: {:.3f}'.format(accuracy))

In [52]:
# Confusion matrix & metrics for the stochastic gradient-boosting model
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
# Title names the model actually being evaluated (it previously said "Tree").
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Stochastic Gradient Boosting Confusion matrix')
plt.savefig('8')
plt.show()

# show_metrics() reads the global `cm` assigned above.
show_metrics()

# 7. **Hyperparameter Tuning using GridSearchCV**

# 7.2. GridSearchCV for Tree

In [53]:
# Exhaustive search over depth, leaf size and feature fraction of a decision
# tree, scored by 10-fold cross-validated accuracy on the training set.
params_dt = {
    'max_depth': list(range(3, 9)),
    'min_samples_leaf': [0.04, 0.06, 0.08],
    'max_features': [0.2, 0.4, 0.6, 0.8],
}
dt = DecisionTreeClassifier(random_state=SEED)
grid_dt = GridSearchCV(dt, params_dt, scoring='accuracy', cv=10, n_jobs=-1)
grid_dt.fit(X_train, y_train)

In [54]:
# Best hyperparameter combination found by 'grid_dt'
# (label typo 'hyerparameters' fixed)
best_hyperparams = grid_dt.best_params_
print('Best hyperparameters:\n', best_hyperparams)

In [55]:
# Best mean cross-validated score found by 'grid_dt'.
# The format string previously had no placeholder, so the score itself was
# never printed.
best_CV_score = grid_dt.best_score_
print('Best CV accuracy: {:.3f}'.format(best_CV_score))

In [56]:
# Estimator refitted by 'grid_dt' with the best hyperparameters
best_model = grid_dt.best_estimator_

# Its score on the held-out test set
test_acc = best_model.score(X=X_test, y=y_test)

print("Test set accuracy of best model: {:.3f}".format(test_acc))

# 7.3. GridSearchCV for Forest

In [57]:
clf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 20]
}

In [58]:
grid_rf = GridSearchCV(estimator =clf, param_grid=param_grid, scoring = 'recall', cv=10,n_jobs=-1)
grid_rf.fit(X_train, y_train)

In [59]:
# Estimator refitted by 'grid_rf' with the best hyperparameters.
# Note: the search above was scored with 'recall'; the figure printed here is
# whatever the estimator's own score() method returns.
best_model = grid_rf.best_estimator_

test_acc = best_model.score(X=X_test, y=y_test)

print("Test set accuracy of best model: {:.3f}".format(test_acc))

In [60]:
# Hyperparameter combination selected by the random-forest grid search
best_parameters = grid_rf.best_params_
print('The best parameters for using this model is', best_parameters, sep=' ')

In [61]:
CV_rfc = RandomForestClassifier(max_depth = best_parameters['max_depth'], max_features = best_parameters['max_features'], min_samples_split = best_parameters['min_samples_split'] , 
                               n_estimators =best_parameters['n_estimators'] )

In [62]:
CV_rfc.fit(X_train, y_train)
y_pred = CV_rfc.predict(X_test)

# Confusion maxtrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cm, 
                      classes=class_names, 
                      title='Logistic Confusion matrix')
plt.savefig('9')
plt.show()

show_metrics()

# Recall for various Decision Threshold

In [63]:
# Decision thresholds to compare
thresholds_adj = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
class_names = [0,1]

# Positive-class probabilities do not depend on the threshold, so they are
# computed once instead of once per loop iteration.
proba_positive = CV_rfc.predict_proba(X_test)[:,1]

plt.figure(figsize = (15,15))

# One subplot per threshold in a 3x3 grid; `j` is the 1-based subplot index.
for j, i in enumerate(thresholds_adj, start = 1):
    # Samples whose probability exceeds the threshold are labelled 1.
    y_score = (proba_positive > i).astype(int)

    plt.subplot(3,3,j)

    # labels= pins the matrix to 2x2 even if one class is never predicted.
    cm = confusion_matrix(y_test, y_score, labels = class_names)

    tp = cm[1,1]
    fn = cm[1,0]

    print('Recall w/ threshold = %s :'%i, (tp/(tp+fn)))

    plot_confusion_matrix(cm, 
                          classes=class_names, 
                          title='Threshold = %s'%i) 

plt.show()

In [68]:
# Metrics at a lowered decision threshold of 0.13
# (the original note for this cell reads "Recall = 1.").
class_names = [0, 1]
y_score = CV_rfc.predict_proba(X_test)[:, 1] > 0.13
cm = confusion_matrix(y_test, y_score)
# show_metrics() reads the global `cm` assigned above.
show_metrics()