# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score


# Loading Dataset

In [2]:
# Load the breast cancer dataset bundled with scikit-learn.
# `datasets` comes from `from sklearn import datasets` in the imports cell at
# the top of the notebook, so every dependency is declared in one place.
# The returned object exposes `.data`, `.target`, `.feature_names` and
# `.target_names`, which the following cells read.
cancer = datasets.load_breast_cancer()

# Exploring Dataset

Checking features and target names.

In [3]:
# Print the names of the 30 input features
print("Features: ", cancer.feature_names)

# Print the class labels; the position in this array is the encoded target
# value (0 = 'malignant', 1 = 'benign')
print("Labels: ", cancer.target_names)

Features:  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Labels:  ['malignant' 'benign']


# Splitting Dataset

Split the dataset with `train_test_split()`. It takes three main arguments: the features, the target, and the test-set size. Passing `random_state` additionally fixes the seed of the random shuffle, so the same records land in the training and test sets on every run.

In [4]:
# Feature matrix and encoded target vector taken from the loaded dataset
X, y = cancer.data, cancer.target

# Hold out 20% of the records as the test set and train on the remaining 80%;
# the fixed random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=108,
)

# Generating Model

### Hyperparameter Tuning

RandomizedSearchCV should be applied after splitting the data but before training the final model. This helps in tuning the hyperparameters using cross-validation and finding the best model configuration.

**Parameters:**
- n_estimators: random number of trees from 50 to 199 (`randint(50, 200)`; the upper bound is exclusive)
- max_depth: random maximum depth from 5 to 19
- min_samples_split: random split threshold from 2 to 19
- min_samples_leaf: random minimum leaf size from 1 to 19
- bootstrap: whether each tree is built on a bootstrap sample (`True`) or on the whole training set (`False`)

In [5]:
# Base estimator to tune; seeded so that each candidate forest is reproducible
rF = RandomForestClassifier(random_state=108)

# Search space sampled by RandomizedSearchCV.
# scipy's randint(low, high) draws integers from [low, high): the upper bound
# is exclusive, so e.g. n_estimators ranges over 50..199.
param_dist = {
    'n_estimators': randint(50, 200),      # number of trees
    'max_depth': randint(5, 20),           # maximum depth of each tree
    'min_samples_split': randint(2, 20),   # min samples needed to split a node
    'min_samples_leaf': randint(1, 20),    # min samples required in a leaf
    'bootstrap': [True, False],            # bootstrap sample vs. full training set per tree
}

# Randomized search: 100 sampled configurations, each scored by 3-fold
# cross-validated accuracy on the training data, run on all available cores.
random_search = RandomizedSearchCV(
    estimator=rF,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    scoring='accuracy',
    random_state=108,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train, y_train)

# Best configuration found, refit on the whole training set by the search
bestRf = random_search.best_estimator_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [6]:
# `random_search` is already fitted and `bestRf` already assigned in the
# hyperparameter-tuning cell. The search and the estimator are both seeded, so
# calling random_search.fit() again here would only repeat the same 300 fits
# and yield the same model; the tuned estimator is reused instead.

# 5-fold cross-validation on the training data. cross_val_score clones the
# estimator for each fold, so `bestRf` itself is not modified by this step.
kF = KFold(n_splits=5, shuffle=True, random_state=42)
score = cross_val_score(bestRf, X_train, y_train, cv=kF, scoring='accuracy')

# NOTE: the hyperparameters were chosen on this same training set, so this
# figure is an optimistic estimate; the held-out test set is the unbiased check.
print(f'Cross-validated Accuracy on Training Set: {score.mean():.4f}')


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Cross-validated Accuracy on Training Set: 0.9473


# Model Evaluation

Let's measure how accurately the tuned classifier separates malignant from benign tumours on the held-out test set.

1. **Accuracy**: The ratio of correct predictions (both true positives and true negatives) to the total number of predictions.

    `Accuracy = (True Positives + True Negatives) / Total Predictions`

2. **Precision**: Precision is defined as the number of true positive predictions divided by all positive predictions of the model.

    `Precision = True Positives / (True Positives + False Positives)`

3. **Recall**: Recall is the ratio of true positive predictions against the sum of all actual positive instances. It is also known as sensitivity or true positive rate.

    `Recall = True Positives / (True Positives + False Negatives)`


4. **F1-score**: The F1-score is the harmonic mean of precision and recall. It is a more balanced measure considering both false positives and false negatives. A higher F1-score indicates a better balance between precision and recall.


    `F1-score = 2 * (Precision * Recall) / (Precision + Recall)`



In [8]:
# Predict labels for the held-out test set with the tuned forest
y_pred = bestRf.predict(X_test)

# Performance metrics.
# NOTE(review): precision_score, recall_score and f1_score use pos_label=1 by
# default, and label 1 is 'benign' in this dataset, so these three numbers
# describe how well *benign* cases are identified. Pass pos_label=0 if the
# malignant class is the one of interest - confirm the intended positive class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report every metric with four decimal places
print("\nFinal Model Performance on Test Dataset:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")



Final Model Performance on Test Dataset:
Accuracy: 0.9737
Precision: 0.9722
Recall: 0.9859
F1-score: 0.9790


# Specificity and Sensitivity

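**Sensitivity *(True Positive Rate)***: Sensitivity measures the proportion of actual positives that are correctly identified by the test. Note that scikit-learn treats label 1 as the positive class, and in this dataset label 1 is *benign*; the sensitivity computed below is therefore the share of benign cases recognised as benign, and the specificity is the share of malignant cases recognised as malignant.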

Formula: 
    `Sensitivity = TP / (TP + FN)`
    
    Where:
    - TP = True Positives
    - FN = False Negatives

**Specificity *(True Negative Rate)***: Specificity measures the proportion of actual negatives that are correctly identified by the test. 

Formula:
    `Specificity = TN / (TN + FP)`
    
    Where:
    - TN = True Negatives
    - FP = False Positives


In [9]:
# Binary confusion matrix: rows are the true labels, columns the predicted
# labels, both ordered [0, 1]. Unpacking row by row gives
#   row 0 -> (tn, fp)   true label 0
#   row 1 -> (fn, tp)   true label 1
# NOTE(review): "positive" therefore means label 1, which is 'benign' in this
# dataset - confirm this is the intended positive class.
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

# Sensitivity: share of actual positives predicted as positive
positives_total = tp + fn
sensitivity = tp / positives_total

# Specificity: share of actual negatives predicted as negative
negatives_total = tn + fp
specificity = tn / negatives_total

print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")

Sensitivity: 0.9859
Specificity: 0.9535
[CV] END bootstrap=True, max_depth=12, min_samples_leaf=9, min_samples_split=17, n_estimators=173; total time=   0.5s
[CV] END bootstrap=False, max_depth=7, min_samples_leaf=12, min_samples_split=11, n_estimators=143; total time=   0.4s
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=13, min_samples_split=14, n_estimators=160; total time=   0.5s
[CV] END bootstrap=False, max_depth=13, min_samples_leaf=5, min_samples_split=18, n_estimators=175; total time=   0.5s
[CV] END bootstrap=False, max_depth=6, min_samples_leaf=9, min_samples_split=6, n_estimators=125; total time=   0.3s
[CV] END bootstrap=False, max_depth=16, min_samples_leaf=18, min_samples_split=10, n_estimators=175; total time=   0.5s
[CV] END bootstrap=True, max_depth=11, min_samples_leaf=13, min_samples_split=12, n_estimators=81; total time=   0.3s
[CV] END bootstrap=True, max_depth=18, min_samples_leaf=16, min_samples_split=12, n_estimators=166; total time=   0.5s
[CV] END b