In [60]:
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()        

In [61]:
print(breast_cancer.DESCR)  #for show datasets

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

# Dataset Info

In [62]:
breast_cancer.target[500] #check sample - 0 = - and 1 = +

np.int64(1)

In [63]:
breast_cancer.target.shape #show  Instances

(569,)

In [64]:
breast_cancer.data[500]  #show data people

array([1.504e+01, 1.674e+01, 9.873e+01, 6.894e+02, 9.883e-02, 1.364e-01,
       7.721e-02, 6.142e-02, 1.668e-01, 6.869e-02, 3.720e-01, 8.423e-01,
       2.304e+00, 3.484e+01, 4.123e-03, 1.819e-02, 1.996e-02, 1.004e-02,
       1.055e-02, 3.237e-03, 1.676e+01, 2.043e+01, 1.097e+02, 8.569e+02,
       1.135e-01, 2.176e-01, 1.856e-01, 1.018e-01, 2.177e-01, 8.549e-02])

In [65]:
breast_cancer.data.shape # Instances & Feature 

(569, 30)

# Preprocessing

In [66]:
from sklearn.model_selection import train_test_split
                                                  #Split dataset into 80% training and 20% testing sets
Feature_train,Feature_test,Label_train,Label_test = train_test_split(breast_cancer.data,breast_cancer.target,test_size=0.2)
print(f"Feature : train {Feature_train.shape} | test {Feature_test.shape}")
print(f"Label   : train {Label_train.shape}    | test {Label_test.shape}")

Feature : train (455, 30) | test (114, 30)
Label   : train (455,)    | test (114,)


In [67]:
Feature_train[0]

array([1.371e+01, 2.083e+01, 9.020e+01, 5.779e+02, 1.189e-01, 1.645e-01,
       9.366e-02, 5.985e-02, 2.196e-01, 7.451e-02, 5.835e-01, 1.377e+00,
       3.856e+00, 5.096e+01, 8.805e-03, 3.029e-02, 2.488e-02, 1.448e-02,
       1.486e-02, 5.412e-03, 1.706e+01, 2.814e+01, 1.106e+02, 8.970e+02,
       1.654e-01, 3.682e-01, 2.678e-01, 1.556e-01, 3.196e-01, 1.151e-01])

In [68]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1)) # Normalizes data to [0, 1] range
Feature_train = scaler.fit_transform(Feature_train)
Feature_test = scaler.transform(Feature_test)

In [69]:
Feature_train[0]
# Normalizes data OK!

array([0.29477447, 0.47306474, 0.30075402, 0.17484768, 0.59826668,
       0.44512607, 0.21944705, 0.29746521, 0.61538462, 0.51705981,
       0.17005003, 0.22471711, 0.14602083, 0.08247696, 0.24108509,
       0.2105777 , 0.17301808, 0.36872931, 0.09818765, 0.20565258,
       0.30633726, 0.4513989 , 0.28524223, 0.16708019, 0.62226771,
       0.33075259, 0.24235294, 0.53599724, 0.32150601, 0.39393939])

# Classification    


In [70]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test):
    """
    Calculate and print evaluation metrics for classification model
    
    Args:
        Label_train: Actual labels of training set
        Label_test: Actual labels of test set
        Label_pred_train: Predicted labels for training set
        Label_pred_test: Predicted labels for test set
    
    Returns:
        Tuple of (train_accuracy, test_accuracy, precision, recall)
    """
    
    # Calculate accuracy for both train and test sets
    acc_train = accuracy_score(y_true=Label_train, y_pred=Label_pred_train)  # (TP + TN) / Total
    acc_test = accuracy_score(y_true=Label_test, y_pred=Label_pred_test)    # (TP + TN) / Total
    
    # Calculate precision and recall for test set only
    precision = precision_score(y_true=Label_test, y_pred=Label_pred_test)  # TP / (TP + FP)
    recall = recall_score(y_true=Label_test, y_pred=Label_pred_test)        # TP / (TP + FN)
    
    # Print formatted results
    print(f"Train Accuracy: {acc_train:.6f}  , Test Accuracy: {acc_test:.6f}\n"
          f"Precision:      {precision:.6f}  , Recall:        {recall:.6f}")
    
    return acc_train, acc_test, precision, recall

## 1. naive bayes

In [71]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(Feature_train , Label_train)


In [73]:
# Make predictions using trained GaussianNB model
Label_pred_train = gnb.predict(Feature_train)
Label_pred_test = gnb.predict(Feature_test)

# Evaluate model performance metrics
acc_train_gnb, acc_test_gnb, precision_gnb, recall_gnb = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.936264  , Test Accuracy: 0.929825
Precision:      0.943662  , Recall:        0.943662
