In [122]:
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()        

In [123]:
print(breast_cancer.DESCR)  #for show datasets

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

# Dataset Info

In [124]:
breast_cancer.target[500] #check sample - 0 = - and 1 = +

np.int64(1)

In [125]:
breast_cancer.target.shape #show  Instances

(569,)

In [126]:
breast_cancer.data[500]  #show data people

array([1.504e+01, 1.674e+01, 9.873e+01, 6.894e+02, 9.883e-02, 1.364e-01,
       7.721e-02, 6.142e-02, 1.668e-01, 6.869e-02, 3.720e-01, 8.423e-01,
       2.304e+00, 3.484e+01, 4.123e-03, 1.819e-02, 1.996e-02, 1.004e-02,
       1.055e-02, 3.237e-03, 1.676e+01, 2.043e+01, 1.097e+02, 8.569e+02,
       1.135e-01, 2.176e-01, 1.856e-01, 1.018e-01, 2.177e-01, 8.549e-02])

In [127]:
breast_cancer.data.shape # Instances & Feature 

(569, 30)

# Preprocessing

In [128]:
from sklearn.model_selection import train_test_split
                                                  #Split dataset into 80% training and 20% testing sets
Feature_train,Feature_test,Label_train,Label_test = train_test_split(breast_cancer.data,breast_cancer.target,test_size=0.2)
print(f"Feature : train {Feature_train.shape} | test {Feature_test.shape}")
print(f"Label   : train {Label_train.shape}    | test {Label_test.shape}")

Feature : train (455, 30) | test (114, 30)
Label   : train (455,)    | test (114,)


In [129]:
Feature_train[0]

array([9.742e+00, 1.912e+01, 6.193e+01, 2.897e+02, 1.075e-01, 8.333e-02,
       8.934e-03, 1.967e-02, 2.538e-01, 7.029e-02, 6.965e-01, 1.747e+00,
       4.607e+00, 4.352e+01, 1.307e-02, 1.885e-02, 6.021e-03, 1.052e-02,
       3.100e-02, 4.225e-03, 1.121e+01, 2.317e+01, 7.179e+01, 3.809e+02,
       1.398e-01, 1.352e-01, 2.085e-02, 4.589e-02, 3.196e-01, 8.009e-02])

In [130]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1)) # Normalizes data to [0, 1] range
Feature_train = scaler.fit_transform(Feature_train)
Feature_test = scaler.transform(Feature_test)

In [131]:
Feature_train[0]
# Normalizes data OK!

array([0.13067348, 0.31822793, 0.12535416, 0.0620675 , 0.49535073,
       0.19615361, 0.02095216, 0.10282279, 0.74646465, 0.45117621,
       0.21184139, 0.30648868, 0.18140696, 0.07077514, 0.59657509,
       0.12465828, 0.01981896, 0.25721271, 0.32529408, 0.14946736,
       0.13015873, 0.29717484, 0.11951479, 0.06418919, 0.45321271,
       0.10469482, 0.01665335, 0.15769759, 0.40846481, 0.16430539])

# Classification    


In [132]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test):
    """
    Calculate and print evaluation metrics for classification model
    
    Args:
        Label_train: Actual labels of training set
        Label_test: Actual labels of test set
        Label_pred_train: Predicted labels for training set
        Label_pred_test: Predicted labels for test set
    
    Returns:
        Tuple of (train_accuracy, test_accuracy, precision, recall)
    """
    
    # Calculate accuracy for both train and test sets
    acc_train = accuracy_score(y_true=Label_train, y_pred=Label_pred_train)  # (TP + TN) / Total
    acc_test = accuracy_score(y_true=Label_test, y_pred=Label_pred_test)    # (TP + TN) / Total
    
    # Calculate precision and recall for test set only
    precision = precision_score(y_true=Label_test, y_pred=Label_pred_test)  # TP / (TP + FP)
    recall = recall_score(y_true=Label_test, y_pred=Label_pred_test)        # TP / (TP + FN)
    
    # Print formatted results
    print(f"Train Accuracy: {acc_train:.6f}  , Test Accuracy: {acc_test:.6f}\n"
          f"Precision:      {precision:.6f}  , Recall:        {recall:.6f}")
    
    return acc_train, acc_test, precision, recall

## 1. naive bayes

In [133]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(Feature_train , Label_train)


In [134]:
# Make predictions using trained GaussianNB model
Label_pred_train = gnb.predict(Feature_train)
Label_pred_test = gnb.predict(Feature_test)

# Evaluate model performance metrics
acc_train_gnb, acc_test_gnb, precision_gnb, recall_gnb = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.945055  , Test Accuracy: 0.885965
Precision:      0.891892  , Recall:        0.929577


## 2. KNN

In [135]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree',leaf_size=28)
knn.fit(Feature_train , Label_train)


In [136]:
# Make predictions using trained KNN model
Label_pred_train = knn.predict(Feature_train)
Label_pred_test = knn.predict(Feature_test)

# Evaluate model performance metrics
acc_train_knn, acc_test_knn, precision_knn, recall_knn = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.971429  , Test Accuracy: 0.956140
Precision:      0.958333  , Recall:        0.971831


## 3. decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=128 , min_samples_split=8 , criterion='entropy')
dt.fit(Feature_train , Label_train)

In [155]:
# Make predictions using trained decision tree model
Label_pred_train = dt.predict(Feature_train)
Label_pred_test = dt.predict(Feature_test)

# Evaluate model performance metrics
acc_train_dt, acc_test_dt, precision_dt, recall_dt = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.991209  , Test Accuracy: 0.877193
Precision:      0.925373  , Recall:        0.873239
