In [226]:
from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()        

In [227]:
print(breast_cancer.DESCR)  #for show datasets

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

# Dataset Info

In [228]:
breast_cancer.target[500] #check sample - 0 = - and 1 = +

np.int64(1)

In [229]:
breast_cancer.target.shape #show  Instances

(569,)

In [230]:
breast_cancer.data[500]  #show data people

array([1.504e+01, 1.674e+01, 9.873e+01, 6.894e+02, 9.883e-02, 1.364e-01,
       7.721e-02, 6.142e-02, 1.668e-01, 6.869e-02, 3.720e-01, 8.423e-01,
       2.304e+00, 3.484e+01, 4.123e-03, 1.819e-02, 1.996e-02, 1.004e-02,
       1.055e-02, 3.237e-03, 1.676e+01, 2.043e+01, 1.097e+02, 8.569e+02,
       1.135e-01, 2.176e-01, 1.856e-01, 1.018e-01, 2.177e-01, 8.549e-02])

In [231]:
breast_cancer.data.shape # Instances & Feature 

(569, 30)

# Preprocessing

In [232]:
from sklearn.model_selection import train_test_split
                                                  #Split dataset into 80% training and 20% testing sets
Feature_train,Feature_test,Label_train,Label_test = train_test_split(breast_cancer.data,breast_cancer.target,test_size=0.2)
print(f"Feature : train {Feature_train.shape} | test {Feature_test.shape}")
print(f"Label   : train {Label_train.shape}    | test {Label_test.shape}")

Feature : train (455, 30) | test (114, 30)
Label   : train (455,)    | test (114,)


In [233]:
Feature_train[0]

array([1.236e+01, 2.180e+01, 7.978e+01, 4.661e+02, 8.772e-02, 9.445e-02,
       6.015e-02, 3.745e-02, 1.930e-01, 6.404e-02, 2.978e-01, 1.502e+00,
       2.203e+00, 2.095e+01, 7.112e-03, 2.493e-02, 2.703e-02, 1.293e-02,
       1.958e-02, 4.463e-03, 1.383e+01, 3.050e+01, 9.146e+01, 5.747e+02,
       1.304e-01, 2.463e-01, 2.434e-01, 1.205e-01, 2.972e-01, 9.261e-02])

In [234]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1)) # Normalizes data to [0, 1] range
Feature_train = scaler.fit_transform(Feature_train)
Feature_test = scaler.transform(Feature_test)

In [235]:
Feature_train[0]
# Normalizes data OK!

array([0.25457901, 0.40886033, 0.24870431, 0.13683987, 0.31678252,
       0.23026195, 0.14093252, 0.1861332 , 0.43939394, 0.29654591,
       0.06746334, 0.25234264, 0.06813363, 0.0264252 , 0.24975714,
       0.17032175, 0.06825758, 0.24493275, 0.16460292, 0.1210424 ,
       0.20988972, 0.49253731, 0.20444245, 0.09572847, 0.39113782,
       0.24050911, 0.19440895, 0.41408935, 0.27735068, 0.3184978 ])

# Classification    


In [236]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

def calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test):
    """
    Calculate and print evaluation metrics for classification model
    
    Args:
        Label_train: Actual labels of training set
        Label_test: Actual labels of test set
        Label_pred_train: Predicted labels for training set
        Label_pred_test: Predicted labels for test set
    
    Returns:
        Tuple of (train_accuracy, test_accuracy, precision, recall)
    """
    
    # Calculate accuracy for both train and test sets
    acc_train = accuracy_score(y_true=Label_train, y_pred=Label_pred_train)  # (TP + TN) / Total
    acc_test = accuracy_score(y_true=Label_test, y_pred=Label_pred_test)    # (TP + TN) / Total
    
    # Calculate precision and recall for test set only
    precision = precision_score(y_true=Label_test, y_pred=Label_pred_test)  # TP / (TP + FP)
    recall = recall_score(y_true=Label_test, y_pred=Label_pred_test)        # TP / (TP + FN)
    
    # Print formatted results
    print(f"Train Accuracy: {acc_train:.6f}  , Test Accuracy: {acc_test:.6f}\n"
          f"Precision:      {precision:.6f}  , Recall:        {recall:.6f}")
    
    return acc_train, acc_test, precision, recall

## 1. naive bayes

In [237]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(Feature_train , Label_train)


In [238]:
# Make predictions using trained GaussianNB model
Label_pred_train = gnb.predict(Feature_train)
Label_pred_test = gnb.predict(Feature_test)

# Evaluate model performance metrics
acc_train_gnb, acc_test_gnb, precision_gnb, recall_gnb = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.949451  , Test Accuracy: 0.929825
Precision:      0.934211  , Recall:        0.959459


## 2. KNN

In [239]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree',leaf_size=28)
knn.fit(Feature_train , Label_train)


In [240]:
# Make predictions using trained KNN model
Label_pred_train = knn.predict(Feature_train)
Label_pred_test = knn.predict(Feature_test)

# Evaluate model performance metrics
acc_train_knn, acc_test_knn, precision_knn, recall_knn = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.978022  , Test Accuracy: 0.947368
Precision:      0.947368  , Recall:        0.972973


## 3. decision tree

In [241]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=128 , min_samples_split=8 , criterion='entropy')
dt.fit(Feature_train , Label_train)

In [242]:
# Make predictions using trained decision tree model
Label_pred_train = dt.predict(Feature_train)
Label_pred_test = dt.predict(Feature_test)

# Evaluate model performance metrics
acc_train_dt, acc_test_dt, precision_dt, recall_dt = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.989011  , Test Accuracy: 0.885965
Precision:      0.942029  , Recall:        0.878378


## 4.random forest

In [243]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000 , max_depth=512 , min_samples_split=4)
rf.fit(Feature_train , Label_train)

In [244]:
# Generate predictions using trained Random Forest model
Label_pred_train = rf.predict(Feature_train)
Label_pred_test = rf.predict(Feature_test)

acc_train_rf, acc_test_rf, precision_rf, recall_rf = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 1.000000  , Test Accuracy: 0.921053
Precision:      0.957746  , Recall:        0.918919


## 5.SVM

In [245]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(Feature_train , Label_train)


In [246]:
# Generate predictions using trained SVM model
Label_pred_train = svm.predict(Feature_train)
Label_pred_test = svm.predict(Feature_test)

acc_train_svm, acc_test_svm, precision_svm, recall_svm = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.984615  , Test Accuracy: 0.982456
Precision:      0.973684  , Recall:        1.000000


## 6. Logistic regression

In [250]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(Feature_train , Label_train)

In [252]:
# Generate predictions using trained Logistic Regression model
Label_pred_train = lr.predict(Feature_train)
Label_pred_test = lr.predict(Feature_test)

acc_train_lr, acc_test_lr, precision_lr, recall_lr = calculate_metrics(Label_train, Label_test, Label_pred_train, Label_pred_test)

Train Accuracy: 0.971429  , Test Accuracy: 0.973684
Precision:      0.961039  , Recall:        1.000000
