In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

In [2]:
# Fetch the bundled breast cancer dataset and unpack the feature matrix
# and the label vector in one step.
data = load_breast_cancer()
X, y = data.data, data.target

In [3]:
# Create a DataFrame for easier manipulation: one column per feature plus
# the class label in a trailing 'target' column.
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y

# Perform EDA
print("Basic Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothn

In [4]:
# looking at the data
print(data['target_names'])
print(data['target'])

['malignant' 'benign']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1

In [5]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Z-score standardizer: centre each feature on its mean and scale it to
# unit variance (both are the estimator's defaults, spelled out here).
scaler = StandardScaler(with_mean=True, with_std=True)

In [7]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the held-out split so no test information leaks
# into the scaler.
X_train_scaled = scaler.fit_transform(X=X_train)
X_test_scaled = scaler.transform(X=X_test)

In [8]:
# Wrap the scaled training matrix in a labelled DataFrame and attach the
# training labels as a 'target' column.
scaled_feature_columns = data.feature_names
df_train_scaled = pd.DataFrame(data=X_train_scaled, columns=scaled_feature_columns)
df_train_scaled['target'] = y_train

In [9]:
# Pairwise Pearson correlation matrix over the scaled training features
# and the target column.
correlations = df_train_scaled.corr(method='pearson')

In [10]:
# Take the 'target' column of the correlation matrix and discard the
# target's (trivial) correlation with itself.
target_column = correlations['target']
correlation_with_target = target_column.drop('target')

In [11]:
# Show the heading and the per-feature correlations, one per line
print(
    "Correlation coefficients with the target:",
    correlation_with_target,
    sep="\n",
)

Correlation coefficients with the target:
mean radius               -0.718073
mean texture              -0.416213
mean perimeter            -0.731859
mean area                 -0.695171
mean smoothness           -0.375137
mean compactness          -0.590554
mean concavity            -0.683262
mean concave points       -0.778115
mean symmetry             -0.348123
mean fractal dimension     0.014308
radius error              -0.540126
texture error              0.003226
perimeter error           -0.529211
area error                -0.517064
smoothness error           0.058123
compactness error         -0.254370
concavity error           -0.216047
concave points error      -0.380240
symmetry error            -0.004723
fractal dimension error   -0.041727
worst radius              -0.766527
worst texture             -0.467479
worst perimeter           -0.774998
worst area                -0.722875
worst smoothness          -0.430364
worst compactness         -0.588884
worst concavity       

In [12]:
# Select the top-N features ranked by the absolute value of their
# correlation with the target.
# NOTE: the variable keeps its historical name `top_15_features` because
# later cells reference it, but the cut-off actually applied is 20 features
# (see N_TOP_FEATURES) -- the name and the old "top 15" comment did not match
# the code.
N_TOP_FEATURES = 20
top_15_features = (
    correlation_with_target
    .abs()
    .sort_values(ascending=False)
    .head(N_TOP_FEATURES)
    .index
)

In [13]:
# Restrict both splits to the selected feature columns. The scaled test
# matrix is first given the original feature names so the same column
# labels can be used to index it.
df_test_scaled = pd.DataFrame(X_test_scaled, columns=data.feature_names)
X_train_selected = df_train_scaled[top_15_features].values
X_test_selected = df_test_scaled[top_15_features].values

In [14]:
# Initialize the XGBoost classifier with library-default hyperparameters.
# The seed is pinned (same value as the train/test split) so that any
# randomised component of training stays reproducible on Restart & Run All.
xgb_model = XGBClassifier(random_state=42)

In [15]:
# Train the classifier on the selected, scaled training features
xgb_model.fit(X=X_train_selected, y=y_train)

In [16]:
# Predict class labels for the held-out test samples
y_pred = xgb_model.predict(X=X_test_selected)

In [17]:
# Score the predictions against the held-out labels.
# Recall, precision and F1 are computed for the positive class, label 1
# (scikit-learn's default, made explicit here).
# NOTE(review): per data['target_names'], label 1 is 'benign' and label 0 is
# 'malignant', so these scalar scores describe the benign class -- confirm
# that is the intended positive class.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

In [18]:
# Show which feature columns were selected for the model
print(f"Selected features: {top_15_features}")

Selected features: Index(['worst concave points', 'mean concave points', 'worst perimeter',
       'worst radius', 'mean perimeter', 'worst area', 'mean radius',
       'mean area', 'mean concavity', 'worst concavity', 'mean compactness',
       'worst compactness', 'radius error', 'perimeter error', 'area error',
       'worst texture', 'worst symmetry', 'worst smoothness', 'mean texture',
       'concave points error'],
      dtype='object')


In [19]:
# Report the (rows, columns) shape of each selected-feature matrix
train_shape = X_train_selected.shape
test_shape = X_test_selected.shape
print(f"Training data shape: {train_shape}")
print(f"Testing data shape: {test_shape}")

Training data shape: (455, 20)
Testing data shape: (114, 20)


In [20]:
# Print the four scalar metrics, one per line
metric_lines = [
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    f"Precision: {precision}",
    f"F1 Score: {f1}",
]
print("\n".join(metric_lines))

Accuracy: 0.9649122807017544
Recall: 0.9859154929577465
Precision: 0.958904109589041
F1 Score: 0.9722222222222222


In [21]:
# Per-class precision / recall / F1 table for the test predictions
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [22]:
# Show the confusion matrix computed earlier under its heading
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[40  3]
 [ 1 70]]
