In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import set_config
from sklearn.metrics import roc_auc_score , accuracy_score , precision_score , f1_score , recall_score
from sklearn.model_selection  import train_test_split , StratifiedKFold , RandomizedSearchCV
from sklearn.pipeline import Pipeline
from  sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

set_config(display = 'diagram')
warnings.filterwarnings('ignore')


In [None]:
# Banner cell: prints a one-line announcement of the dataset used in this notebook.
print("Well be working on Breast Caner prediciton dataset : ")

In [None]:
# Location of the breast-cancer CSV. The original machine-specific path stays
# as the default so existing runs are unaffected; set the BREAST_CANCER_CSV
# environment variable to load the file from somewhere else.
DATA_PATH = os.environ.get(
    "BREAST_CANCER_CSV",
    "/Users/adityaverma/Downloads/breast-cancer.csv",
)

# `data` keeps the frame exactly as read from disk; `df` is the working copy
# that the following cells modify.
data = pd.read_csv(DATA_PATH)
df = data.copy()

# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Prints a plain-text reference table: one row per feature column, grouped
# into the *_mean, *_se and *_worst families, with a short description each.
print('''   

        Breast Cancer Feature Reference Guide
=====================================

Column Name             | Description
------------------------|-----------------------------------------
radius_mean             | Radius of lobes (mean)
texture_mean            | Surface texture (mean)
perimeter_mean          | Outer perimeter of lobes (mean)
area_mean               | Area of lobes (mean)
smoothness_mean         | Smoothness levels (mean)
compactness_mean        | Compactness (mean)
concavity_mean          | Concavity (mean)
concave points_mean     | Concave points (mean)
symmetry_mean           | Symmetry (mean)
fractal_dimension_mean  | Fractal dimension (mean)

radius_se               | Standard error of radius
texture_se              | Standard error of texture
perimeter_se            | Standard error of perimeter
area_se                 | Standard error of area
smoothness_se           | Standard error of smoothness
compactness_se          | Standard error of compactness
concavity_se            | Standard error of concavity
concave points_se       | Standard error of concave points
symmetry_se             | Standard error of symmetry
fractal_dimension_se    | Standard error of fractal dimension

radius_worst            | Worst (largest) radius
texture_worst           | Worst texture
perimeter_worst         | Worst perimeter
area_worst              | Worst area
smoothness_worst        | Worst smoothness
compactness_worst       | Worst compactness
concavity_worst         | Worst concavity
concave points_worst    | Worst concave points
symmetry_worst          | Worst symmetry
fractal_dimension_worst | Worst fractal dimension



''')

## 🩺 **Breast Cancer Overview**

Breast cancer is the **most common cancer among women worldwide**.  
It accounts for **~25%** of all cancer cases and affected over **2.1 million people (2015)**.

It begins when breast cells grow **uncontrollably**, forming tumors that can be:
- Detected on **X-rays**, or  
- Felt as **lumps** in the breast

---

## 🎯 **Detection Challenge**

A major challenge in breast cancer diagnosis is **classifying tumors** into:

- **Malignant (M)** — `cancerous`  
- **Benign (B)** — `non-cancerous`  

Accurate classification helps determine:
- Treatment paths  
- Early intervention  
- Survival outcomes  

## 🤖 **Objective**

We aim to **build a machine learning model, using boosting algorithms for better accuracy,** to classify tumors using the  
**Breast Cancer Wisconsin (Diagnostic) Dataset**.

This dataset includes numeric features extracted from digitized images of fine needle aspirates (FNA) of breast masses.

---

## 📌 **Target Variable**

- `diagnosis`  
  - **M** → Malignant  
  - **B** → Benign  


In [None]:
# Keep an independent snapshot of the frame. `copy` has to be *called*:
# without the parentheses `df1` is bound to the method object, not a DataFrame.
df1 = df.copy()

In [None]:
# Report the dataset dimensions (rows first, then columns).
n_rows, n_cols = df.shape
print(f"The shape of the dataset is : {n_rows} rows and {n_cols} columns")

In [None]:
def summary(df):
    """Build a per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df``, with the null count, the null
        percentage, the number of distinct values and the dtype of each column.
    """
    # Count nulls once and reuse the result for the percentage column.
    null_counts = df.isnull().sum()
    overview = {
        'Total_Nulls': null_counts,
        'Total_null_percent(%)': (null_counts / len(df)) * 100,
        'Total.unique': df.nunique(),
        'dtytpes': df.dtypes,
    }
    return pd.DataFrame(overview)
# Display the per-column overview for the loaded dataset.
summary(df)

In [None]:
# Print pandas' concise structural summary of the frame.
df.info()

In [None]:
# Descriptive statistics from `describe()`, rounded to three decimals.
df.describe().round(3)

In [None]:
# List the column labels of the frame.
df.columns


In [None]:
# Remove the `id` column. Reassigning instead of using `inplace=True`, combined
# with `errors='ignore'`, keeps the cell idempotent: running it again after
# `id` is already gone no longer raises a KeyError.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Seven pink shades, ordered from the palest to the most saturated.
pink_palette = [
    '#ffe6f0', '#ffbfd6', '#ff99bd', '#ff73a3',
    '#ff4d8a', '#ff2670', '#ff0057',
]

# Render the swatches so the palette can be checked visually.
sns.palplot(pink_palette)
plt.title(" Color Palette for the Breast Cancer chart")
plt.show()

In [None]:
# Encode the target as integers: 'M' -> 1 and 'B' -> 0.
# The dtype guard makes the cell safe to re-run: mapping a column that is
# already 0/1 through the same dictionary would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M':1 , 'B':0})

In [None]:
# Inspect the target column.
df['diagnosis']

In [None]:

# Two-colour palette for the class plots: the second and the last pink shade.
class_bal = [pink_palette[position] for position in (1, -1)]
class_bal

In [None]:
# Work on a copy so the integer-coded `diagnosis` column in `df` is untouched.
df_plot = df.copy()
df_plot['diagnosis_label'] = df_plot['diagnosis'].replace({1:'M', 0:'B'})

# Two-column frame: class label and how often it occurs.
count_df = (
    df_plot['diagnosis_label']
    .value_counts()
    .reset_index()
    .set_axis(['Class', 'Count'], axis=1)
)

# hover_data is left at None; any columns listed there would have to exist in count_df.
fig = px.bar(
    count_df,
    x='Class',
    y='Count',
    color='Class',
    title='Class Balance Ratio',
    hover_data=None,
)

fig.show()


In [None]:
# List the current column labels of the frame.
df.columns

In [None]:
# Box plots, one per feature, for the first ten non-target columns,
# split and coloured by the `diagnosis` value.
colors = ['#FFB6C1', '#FF69B4']

feature_cols = df.drop('diagnosis' , axis = 1).columns
for charts in feature_cols[:10]:
    fig = px.box(
        df,
        x='diagnosis',
        y=charts,
        color='diagnosis',
        color_discrete_sequence=colors,
        title=f"Diagnosis relation for {charts}",
    )
    fig.show()


In [None]:
# Box plots, one per feature, for non-target columns 10-19,
# split and coloured by the `diagnosis` value.
colors = ['#EBD4FF', '#6A0DAD']

feature_cols = df.drop('diagnosis' , axis = 1).columns
for charts in feature_cols[10:20]:
    fig = px.box(
        df,
        x='diagnosis',
        y=charts,
        color='diagnosis',
        color_discrete_sequence=colors,
        title=f"Diagnosis relation for {charts}",
    )
    fig.show()


In [None]:
# Box plots, one per feature, for non-target columns 20-29,
# split and coloured by the `diagnosis` value.
colors = ['#FFB5A7', '#D00000']

feature_cols = df.drop('diagnosis' , axis = 1).columns
for charts in feature_cols[20:30]:
    fig = px.box(
        df,
        x='diagnosis',
        y=charts,
        color='diagnosis',
        color_discrete_sequence=colors,
        title=f"Diagnosis relation for {charts}",
    )
    fig.show()

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for the target plus
# seven of the "*_mean" features, coloured by diagnosis.
pair_cols = [
    'diagnosis',
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
]
features = df[pair_cols]
fea = pd.DataFrame(features)

sns.pairplot(
    fea,
    hue='diagnosis',
    palette=class_bal,
    dropna=True,
    kind='scatter',
    diag_kind='hist',
)
plt.show()

# **Summary**

### From the above analysis of the most important features, we can derive several meaningful insights:

---

### **1) Malignant nuclei show high irregularity and deep indentations**  
Malignant cell nuclei tend to be *less smooth* and display a **significant degree of indentation, jaggedness, and irregularity** along their boundaries.  
Benign nuclei appear smoother, rounder, or oval due to their organized and controlled growth patterns.

---

### **2) Malignancy is not always associated with larger nuclear size**  
Although malignant nuclei often have larger radius, perimeter, and area, the scatterplots reveal noticeable overlap with benign nuclei.  
This means that **even small “blobs” with low concavity or perimeter can still be malignant**, making early detection challenging if relying solely on size.

---

### **3) Malignant nuclei exhibit higher compactness due to shape distortion**  
Higher compactness values in malignant nuclei indicate **less circular, more distorted shapes**, not densely packed tissue.  
This structural irregularity contributes to the harder, uneven texture often associated with malignant regions.

---

### **4) Early-stage malignant nuclei can resemble benign ones**  
In early stages, malignant nuclei may appear with **small radius, low perimeter, and mild texture**, closely mimicking benign characteristics.  
This highlights the importance of analyzing **multiple morphological features**, not just size, since small malignant “blobs” may appear benign before developing more aggressive traits.

---


In [None]:
# Separate the target vector from the feature matrix.
y = df['diagnosis']
X = df.drop(columns='diagnosis')


In [None]:
# Stratified split with 70% of the rows for training and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)

In [None]:
# Baseline XGBoost classifier with fixed hyper-parameters, fitted on the training split.
baseline_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
)
model = XGBClassifier(**baseline_params)
model.fit(X_train , y_train)



In [None]:
# Predictions of the fitted baseline model on the test split.
pred = model.predict(X_test)

In [None]:
!pip install shap 


In [None]:
import shap

In [None]:
# Explain the fitted tree model: compute SHAP values for every training row...
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

# ...and draw the summary plot of those values against the training features.
shap.summary_plot(shap_values,X_train)

In [None]:
# Inspect individual samples to see the model's decision making in more detail.
# Load SHAP's JavaScript so the interactive plots can render in the notebook.

shap.initjs()

In [None]:
# Force plots for individual rows (positions 1-9).
# `shap_values` was computed on `X_train`, so the feature values shown next to
# each explanation must come from the same position in `X_train`. Indexing the
# unsplit `X` would pair the SHAP values with a different sample, because
# `train_test_split` shuffles the rows.
for sample in range(1,10):
    display(shap.force_plot(explainer.expected_value,
                            shap_values[sample],
                            X_train.iloc[sample]))
    
            

In [None]:
# Estimator handed to the hyper-parameter search: a pipeline that standardises
# the features and then fits the XGBoost classifier.
# The search space names its parameters `model__<param>`; scikit-learn resolves
# that prefix to a pipeline step called "model", so the classifier has to be
# wrapped in a Pipeline. A bare XGBClassifier would reject those names.
# PCA is not imported in this notebook, so scaling is the only preprocessing step.
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(random_state = 42 ,
                            n_jobs = -1)),
])


In [None]:
# Candidate values sampled by the randomized search. Every key carries the
# `model__` prefix, i.e. it addresses a pipeline step named "model".
# Each key appears exactly once: a repeated key in a dict literal silently
# discards the earlier value, so only the gamma list below was ever in effect.
param_grid = {
    'model__n_estimators': [100, 200, 300 , 500],           # Number of boosting rounds (trees)
    'model__learning_rate': [0.01, 0.05, 0.1 , 0.2 , 0.3],  # Step size shrinkage to prevent overfitting
    'model__max_depth': [3, 5, 7 , 9],                      # Maximum depth of a tree
    'model__subsample': [0.5 , 0.6, 0.8, 1.0],              # Fraction of samples used for fitting each tree
    'model__colsample_bytree': [0.5 , 0.6, 0.8, 1.0],       # Fraction of features used for fitting each tree
    'model__gamma': [0, 0.1,0.3, 0.5],                      # Minimum loss reduction required to make a further partition
    'model__reg_alpha': [0, 0.1, 0.5],                      # L1 regularization term on weights
    'model__reg_lambda': [1, 1.5, 2]                        # L2 regularization term on weights
}



In [None]:

# Stratified 5-fold splitter (shuffled, seeded) used as the `cv` of the search.
CV = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)

# Randomized search: 10 sampled candidates scored by ROC AUC.
# `random_state` fixes which candidates are drawn, so re-runs are comparable.
rand_model = RandomizedSearchCV(estimator = model ,
                                param_distributions = param_grid ,
                                n_iter = 10 , scoring = 'roc_auc' ,
                                cv = CV ,
                                verbose = 2 ,
                                n_jobs = -1 ,
                                random_state = 42)


In [None]:
# Run the search on the training split, then report the winning configuration
# and its cross-validated score.
rand_model.fit(X_train , y_train)

report_lines = [
    f"The best params are :  {rand_model.best_params_}",
    "-----------------------------",
    f"The best score is : {rand_model.best_score_}",
]
print("\n".join(report_lines))




In [None]:

# Best estimator found by the search.
opt_model = rand_model.best_estimator_

# Predicted labels and the probability of class 1 on the test split,
# both taken from that same estimator.
pred_mod = opt_model.predict(X_test)
y_proba = opt_model.predict_proba(X_test)[:, 1]

In [None]:
print(''' Printing the Evaluation metrics for the model: 
----------------------------------------------------------------------------------------------- ''')
def evlauation_metrics(y_proba,pred_mod, y_test):
    """Print and return the classification metrics for one set of predictions.

    Parameters
    ----------
    y_proba : array-like
        Scores for the positive class, used for ROC AUC.
    pred_mod : array-like
        Predicted labels, used for accuracy, precision, F1 and recall.
    y_test : array-like
        True labels.

    Returns
    -------
    dict
        Metric name -> value, with the keys ``roc_auc``, ``accuracy``,
        ``precision``, ``f1`` and ``recall``.
    """
    # Compute every score once so the printed and the returned values agree.
    scores = {
        'roc_auc': roc_auc_score(y_test, y_proba),
        'accuracy': accuracy_score(y_test, pred_mod),
        'precision': precision_score(y_test, pred_mod),
        'f1': f1_score(y_test, pred_mod),
        'recall': recall_score(y_test, pred_mod),
    }
    print(f" roc auc score : {scores['roc_auc']:.4f}")
    print(f"acuracy score :{scores['accuracy']:.4f}")
    print(f"precision : {scores['precision']:.4f}")
    print(f"F1 score : {scores['f1']:.4f}")
    print(f"Recall : {scores['recall']:.4f}")
    return scores

# The trailing semicolon stops the notebook from echoing the returned dict
# underneath the printed report.
evlauation_metrics(y_proba,pred_mod, y_test);

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(y_test , pred_mod)
cm

In [None]:
# Annotated heatmap of the confusion matrix.
# The palette is passed as `cmap`, the heatmap argument that controls the cell
# colours; `heatmap` has no `color` parameter of its own.
ax = sns.heatmap(cm , annot = True , fmt = 'd' ,
                 cmap = sns.color_palette(["#F2E6FF", "#A64DFF", "#5A189A"]))
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.title("Heatmap for model's performative accuracy over the SHAP'd top features")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# ROC curve: false-positive rate against true-positive rate, with the area
# under the curve shown in the legend and the diagonal as a reference line.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6,5))
roc_ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}", color='#E6399B', linewidth=3)
roc_ax.plot([0,1],[0,1],'k--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate (Recall)")
roc_ax.set_title("ROC Curve - Breast Cancer Classifier")
roc_ax.legend()
plt.show()

# Precision-recall curve computed from the same scores.
precision, recall, _ = precision_recall_curve(y_test, y_proba)

pr_fig, pr_ax = plt.subplots(figsize=(6,5))
pr_ax.plot(recall, precision, color='#C2185B', linewidth=3)
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve")
plt.show()
