#### Breast Cancer Machine Learning Project

##### Importing libraries

In [75]:
import warnings

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier


#### Importing dataset into the DataFrame

In [3]:
# Read the raw dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data.csv')

#####  Understanding the data set

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
# Class balance check: number of rows per diagnosis label.
df.loc[:, 'diagnosis'].value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

In [6]:
# (number of rows, number of columns) of the loaded table.
df.shape

(569, 32)

In [7]:
# Per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


#### Data cleaning

In [9]:
# Total number of missing cells across the whole table.
df.isnull().sum().sum()

0

In [10]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

#### EDA

In [16]:
# Determine the pairwise correlation between features.
# DataFrame.corr() needs numeric data, and in notebook order this cell runs
# before the "diagnosis" column is encoded, so correlate against a 0/1 view of
# the label instead of the raw strings. The mapping accepts both the raw
# "M"/"B" labels and an already-encoded 1/0 column, and df itself is not modified.
diagnosis_as_number = df["diagnosis"].map({"M": 1, "B": 0, 1: 1, 0: 0})
df.assign(diagnosis=diagnosis_as_number).corr()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,1.0,0.039769,0.074626,0.09977,0.073159,0.096893,-0.012968,9.6e-05,0.05008,0.044158,...,0.082405,0.06472,0.079986,0.107187,0.010338,-0.002968,0.023203,0.035174,-0.044224,-0.029866
diagnosis,0.039769,1.0,0.730029,0.415185,0.742636,0.708984,0.35856,0.596534,0.69636,0.776614,...,0.776454,0.456903,0.782914,0.733825,0.421465,0.590998,0.65961,0.793566,0.416294,0.323872
radius_mean,0.074626,0.730029,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.09977,0.415185,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.073159,0.742636,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.096893,0.708984,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,-0.012968,0.35856,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,9.6e-05,0.596534,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.05008,0.69636,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
concave points_mean,0.044158,0.776614,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661


In [18]:
# List the column labels currently in df.
# NOTE(review): the saved output has no 'id' column, so this cell was last run
# after the cell that drops 'id' further down — the cell order and the
# execution order of this section disagree.
df.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [15]:
# Encode the diagnosis label: malignant ("M") -> 1, anything else -> 0.
# The guard makes the cell safe to re-run: without it, a second run would
# compare the already-encoded integers with "M" and set every label to 0.
if not pd.api.types.is_numeric_dtype(df["diagnosis"]):
    df["diagnosis"] = (df["diagnosis"] == "M").astype(int)


In [17]:
# Drop the record identifier, which is not a tumour measurement.
# errors="ignore" keeps the cell re-runnable once "id" has already been removed
# (columns= already implies the column axis, so no axis argument is needed).
df = df.drop(columns="id", errors="ignore")

##### Feature and target selection

In [19]:
# Target vector and predictor matrix.
y = df["diagnosis"]
X = df.drop(columns="diagnosis")

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

##### Define Feature Types

In [21]:
# Column labels grouped by dtype: numeric columns vs. everything else.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns


#### Preprocessing Blocks
#### Encoding and Scaling

In [26]:
# Numeric columns are standardised.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Non-numeric columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


In [41]:
# Lasso-based selection: keep the features that an L1-penalised logistic
# regression does not shrink away. The liblinear solver shuffles the data
# internally, so the seed is fixed to make the selected feature set reproducible.
lasso_selector = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear", random_state=42)
)

# Tree-based selection: keep the features a random forest ranks as important.
tree_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Recursive Feature Elimination (RFE), used by the KNN and SVM pipelines:
# repeatedly fits the estimator and discards the weakest feature until
# n_features_to_select remain.
rfe_selector = RFE(
    estimator=LogisticRegression(max_iter=500),
    n_features_to_select=10
)


##### Model Pipelines

In [28]:
# Each pipeline gets its own clone of the preprocessing / feature-selection
# templates. Pipeline does not copy the steps it is given, so handing the same
# instance to several pipelines makes them share fitted state: refitting one
# pipeline would silently replace the transformer another pipeline relies on.
# Estimators with internal randomness (liblinear, SVC probability calibration)
# are seeded so repeated runs give the same numbers.

# Ridge Classifier
ridge_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("feature_selection", clone(lasso_selector)),
    ("model", RidgeClassifier())
])

# Lasso (Logistic Regression with L1)
lasso_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        random_state=42
    ))
])

# Logistic Regression
logreg_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("feature_selection", clone(lasso_selector)),
    ("model", LogisticRegression(max_iter=500))
])

# Decision Tree Classifier. Trees need neither scaling nor feature selection;
# the preprocessing step is kept so every pipeline accepts the same raw input.
dt_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", DecisionTreeClassifier(random_state=42))
])

# KNN (distance-based, so scaling + RFE is critical)
knn_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("feature_selection", clone(rfe_selector)),
    ("model", KNeighborsClassifier(n_neighbors=5))
])

# Random Forest
rf_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("feature_selection", clone(tree_selector)),
    ("model", RandomForestClassifier(
        n_estimators=200,
        random_state=42
    ))
])

# XGBoost
xgb_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("model", XGBClassifier(
        eval_metric="logloss",
        random_state=42
    ))
])

# Support Vector Machine (SVM). probability=True enables predict_proba via an
# internal cross-validated calibration, which is why a seed is set.
svm_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("feature_selection", clone(rfe_selector)),
    ("model", SVC(probability=True, random_state=42))
])

##### Model training

In [31]:
# Display name -> pipeline, in the order the models are trained and reported.
pipelines = dict([
    ("Ridge", ridge_pipeline),
    ("Lasso", lasso_pipeline),
    ("Logistic", logreg_pipeline),
    ("Decision Tree", dt_pipeline),
    ("KNN", knn_pipeline),
    ("Random Forest", rf_pipeline),
    ("XGBoost", xgb_pipeline),
    ("SVM", svm_pipeline),
])

In [32]:
# Fit every pipeline on the training split and print its hold-out accuracy.
for model_name, estimator in pipelines.items():
    accuracy = estimator.fit(X_train, y_train).score(X_test, y_test)
    print(f"{model_name}: {accuracy:.4f}")

Ridge: 0.9737
Lasso: 0.9737
Logistic: 0.9825
Decision Tree: 0.9298
KNN: 0.9737
Random Forest: 0.9649
XGBoost: 0.9737
SVM: 0.9825


In [47]:
import warnings
warnings.filterwarnings("ignore")

### Getting probabilities from Pipelines

In [48]:
# Score every pipeline on the hold-out split.
#   ROC_AUC - computed from a continuous score: the positive-class probability
#             when the pipeline exposes predict_proba, otherwise the
#             decision_function margin.
#   RMSE    - root mean squared error between the 0/1 labels and the predicted
#             probability; NaN when only a decision margin is available.
# (roc_auc_score and mean_squared_error are imported in the imports cell.)
results = []

for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)

    # Case 1: models with predict_proba
    if hasattr(pipe, "predict_proba"):
        y_score = pipe.predict_proba(X_test)[:, 1]
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` argument was deprecated in scikit-learn 1.4 and removed in
        # 1.6, so the old call raises TypeError on current releases.
        rmse = float(np.sqrt(mean_squared_error(y_test, y_score)))

    # Case 2: models with decision_function only
    else:
        y_score = pipe.decision_function(X_test)
        rmse = np.nan  # a margin is not a probability, so RMSE is not meaningful

    results.append(
        {"Model": name, "ROC_AUC": roc_auc_score(y_test, y_score), "RMSE": rmse}
    )

results_df = pd.DataFrame(
    results, columns=["Model", "ROC_AUC", "RMSE"]
).sort_values("ROC_AUC", ascending=False)

results_df


Unnamed: 0,Model,ROC_AUC,RMSE
0,Ridge,0.997685,
2,Logistic,0.997024,0.145733
1,Lasso,0.996693,0.148629
7,SVM,0.996032,0.13537
6,XGBoost,0.994048,0.142386
5,Random Forest,0.993056,0.184116
4,KNN,0.98545,0.170654
3,Decision Tree,0.924603,0.264906


##### Feature Importance Comparison

In [69]:
# Treat every column of X as numeric from here on (this rebinds
# numeric_features), so the only preprocessing step is standardisation.
numeric_features = X.columns

preprocess = ColumnTransformer([("num", StandardScaler(), numeric_features)])


In [70]:
# Structure of the scale -> select -> classify pipeline. The object is only
# displayed here; it is neither assigned nor fitted.
# penalty="l1" needs a solver that supports it: the default lbfgs solver
# rejects an L1 penalty at fit time, so liblinear is requested explicitly.
Pipeline([
    ("preprocess", preprocess),
    ("select", SelectFromModel(
        LogisticRegression(penalty="l1", solver="liblinear")
    )),
    ("model", LogisticRegression())
])


In [72]:
# 1. Fit pipeline
logreg_pipeline.fit(X_train, y_train)

# 2. Access fitted objects.
#    logreg_pipeline registers its selection step as "feature_selection".
#    Looking it up under a name that does not exist returns None, the mask in
#    step 4 is skipped, and the full list of feature names no longer lines up
#    with the coefficients of the selected features. "select" is accepted as a
#    fallback for pipelines that use that step name.
#    The fitted preprocessor is kept under its own name so the global
#    `preprocessor` template is not rebound.
fitted_steps = logreg_pipeline.named_steps
fitted_preprocessor = fitted_steps["preprocess"]
selector = fitted_steps.get("feature_selection", fitted_steps.get("select"))
model = fitted_steps["model"]

# 3. Get feature names AFTER fitting
feature_names = fitted_preprocessor.get_feature_names_out()

# 4. Keep only the names of the features that survived selection (if any step)
if selector is not None:
    feature_names = feature_names[selector.get_support()]

# 5. Build coefficient table, largest absolute coefficient first
coef_df = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": model.coef_.ravel()
}).sort_values("Coefficient", key=abs, ascending=False)

coef_df.head(10)


Unnamed: 0,Feature,Coefficient
1,num__radius_se,2.052295
8,num__texture_worst,1.789196
7,num__radius_worst,1.737701
9,num__area_worst,1.64409
0,num__concave points_mean,1.317916
11,num__concavity_worst,0.944979
4,num__compactness_se,-0.846257
12,num__concave points_worst,0.662318
6,num__fractal_dimension_se,-0.643816
13,num__symmetry_worst,0.614678


##### Findings:
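Tumor size, growth instability, and boundary irregularity are the strongest predictors of malignancy: the largest positive coefficients belong to size features (`radius_se`, `radius_worst`, `area_worst`), `texture_worst`, and concavity features (`concave points_mean`, `concavity_worst`). The only negative coefficients in the top ten are `compactness_se` and `fractal_dimension_se`, which are associated with benign tumors once the other features are accounted for. `symmetry_worst` has a positive coefficient, so this table does not support reading symmetry as protective.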
- Positive coefficient → increases malignancy risk

- Negative coefficient → protective / benign association

##### Cross-validation with ROC-AUC. Important because a single train/test split is not enough to judge how stable a model's performance is

#### Validation

In [74]:
# 5-fold stratified cross-validation on the training split, shuffled with a
# fixed seed so the folds are the same for every model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One row per model: name, mean ROC-AUC over the folds, and its spread.
cv_results = []
for model_name, estimator in pipelines.items():
    fold_auc = cross_val_score(
        estimator, X_train, y_train, scoring="roc_auc", cv=cv
    )
    cv_results.append([model_name, fold_auc.mean(), fold_auc.std()])

cv_df = pd.DataFrame(cv_results, columns=["Model", "Mean ROC_AUC", "Std"])
cv_df = cv_df.sort_values("Mean ROC_AUC", ascending=False)

cv_df

Unnamed: 0,Model,Mean ROC_AUC,Std
1,Lasso,0.995872,0.004102
2,Logistic,0.99515,0.004312
7,SVM,0.993498,0.005724
6,XGBoost,0.99226,0.004342
0,Ridge,0.991125,0.006221
4,KNN,0.990712,0.008433
5,Random Forest,0.982817,0.010768
3,Decision Tree,0.917234,0.037542


##### Model Insight

| Model | Clinical Value |
| --- | --- |
| Logistic | Explainable, good for screening |
| Random Forest | Captures non-linear tumor patterns |
| XGBoost | Strong discrimination, risk stratification |
| SVM | Margin-based separation, strong generalization |

### Best overall model (accuracy + stability + interpretability):
**Lasso (L1-regularized logistic regression)**
- Best cross-validated ROC-AUC
- Very stable
- Excellent test-set performance
- Sparse

Strong alternatives:

- Logistic Regression → almost identical performance

- SVM → lowest test-set RMSE of predicted probabilities (0.135), which suggests the best-calibrated probabilities

In [83]:
# Export the SVM pipeline's positive-class probabilities for the test split.
# This is the array the loop variable y_score was left holding after the
# evaluation loop (SVM is the last entry in `pipelines`). It is computed
# explicitly here so the cell does not depend on a leaked loop variable, and
# stored under its own name so the dataset held in `df` is not overwritten.
test_scores = pd.DataFrame(pipelines["SVM"].predict_proba(X_test)[:, 1])

# Save to CSV
test_scores.to_csv('test.csv', index=False)
