In [1]:
"""Dataset Choice (Simple & Reliable)

We’ll use a classification dataset (because it’s easier to demo end-to-end):

✅ Dataset: Breast Cancer Dataset (sklearn)

Why:
Clean
No download issues
Binary classification
Perfect for pipelines

Simulate a data ingestion step, like in a real ML system.
"""

'Dataset Choice (Simple & Reliable)\n\nWe’ll use a classification dataset (because it’s easier to demo end-to-end):\n\n✅ Dataset: Breast Cancer Dataset (sklearn)\n\nWhy:\nClean\nNo download issues\nBinary classification\nPerfect for pipelines\n\nSimulate a data ingestion step, like in a real ML system.\n'

In [2]:
# Step 1 - import libraries and load the dataset
import pandas as pd
from sklearn.datasets import load_breast_cancer

# as_frame=True makes the loader return pandas objects; `.frame` is the single
# DataFrame that is inspected and split in the following cells.
data = load_breast_cancer(as_frame=True)
df = data.frame
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
# Check the frame's dimensions as (rows, columns) before doing anything else with it.
df.shape

(569, 31)

In [4]:
# List the column names to see which features exist and what the label column is called.
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [5]:
# Step 2 - separate the features (X) from the target (y)
# X keeps every column except the label; y is the label column on its own.
X = df.drop(columns=["target"])
y = df["target"]

In [7]:
# Preview the feature matrix; the "target" column was dropped when X was built.
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
# Preview the label series that the model will learn to predict.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32

In [9]:
# Step 3 - train/test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified - consider stratify=y if the class
# balance of the test set matters for the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Check the split sizes: with test_size=0.2 roughly 80% of the rows go to training.
X_train.shape, X_test.shape

((455, 30), (114, 30))

In [11]:
# The label splits must have the same number of rows as their feature splits.
y_train.shape, y_test.shape

((455,), (114,))

In [13]:
# Step 4 - build an ML pipeline (scaling + model)
#
# Now (pipeline way):
#     pipeline.fit(X_train, y_train)
#     pipeline.predict(X_test)
#
# ✔ Clean
# ✔ Safe
# ✔ Reusable
# ✔ Production-ready

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Create the pipeline:
# "First scale the data, then train a Logistic Regression model."
#
# Inside the pipeline:
#   1. StandardScaler
#   2. LogisticRegression
# And they will always run in this order.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Train the pipeline: the scaler is fitted on the training data only, then the
# model is fitted on the scaled training data.
pipeline.fit(X_train, y_train)

# Real-World Interpretation
#
# In production:
#   - This pipeline object is saved
#   - Loaded later
#   - Used on new incoming data
#   - Same steps, same order, no mistakes
#
#     Fit the pipeline once, use it everywhere.


'Real-World Interpretation\n\nIn production:\n\nThis pipeline object is saved\nLoaded later\nUsed on new incoming data\nSame steps, same order, no mistakes\n\n    Fit the pipeline once, use it everywhere.\n\n'

In [14]:
# Step 5 - make predictions using the pipeline
#
# Goal of Step 5:
# Use the trained pipeline to make predictions on unseen data.
#
# No manual scaling.
# No extra steps.
# Just predict.

# Step 5.1 - predict classes (0/1)
#
# What this does (VERY EASY):
# Behind the scenes, the pipeline automatically:
#   1. Scales X_test using the same scaler learned from training data
#   2. Feeds scaled data to Logistic Regression
#   3. Outputs predictions (0 or 1)
y_pred = pipeline.predict(X_test)

# Show only the first few predictions instead of dumping the whole array.
y_pred[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1])

In [15]:
"""Real-World Interpretation

In production:
New patient data comes in
You call:
pipeline.predict(new_data)

You get predictions safely

This is exactly how ML APIs work.

    Predict using the pipeline, not the model alone.

"""

# Step 6 - evaluate the pipeline

# Step 6.1 - confusion matrix
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm



array([[41,  2],
       [ 1, 70]], dtype=int64)

In [16]:
# Step 6.2 - accuracy
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9736842105263158

In [17]:
# Step 6.3 - Precision, Recall, F1-score
#
# This shows Precision, Recall and F1-score for both classes (0 and 1).
#
# | Metric    | Meaning                                   |
# | --------- | ----------------------------------------- |
# | Accuracy  | Overall correctness                       |
# | Precision | How many predicted positives were correct |
# | Recall    | How many actual positives were found      |
# | F1        | Balance of precision & recall             |
from sklearn.metrics import classification_report

# The report is a multi-line string, so it is printed to render its line breaks.
# The explanation above lives in comments (not a trailing string literal) so the
# cell's only output is the report itself.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [19]:
# Step 6.4 - ROC AUC
# Measure how well the model separates classes across all thresholds.
from sklearn.metrics import roc_auc_score

# ROC AUC is computed from scores rather than hard labels, so take the second
# predict_proba column (the probability assigned to class 1).
y_prob = pipeline.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob)
roc_auc

0.99737962659679

In [20]:
# Save pipeline
import joblib

# This creates a file: cancer_pipeline.joblib
#
# 👉 This file contains:
#   - Scaler
#   - Logistic Regression model
#   - All learned parameters
#
# The dump call is kept as the last expression so the cell shows the list of
# written file names rather than an explanatory string.
joblib.dump(pipeline, "cancer_pipeline.joblib")

['cancer_pipeline.joblib']

In [21]:
# Load the pipeline back from the file written by the save step.
# NOTE(review): joblib files are pickle-based, so loading one can execute arbitrary
# code - only load artifacts that come from a trusted source.
loaded_pipeline = joblib.load("cancer_pipeline.joblib")


In [22]:
# Predict using the loaded pipeline

# Round-trip check: the reloaded pipeline must reproduce the in-memory pipeline's
# predictions on the full test set.
assert (loaded_pipeline.predict(X_test) == pipeline.predict(X_test)).all()

# .iloc makes the positional "first five rows" selection explicit instead of
# relying on DataFrame.__getitem__ treating a slice as a row selection.
loaded_pipeline.predict(X_test.iloc[:5])


array([1, 0, 0, 1, 1])