# 1. Libraries & Datasets

In [36]:
import os

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from scipy.stats import zscore

from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import dill
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset bundled with scikit-learn. With as_frame=True
# the 'data' and 'target' entries are pandas objects (concatenated further below).
df_cancer = load_breast_cancer(as_frame=True)

# --- Column roles ---
TARGET_LABEL = "target"
ID_LABELS = ['ID']

# --- Train/test split settings ---
RANDOM_STATE = 11
TRAIN_FRAC, TEST_FRAC = 0.8, 0.2

# --- Outlier treatment settings (used as the OutlierTreater defaults) ---
Z_SCORE_THRESHOLD = 3.5
FLOOR_PERCENTILE_THRESHOLD, CAP_PERCENTILE_THRESHOLD = 0.01, 0.99

# --- Logistic regression hyperparameters ---
# Recorded as the optimal values; the search that produced them is not part of this notebook.
best_penalty = 'l2'
best_C = 100

# --- Features the model is allowed to use ---
KEEP_FEATURES = [
    "worst perimeter",
    "worst concave points",
    "worst symmetry",
]

In [37]:
# Assemble a single frame: all feature columns followed by the target column.
df_extraction = pd.concat(
    [df_cancer[part] for part in ('data', 'target')],
    axis=1,
)
# Prepend a 1-based row identifier as the first column.
df_extraction.insert(loc=0, column='ID', value=range(1, len(df_extraction) + 1))
df_extraction

Unnamed: 0,ID,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,2,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,3,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,4,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,5,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,565,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,566,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,567,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,568,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


# 2. Split data into train and test sets

In [38]:
# Hold out TEST_FRAC of the rows for testing; the fixed seed makes the split repeatable.
df_extraction_train, df_extraction_test = train_test_split(
    df_extraction,
    test_size=TEST_FRAC,
    random_state=RANDOM_STATE,
)

# Report the shape and the mean of the target column for the full data and for each split.
print(f"Extraction size = {df_extraction.shape}. Target rate = {df_extraction[TARGET_LABEL].mean()}.")
print(f"Train size = {df_extraction_train.shape}. Target rate = {df_extraction_train[TARGET_LABEL].mean()}.")
print(f"Test size = {df_extraction_test.shape}. Target rate = {df_extraction_test[TARGET_LABEL].mean()}")

Extraction size = (569, 32). Target rate = 0.6274165202108963.
Train size = (455, 32). Target rate = 0.6175824175824176.
Test size = (114, 32). Target rate = 0.6666666666666666


# 3. Develop model (Logistic with 3 variables) pipeline

## 3.1. Custom transformers

In [39]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns named in ``feature_names``.

    Parameters
    ----------
    feature_names
        Column label(s) used to index ``X`` inside ``transform``.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return the selection ``X[self.feature_names]``; ``y`` is ignored."""
        selected = X[self.feature_names]
        return selected

In [40]:
# Custom transformer for outlier treatment
class OutlierTreater(BaseEstimator, TransformerMixin):
    """Clip columns to quantile-based floor/cap values learned during ``fit``.

    Columns are addressed by position: ``features[i]`` is the label attached to
    column ``i`` of ``X``. The labels are only used as keys of the fitted
    ``caps_`` / ``floors_`` dictionaries.

    Parameters
    ----------
    features
        Sequence of labels, one per leading column of ``X``.
    cap_threshold
        Quantile (0-1) used as the upper clipping bound.
    floor_threshold
        Quantile (0-1) used as the lower clipping bound.
    z_score_threshold
        A column whose largest absolute z-score reaches this value gets no
        bounds and is passed through ``transform`` unchanged.

    Attributes
    ----------
    caps_ : dict
        Upper bound per treated feature label (set by ``fit``).
    floors_ : dict
        Lower bound per treated feature label (set by ``fit``).
    """

    def __init__(self, features,
                 cap_threshold=CAP_PERCENTILE_THRESHOLD,
                 floor_threshold=FLOOR_PERCENTILE_THRESHOLD,
                 z_score_threshold=Z_SCORE_THRESHOLD):
        self.features = features
        self.cap_threshold = cap_threshold
        self.floor_threshold = floor_threshold
        self.z_score_threshold = z_score_threshold

    def fit(self, X, y=None):
        """Compute the floor/cap quantiles for every eligible column of ``X``.

        ``X`` may be a 2-D ndarray, a DataFrame or a nested sequence; it is
        viewed as an array so positional column indexing always works.
        Returns ``self``.
        """
        values = np.asarray(X)
        self.caps_ = {}
        self.floors_ = {}
        for feature_idx, feature in enumerate(self.features):
            column_values = values[:, feature_idx]
            max_z_score = np.max(np.abs(zscore(column_values)))
            if max_z_score >= self.z_score_threshold:
                # Existing rule, kept as-is: a feature whose maximum absolute
                # z-score reaches the threshold is excluded from treatment.
                # NOTE(review): this leaves the columns with the most extreme
                # values unclipped - confirm that this is the intended behavior.
                continue
            self.caps_[feature] = np.quantile(column_values, self.cap_threshold)
            self.floors_[feature] = np.quantile(column_values, self.floor_threshold)
        return self

    def transform(self, X):
        """Return a clipped copy of ``X`` as an ndarray; ``X`` itself is not modified.

        Only columns that received bounds in ``fit`` are clipped.
        """
        # Always work on a fresh array so DataFrame / list inputs are supported too.
        X_transformed = np.array(X, copy=True)
        for feature_idx, feature in enumerate(self.features):
            if feature in self.caps_ and feature in self.floors_:
                X_transformed[:, feature_idx] = np.clip(
                    X_transformed[:, feature_idx],
                    self.floors_[feature],
                    self.caps_[feature],
                )
        return X_transformed


## 3.2. Create pipeline

In [42]:
# Model pipeline: keep the selected features, mean-impute missing values,
# clip outliers, then fit a logistic regression (there is no scaling step).
processing_pipeline = Pipeline(steps=[
    ('selector', FeatureSelector(feature_names=KEEP_FEATURES)),
    ('imputer', SimpleImputer(strategy="mean")),
    (
        'outlier_treater',
        OutlierTreater(
            features=KEEP_FEATURES,
            cap_threshold=CAP_PERCENTILE_THRESHOLD,
            floor_threshold=FLOOR_PERCENTILE_THRESHOLD,
            z_score_threshold=Z_SCORE_THRESHOLD,
        ),
    ),
    ('logistic_regression', LogisticRegression(penalty=best_penalty, C=best_C)),
])

# The outer pipeline has the processing pipeline as its single step.
full_pipeline = Pipeline(steps=[("processing_pipeline", processing_pipeline)])

# Persist the still-unfitted pipeline object with dill.
with open("untrained_model_pipeline.dill", "wb") as untrained_file:
    dill.dump(full_pipeline, untrained_file)

## 3.3. Train pipeline

In [43]:
# Load the untrained pipeline saved earlier; the fit call below trains it in place.
# NOTE: dill.load can execute arbitrary code - only load files created by this notebook.
with open("untrained_model_pipeline.dill", "rb") as file:
    full_pipeline_trained = dill.load(file)

# Separate predictors from the identifier and target columns, using the
# column-role constants instead of repeating the literal column names.
non_feature_columns = ID_LABELS + [TARGET_LABEL]
x_train = df_extraction_train.drop(columns=non_feature_columns)
y_train = df_extraction_train[TARGET_LABEL].copy()
x_test = df_extraction_test.drop(columns=non_feature_columns)
y_test = df_extraction_test[TARGET_LABEL].copy()

# Train the pipeline on the training set
full_pipeline_trained.fit(x_train, y_train)

## 3.4. Save trained pipeline

In [44]:
# Serialize the fitted pipeline to disk with dill so it can be reloaded later.
with open("trained_model_pipeline.dill", mode="wb") as trained_file:
    dill.dump(full_pipeline_trained, trained_file)

# 4. Use model pipeline to see model performance

In [45]:
# Reload the fitted pipeline from disk.
# NOTE: dill.load can execute arbitrary code - only load files created by this notebook.
with open("trained_model_pipeline.dill", "rb") as file:
    model_pipeline = dill.load(file)

# Score the test set with the *reloaded* pipeline so that the saved artifact
# itself is what gets evaluated, not the object still held in memory.
y_pred = model_pipeline.predict(x_test)
y_score = model_pipeline.predict_proba(x_test)[:, 1]  # probability of class 1

# Calculate recall, precision, and AUROC
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_score)

print("Recall:", recall)
print("Precision:", precision)
print("AUROC:", auroc)

Recall: 0.9736842105263158
Precision: 0.9736842105263158
AUROC: 0.9979224376731302
