# Loading Data

In [19]:
# Importing Pandas and Numpy
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

# Loading the breast cancer dataset from Sklearn as a single DataFrame
data = load_breast_cancer(as_frame=True).frame

# Keeping only the first ten feature columns and attaching the target column
df_base_table = data.iloc[:, :10].assign(target=data['target'])

df_base_table.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0


In [20]:
# Absolute number of rows per target class
df_base_table['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [21]:
# Share of rows per target class (normalize=True returns proportions instead of counts)
df_base_table['target'].value_counts(normalize=True)

target
1    0.627417
0    0.372583
Name: proportion, dtype: float64

# Adding Categorical Columns

In [22]:
# Candidate values used to create the synthetic categorical columns
cities_list = [
    'sao paulo', 'rio de janeiro', 'brasilia', 'salvador',
    'fortaleza', 'curitiba', 'cuiaba', 'goiania',
    'manaus', 'palmas', 'vitoria', 'aracaju',
]
tumor_sizes = ['large', 'medium', 'small']

# One random draw is needed per row of the base table
n_rows = len(df_base_table)

# Seeded generator so the random assignments are reproducible
rng = np.random.default_rng(seed=42)

# The city column is drawn first and the tumor size column second;
# both draws consume the same generator, so this order is kept fixed
df_base_table['city'] = rng.choice(cities_list, size=n_rows)
df_base_table['tumor size'] = rng.choice(tumor_sizes, size=n_rows)

df_base_table.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,target,city,tumor size
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,0,rio de janeiro,medium
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0,palmas,small
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0,goiania,medium
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0,curitiba,large
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0,curitiba,large


# Train-Test Split

In [23]:
# Importing train_test_split function
from sklearn.model_selection import train_test_split

# Separating the target from the feature columns
y = df_base_table['target']
X = df_base_table.drop(columns=['target'])

# Holding out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Building Pipeline steps

In [24]:
# ================================================================
# Step 1. Importing libraries
# ================================================================
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.imputation import ArbitraryNumberImputer, CategoricalImputer
from feature_engine.selection import DropFeatures, DropCorrelatedFeatures
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder

# ================================================================
# Step 2. Column groups, one list per preprocessing treatment
# ================================================================
# Columns removed from the feature set
cols_to_drop = ['mean fractal dimension']

# Columns whose missing values are filled with 0
cols_fillna_0 = ['mean symmetry']

# Columns whose missing values are filled with 1
cols_fillna_1 = ['mean radius']

# Columns whose missing values are filled with -999
cols_fillna_neg999 = ['mean perimeter']

# Columns whose missing values are filled with the string "missing"
cols_fillna_missing = ['city', 'tumor size']

# Columns encoded with OneHotEncoder
cols_ohe = ['tumor size']

# Columns encoded with CountFrequencyEncoder
cols_freq_encode = ['city']

# Columns rescaled with MinMaxScaler
cols_minmax = ['mean radius', 'mean texture', 'mean perimeter', 'mean area']

# Columns standardized with StandardScaler
cols_std_scaler = [
    'mean smoothness', 'mean compactness', 'mean concavity',
    'mean concave points', 'mean symmetry',
]

# ==============================================
# Step 3. Declaring the pipeline steps, grouped by stage
# ==============================================
# Column removal and missing-value imputation
cleaning_steps = [
    ('drop_cols', DropFeatures(features_to_drop=cols_to_drop)),
    ('fillna_0', ArbitraryNumberImputer(arbitrary_number=0, variables=cols_fillna_0)),
    ('fillna_1', ArbitraryNumberImputer(arbitrary_number=1, variables=cols_fillna_1)),
    ('fillna_neg999', ArbitraryNumberImputer(arbitrary_number=-999, variables=cols_fillna_neg999)),
    ('fillna_missing', CategoricalImputer(fill_value='missing', variables=cols_fillna_missing)),
]

# Categorical encoding
encoding_steps = [
    ('ohe', OneHotEncoder(variables=cols_ohe)),
    ('freq_encode', CountFrequencyEncoder(encoding_method='frequency', variables=cols_freq_encode)),
]

# Numerical rescaling
scaling_steps = [
    ('minmax_scaler', SklearnTransformerWrapper(MinMaxScaler(), variables=cols_minmax)),
    ('std_scaler', SklearnTransformerWrapper(StandardScaler(), variables=cols_std_scaler)),
]

# Correlation filter followed by univariate feature selection
selection_steps = [
    ('drop_corr_feat', DropCorrelatedFeatures(method='pearson', threshold=0.8)),
    ('feature_selection', SklearnTransformerWrapper(SelectKBest(k=5))),
]

# Oversampling of the minority class and the final classifier
modelling_steps = [
    ('oversampling', RandomOverSampler(sampling_strategy='minority', random_state=42)),
    ('decision_tree', DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)),
]

# Full ordered list of steps; despite its name it also holds the oversampler and the model
preprocess_steps = [
    *cleaning_steps,
    *encoding_steps,
    *scaling_steps,
    *selection_steps,
    *modelling_steps,
]

# ==============================================
# Step 4. Instantiating the Pipeline from the declared steps
# ==============================================
complex_pipeline = Pipeline(preprocess_steps)

In [25]:
# Displaying the assembled pipeline to inspect its steps
complex_pipeline

In [26]:
# Fitting every step of the pipeline on the training data, then predicting the test labels
y_pred = complex_pipeline.fit(X_train, y_train).predict(X_test)

# Class probabilities for the same test rows
y_pred_proba = complex_pipeline.predict_proba(X_test)

# Checking the transformed DataFrame

In [27]:
# Every step except the final classifier: preprocessing followed by the oversampler
resampling_pipeline = complex_pipeline[:-1]

# Fitting those steps on the training data and collecting the transformed, resampled result
X_train_resampled, y_train_resampled = resampling_pipeline.fit_resample(X_train, y_train)

In [28]:
# Displaying the transformed and resampled training features
X_train_resampled

Unnamed: 0,mean radius,mean texture,mean smoothness,mean compactness,mean symmetry
0,0.158029,0.224552,-0.135940,-1.008718,0.281062
1,0.624686,0.332432,1.274468,0.842288,-0.293045
2,0.174121,0.183970,-0.613515,-1.138154,0.434395
3,0.311373,0.141698,0.664482,0.286762,0.555635
4,0.207724,0.310450,-0.672282,-1.006099,0.737495
...,...,...,...,...,...
565,0.643618,0.420358,-0.391837,0.578554,-0.146844
566,0.284869,0.409537,2.315909,1.670902,1.939198
567,0.550381,0.356442,-0.131477,0.051084,-0.389324
568,0.606228,0.521136,0.396683,1.029334,1.411447


In [29]:
# Checking the class balance of the target after oversampling
y_train_resampled.value_counts()

target
1    285
0    285
Name: count, dtype: int64

# Cross-Validation with Pipeline

In [30]:
# Importing Library
from sklearn.model_selection import cross_validate

# Metrics evaluated on each of the 5 cross-validation folds
cv_scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Running cross-validation of the whole pipeline on the training data
cv_metrics = cross_validate(
    complex_pipeline,
    X_train,
    y_train,
    scoring=cv_scoring,
    cv=5,
)

# One-row DataFrame holding the average of each metric across the folds
df_cv_metrics = pd.DataFrame({
    metric: [cv_metrics[f'test_{metric}'].mean()] for metric in cv_scoring
})

df_cv_metrics

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
0,0.896703,0.924035,0.912281,0.917096,0.933953


# Fine-Tuning

In [31]:
# Importing RandomizedSearchCV from Sklearn
from sklearn.model_selection import RandomizedSearchCV

# Dict with the candidate values for each hyperparameter.
# Keys are written as <pipeline step name>__<parameter>; the feature selection
# key goes through the wrapper's `transformer` to reach the `k` of SelectKBest.
param_distributions_dict = {
    'feature_selection__transformer__k': [5, 7, 9],
    'decision_tree__max_depth': [3, 5, 7],
    'decision_tree__min_samples_split': [2, 5, 10],
    'decision_tree__min_samples_leaf': [1, 5, 10]
    }

# Instantiating the random_search object: candidates are scored by F1 with
# 5-fold cross-validation; random_state fixes which combinations are sampled
# and verbose=2 prints one log line per fit
random_search = RandomizedSearchCV(
    estimator=complex_pipeline,
    param_distributions=param_distributions_dict,
    scoring='f1',
    cv=5,
    random_state=42,
    verbose=2
)

# Fine-tuning: running the search on the training data
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END decision_tree__max_depth=5, decision_tree__min_samples_leaf=1, decision_tree__min_samples_split=5, feature_selection__transformer__k=5; total time=   0.0s
[CV] END decision_tree__max_depth=5, decision_tree__min_samples_leaf=1, decision_tree__min_samples_split=5, feature_selection__transformer__k=5; total time=   0.0s
[CV] END decision_tree__max_depth=5, decision_tree__min_samples_leaf=1, decision_tree__min_samples_split=5, feature_selection__transformer__k=5; total time=   0.0s
[CV] END decision_tree__max_depth=5, decision_tree__min_samples_leaf=1, decision_tree__min_samples_split=5, feature_selection__transformer__k=5; total time=   0.0s
[CV] END decision_tree__max_depth=5, decision_tree__min_samples_leaf=1, decision_tree__min_samples_split=5, feature_selection__transformer__k=5; total time=   0.0s
[CV] END decision_tree__max_depth=3, decision_tree__min_samples_leaf=1, decision_tree__min_samples_split=2, feature_sel

In [32]:
# Inspecting the best pipeline found by the randomized search
random_search.best_estimator_

In [33]:
# Predicting the test set labels with the best pipeline found by the search
random_search.best_estimator_.predict(X_test)

array([0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1])

In [34]:
# Keeping a reference to the tuned pipeline (best estimator of the search) and displaying it
tunned_pipeline = random_search.best_estimator_
tunned_pipeline