In [1]:
import warnings
warnings.simplefilter('always', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Pipeline imports
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,FunctionTransformer
from sklearn import tree


# Read data

I chose to read data directly for ease of testing

In [2]:

# Load the raw data set and declare which columns belong to each feature group.
diabetes = pd.read_csv('diabetes.csv')

binary_features = [
    'Obesity', 'TCep', 'Polydipsia', 'Sudden Weight Loss', 'Weakness',
    'Polyphagia', 'Genital Thrush', 'Visual Blurring', 'Itching',
    'Irritability', 'Delayed Healing', 'Partial Paresis', 'Muscle Stiffness',
    'Alopecia',
]
cat_features = ['Race', 'Occupation', 'GP']
num_features = ['Age', 'Height', 'Weight', 'Temperature', 'Urination']

# Recode the textual labels as integers by hand. The replacement is applied to
# every column of the frame. TODO: move this recoding into the pipeline itself.
diabetes = diabetes.replace({
    'yes': 1, 'Yes': 1, 'Positive': 1,
    'no': 0, 'No': 0, 'Negative': 0,
    'Male': 1, 'Female': 0,
})

# Separate the target from the features.
target = 'Diabetes'
X = diabetes.drop(columns=target)
# The frame-wide replacement above already covers these two labels; the target
# is mapped explicitly once more so its encoding is stated next to its definition.
y = diabetes[target].replace({'Positive': 1, 'Negative': 0})



# Some helpers

In [3]:
def fix_height(x, threshold=100):
    """Convert heights recorded in meters to centimeters.

    Values in the first column of ``x`` that are strictly below ``threshold``
    are multiplied by 100. All other values, including missing ones, are left
    unchanged.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose first column holds the heights. It is NOT modified.
    threshold : float, default=100
        Heights below this value are treated as meters.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with the first column corrected.
    """
    # Work on a copy so the caller's frame (e.g. a slice handed over by a
    # ColumnTransformer) is never mutated as a side effect.
    fixed = x.copy()
    col = fixed.columns[0]
    in_meters = fixed[col] < threshold  # NaN compares False, so it stays NaN
    fixed[col] = fixed[col].mask(in_meters, fixed[col] * 100)
    return fixed


# Constructing Pipeline

I compose the pipeline from smaller pipelines, each of which handles a subset of the tasks.

The numeric, binary and categorical columns are all handled differently. 
In addition, I construct a parametric preprocessor where we can impute with domain knowledge. To do so, the functions from the other script must be adapted, and so far I have only adapted a few of the easy ones.

For transformations which rely on other columns, like fixing obesity and polydipsia, we must use a slightly more complicated approach with classes, which I haven't attempted yet.

Row-wise transformations, such as outlier handling, must also be implemented, and I have not looked at that yet either.

In [4]:
# Binary symptom flags: missing values are filled with the constant 0.
binary_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0))
        # Differential privacy here
    ]
)

# Categorical columns: one-hot encode. With min_frequency=0.1, categories seen
# in fewer than 10% of the samples are grouped into one "infrequent" column,
# and unknown categories at transform time are mapped to it when it exists.
# `sparse_output=False` replaces the `sparse=False` keyword, which was
# deprecated in scikit-learn 1.2 and removed in 1.4; dense output is needed
# for the pandas output format.
cat_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=0.1, sparse_output=False)),
        # Unsure how to introduce privacy,
        # ("selector", SelectKBest(k='all')),
    ]
)

# Numeric columns: median imputation followed by standardisation.
num_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
           # Differential privacy here
           # Outliers Here
           ("scaler", StandardScaler())]
)

# Parametric preprocessor: domain-knowledge corrections applied to single columns.
# Untransformed columns are passed through, and the original column names are
# kept (no prefix) so later steps can still select columns by name.
preprocessor_parametric = ColumnTransformer(
    transformers=[('fix height', FunctionTransformer(fix_height), ['Height'])],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
preprocessor_parametric.set_output(transform='pandas')  # keep data frame format


# General preprocessor: encodes and scales every feature group.
# This is the last preprocessing step, so any column that is not listed in one
# of the feature groups is dropped.
preprocessor_general = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
        ('binary', binary_transformer, binary_features),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
)
preprocessor_general.set_output(transform='pandas')  # keep data frame format


# Full preprocessing chain: domain-knowledge fixes first, then the generic
# imputation / encoding / scaling.
preprocessor = Pipeline(
    steps=[
        ("preprocessor parametric", preprocessor_parametric),
        ("preprocessor general", preprocessor_general),
    ]
)


# Complete model: preprocessing followed by a decision tree.
# A fixed random_state makes the fitted tree reproducible between runs
# (the tree otherwise permutes features randomly when searching for splits).
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ("classifier", tree.DecisionTreeClassifier(random_state=0)),
    ]
)


clf # Displays the pipeline




# Running pipeline
## Predicting


In [5]:

# Simple train-test split. The seed is fixed so that the split, and therefore
# the reported scores, are reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf.fit(X_train, y_train)

# Accuracy on the data the model was fitted on versus held-out data.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print(f"Train score: {train_score:.3f}")
print(f"Test score: {test_score:.3f}")

Train score: 1.000
Test score: 0.927


## Transformed data frame

This allows inspection of the final preprocessed data frame that the prediction model is trained on.

In [10]:
from IPython.display import display



In [11]:
# Fit the preprocessing chain on the training data and show the resulting frame.
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
display(X_train_preprocessed)

Unnamed: 0,num__Age,num__Height,num__Weight,num__Temperature,num__Urination,cat__Race_White,cat__Race_infrequent_sklearn,cat__Occupation_Retired,cat__Occupation_infrequent_sklearn,cat__GP_infrequent_sklearn,...,binary__Weakness,binary__Polyphagia,binary__Genital Thrush,binary__Visual Blurring,binary__Itching,binary__Irritability,binary__Delayed Healing,binary__Partial Paresis,binary__Muscle Stiffness,binary__Alopecia
461,-0.420716,-0.020769,-0.726775,-1.705570,0.338476,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
168,-0.848167,1.430922,0.973851,0.196415,-0.528456,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,0.814143,-0.225287,1.498432,0.746990,0.313706,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
290,-0.088254,-0.866554,-0.954662,1.547826,0.549016,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
257,-0.563200,1.518845,1.348823,-1.104943,-0.738996,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,-0.230738,-0.556911,-0.800633,-0.153950,-0.701842,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
361,0.386692,0.803034,0.679051,0.196415,-0.577995,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
297,0.624164,1.579054,2.257844,0.296520,0.610940,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
296,0.006735,-0.598005,0.418339,1.798087,-0.701842,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
