In [1]:
import warnings
warnings.simplefilter('always', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display

# Pipeline imports
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,FunctionTransformer
from sklearn import tree
from sklearn.base import BaseEstimator, TransformerMixin


In [2]:

from typing import Literal


from sklearn.base import BaseEstimator


def BMI(weight, height):
    return weight/(height**2/(100*100))

class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None, threshold_BMI=30, threshold_Polydipsia= 2.5):
        X = self._BMI(X, threshold=threshold_BMI)
        X = self._fix_polydipsia(X, threshold=threshold_Polydipsia)
        return X
    
    def _BMI(self, X, y=None, threshold=30):
        # Perform arbitary transformation


        idx = X[X['Obesity'].isna()].index
        
        # indexes to identify BMI above or below threshold
        idx2 = X.loc[idx,].loc[BMI(X.loc[idx,]["Weight"], X.loc[idx,]["Height"]) <= threshold].index
        idx3 = X.loc[idx,].loc[BMI(X.loc[idx,]["Weight"], X.loc[idx,]["Height"]) > threshold].index
        
        # set obesity from indexes above
        X.loc[idx2,'Obesity'] = 0
        X.loc[idx3,'Obesity'] = 1
        return X
    
    def _fix_polydipsia(self, df, threshold=2.5):
        idx = df[df['Polydipsia'].isna()].index
        
        # indexes to identify Urination above or below threshold
        idx2 = df.loc[idx,].loc[df['Urination'] <= threshold].index
        idx3 = df.loc[idx,].loc[df['Urination'] > threshold].index

        # set Polydipsia from indexes above
        df.loc[idx2,'Polydipsia'] = 0
        df.loc[idx3,'Polydipsia'] = 1
        #df.loc[idx,]
        return df
    
    def set_output(self, *, transform: Literal['default', 'pandas'] | None = None) -> BaseEstimator:
        return super().set_output(transform=transform)


class AddBMI(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X['BMI'] = BMI(X['Weight'], X['Height'])
        return X
    def set_output(self, *, transform: Literal['default', 'pandas'] | None = None) -> BaseEstimator:
        return super().set_output(transform=transform)

class Outliers(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        bounds = self._outliers_z_score(X)

        for f in bounds:
            outliers = self._outliers_min_max(X, f,
                                        min=bounds[0],
                                        max=bounds[1]
                                        )
            X.loc[outliers.index, f] = np.NaN

        assert type(X) == 'DataFrame'

        return X
    

    def _outliers_z_score(df, feature, no_z=3):
        lower = df[feature].mean()-no_z*df[feature].std()
        upper = df[feature].mean()+no_z*df[feature].std()
        return lower, upper
    
    def _outliers_min_max(df, feature, min=None, max=None):
        try:
            cond_min = df[feature] < min if min != None else False
            cond_max = df[feature] > max if max != None else False
            return df[cond_min | cond_max ]
        except Exception as e:
            print("invalid feature")

    def get_feature_names_out(self):
        pass

    def set_output(self, *, transform: Literal['default', 'pandas'] | None = None) -> BaseEstimator:
        return super().set_output(transform=transform)


# Read data

I chose to read data directly for ease of testing

In [3]:

diabetes = pd.read_csv('diabetes.csv')
binary_features = ['Obesity', 'TCep', 'Polydipsia', 'Sudden Weight Loss', 'Weakness',
                'Polyphagia', 'Genital Thrush', 'Visual Blurring', 'Itching',
                'Irritability', 'Delayed Healing', 'Partial Paresis', 'Muscle Stiffness', 'Alopecia', 'Gender']
cat_features = ['Race',	'Occupation',	'GP']
num_features = ['Age',	'Height',	'Weight',	'Temperature',	'Urination']


target = 'Diabetes'
y = diabetes[target]
X = diabetes.drop(columns=target)
# y = y.replace({'Positive':1, 'Negative':0}) # again, format on original data set



# Some helpers

In [4]:
def fix_height(x, threshold=100):
    """ Converts height in meters to centimeters, if height is less than threshold (default = 100)"""
    col = x.columns[0]
    mask = x[col] < threshold
    x.loc[mask, [col]] = x.loc[mask, [col]].mul(100)
    return x

def fix_formating(x):
    return x.replace({'yes':1, 'Yes': 1, 'Positive':1, 'no':0, 'No':0, 'Negative':0, 'Male':1,'Female':0})


# Constructing Pipeline

I compose the pipeline from smaller pipelines, which all handles a subset of the tasks.

The numeric, binary and categorical columns are all handled differently. 
In addition, construct a parametric preprocessor where we can impute with domain knowledge. We must adapt the functions from the other script to do so, and I have only done that to a few easy ones.

For transformations which rely on other columns, like fixing obesity and polydipsia, we must use a slightly more complicated approach with classes, which I haven't attempted yet.

Row wise transformations, like outliers, must also be implemented, and I have not looked at that either.

In [5]:
# Parametric preprocessor where we impute with domain knowledge
preprocessor_parametric = ColumnTransformer(
    transformers=[
        ('fix height', FunctionTransformer(fix_height), ['Height']),
    ],
    verbose_feature_names_out= False, # Keeps the same column name for future processing
    remainder='passthrough'         # Doesent drop untransformed columns
).set_output(transform='pandas')    # Keep data frame format

binary_transformer = Pipeline(
    steps=[
        ('Fix formating', FunctionTransformer(fix_formating)),
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        # Differential privacy here
        # ("selector", SelectKBest(k=3)),
    ]
)

cat_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="infrequent_if_exist", min_frequency=0.1, sparse=False)),
        # Unsure how to introduce privacy,
        # ("selector", SelectKBest(k=3)),
    ]
)

num_transformer = Pipeline(
    steps=[
           # Differential privacy here
           # Outliers Here
            # ('Outliers', Outliers()), # BROKEN
            ("imputer", SimpleImputer(strategy="mean")), 
            ("scaler", StandardScaler())]
)


# General preprocesser which encodes and scales all features
preprocessor_general = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
        ('binary', binary_transformer, binary_features)
    ],
    verbose_feature_names_out= True,
    remainder='drop'                # drop untouched features since after this step, as it is the last preprocessing one
).set_output(transform="pandas")    # Keep data frame format


preprocessor = Pipeline(
    steps=[
        ('Custom impute', CustomTransformer()),
        # ('Add columns', AddBMI()),
        ("preprocessor parametric", preprocessor_parametric), 
        ("preprocessor general", preprocessor_general), 
        ]
)


clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ("classifier", tree.DecisionTreeClassifier())
        ]
)


def make_clf(clf):
    return Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ("classifier", clf)
        ]
    )

clf # Displays the pipeline




AttributeError: This 'CustomTransformer' has no attribute 'set_output'

# Running pipeline
## Predicting


In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y) # Simple train-test spliot

clf.fit(X_train, y_train)
print("Train score: %.3f" % clf.score(X_train, y_train))
print("Test score: %.3f" % clf.score(X_test, y_test))

TypeError: 'Outliers' object is not subscriptable

## Transformed data frame

Allows inspection into the final preprocessed data frame which the prediction model trains on

In [None]:
display(preprocessor.fit_transform(X_train, y_train))


Unnamed: 0,num__Age,num__Height,num__Weight,num__Temperature,num__Urination,cat__Race_White,cat__Race_infrequent_sklearn,cat__Occupation_Manager,cat__Occupation_Retired,cat__Occupation_infrequent_sklearn,...,binary__Polyphagia,binary__Genital Thrush,binary__Visual Blurring,binary__Itching,binary__Irritability,binary__Delayed Healing,binary__Partial Paresis,binary__Muscle Stiffness,binary__Alopecia,binary__Gender
452,-0.417575,-0.797727,-1.477687,0.666871,0.297890,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
503,-0.171111,0.965308,-0.413059,1.455879,-0.808163,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
110,-0.565454,-0.576390,-0.868226,-0.122137,-0.836524,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
37,0.814745,-0.043646,-0.581496,1.160001,0.562587,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
402,0.075353,-0.670290,-0.312125,-0.171450,-0.883791,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,1.159795,1.125323,0.005464,-0.171450,0.401878,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
254,0.000000,0.294588,-0.042110,0.370993,-0.571827,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
373,0.075353,-2.549264,-0.746720,0.173741,0.751656,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
64,-0.516161,-1.148418,-0.857940,-0.270076,0.174995,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


In [None]:

for depth in range(1,20):
    print('\nmax depth:', depth)
    clf = make_clf(tree.DecisionTreeClassifier(max_depth=None)).fit(X_train, y_train)
    clf.fit(X_train, y_train)
    print("\tTrain score: %.3f" % clf.score(X_train, y_train))
    print("\tTest score: %.3f" % clf.score(X_test, y_test))


max depth: 1
	Train score: 1.000
	Test score: 0.832

max depth: 2
	Train score: 1.000
	Test score: 0.832

max depth: 3
	Train score: 1.000
	Test score: 0.839

max depth: 4
	Train score: 1.000
	Test score: 0.839

max depth: 5
	Train score: 1.000
	Test score: 0.832

max depth: 6
	Train score: 1.000
	Test score: 0.825

max depth: 7
	Train score: 1.000
	Test score: 0.839

max depth: 8
	Train score: 1.000
	Test score: 0.847

max depth: 9
	Train score: 1.000
	Test score: 0.832

max depth: 10
	Train score: 1.000
	Test score: 0.832

max depth: 11
	Train score: 1.000
	Test score: 0.854

max depth: 12
	Train score: 1.000
	Test score: 0.839

max depth: 13
	Train score: 1.000
	Test score: 0.832

max depth: 14
	Train score: 1.000
	Test score: 0.832

max depth: 15
	Train score: 1.000
	Test score: 0.818

max depth: 16
	Train score: 1.000
	Test score: 0.832

max depth: 17
	Train score: 1.000
	Test score: 0.825

max depth: 18
	Train score: 1.000
	Test score: 0.847

max depth: 19
	Train score: 1.000
	T