# Pipeline Example 
This notebook is an example of how to use the sklearn-based preprocessing pipeline.

In [None]:
# imports, libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
from sklearn.preprocessing import StandardScaler, Binarizer, LabelEncoder, Normalizer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

#import preprocessing.pipe as pipe


### Step 1: Call preprocessing pipeline
If `import pipe` fails, the working directory needs to be changed so that the module can be found.
The `preprocessing_pipe` function can be customized by changing its default parameters to suit the planned ML algorithm.

In [None]:
# import of pipe module
# The working directory is switched to ../preprocessing/ (relative to the current
# working directory) so that the `import pipe` below can find the module there.
# NOTE: os.chdir affects the whole kernel session, so relative paths used in later
# cells resolve against ../preprocessing/ as well.
os.chdir('../preprocessing/')
import pipe

# calling preprocessing function
# Called with its default parameters. The result is used as a pandas DataFrame by
# the following cells (presumably with a categorical 'loc' column, since the `.cat`
# accessor is used on it later — confirm against pipe.preprocessing_pipe).
df = pipe.preprocessing_pipe()

### Step 2: Test for missing values in target feature
The target feature must not contain missing values; otherwise sklearn will raise an exception. The target feature here is 'loc' (location).

In [None]:
# number of rows whose target 'loc' is missing
df['loc'].isnull().sum()

In [None]:
# keep only the rows that have a value in the target feature 'loc'
df = df.dropna(subset=['loc'])

### Step 3: Train test split
Before imputing missing values or scaling the features, a train-test split should be made (using sklearn.model_selection.train_test_split). Keep the split representative by using the "stratify" parameter.

In [None]:
from sklearn.model_selection import train_test_split

# target feature: integer codes of the categorical 'loc' column
y = df['loc'].cat.codes
# features: every column except the target
X = df.drop(columns=['loc'])

# category labels of 'loc'; the integer codes in y index into this
d = df['loc'].cat.categories

# 75/25 train-test split, seeded and stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

### Step 4: Impute missing values and encode categorical features
With the help of the `pipe.impute_and_encode` function, it is now time to impute missing values based on a predefined strategy and to one-hot encode the categorical features. The function uses the Pipeline module from sklearn.

In [None]:
# dtypes that count as numeric / categorical when picking feature columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
category = ['category']

# column names of X, grouped by dtype
numeric_columns = X.select_dtypes(include=numerics).columns
categorical_columns = X.select_dtypes(include=category).columns
numeric_features = numeric_columns.tolist()
categorical_features = categorical_columns.tolist()

# imputation strategy per feature group (see the sklearn SimpleImputer and
# StandardScaler documentation for the available options)
# NOTE(review): SimpleImputer only consults fill_value when strategy='constant',
# so with the strategies chosen here the fill_value entries are presumably
# ignored — confirm against pipe.impute_and_encode.
imputer = {
    'categorical': {'strategy': 'most_frequent', 'fill_value': 'missing'},
    'numerical': {'strategy': 'median', 'fill_value': 'mean'},
}
imputer_encoder = pipe.impute_and_encode(categorical_features, numeric_features, imputer)

### Step 5: Train a model
For this example we train a simple decision tree without any hyperparameter optimization (i.e. it is definitely overfitting). Note that imputation and encoding would not actually be necessary for a decision tree.

In [None]:
# sklearn imports
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

# Seed the tree so that a fresh "Restart & Run All" reproduces the same model and
# metrics; the same seed (42) is used for the train-test split.
dectree = DecisionTreeClassifier(random_state=42)

# Preprocessing (imputation + encoding) and the classifier are chained so that the
# preprocessing is fitted on the training folds only.
pipeline = Pipeline(steps=[('preprocessor', imputer_encoder),
                           ('classifier', dectree)])

# Specify the hyperparameter space
# Left empty on purpose: no tuning is done in this example, so the grid search
# only cross-validates the default configuration.
parameters = {}

# Instantiate the GridSearchCV object: cv (5-fold cross-validation)
cv = GridSearchCV(pipeline, parameters, cv=5)

# Fit to the training set
t = cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))

# Trailing ';' suppresses the repr of the returned display object
ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot();
print(cv.best_params_)

# Plot the fitted tree taken from the best pipeline found by the search
plt.figure(figsize=(20,20))
plot_tree(cv.best_estimator_['classifier'], fontsize=8)
plt.show()