## 1. Example of an Inference Pipeline


### 1.1 Example Implementation
* See folder `Exercise 10`, which is an implementation of an inference pipeline with one component
* The code implements a pipeline where the data is transformed in multiple ways

In [None]:
# here is an example of a basic pipeline, with the data only processed one way

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline

# Verbose form: Pipeline takes (name, estimator) pairs, so every step is named by hand
pipe = Pipeline([
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression()),
])

# OR -- shorthand form: only the estimators are passed, no step names.
# NOTE: this second assignment replaces the `pipe` built just above.
pipe = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression(),
)

In [None]:
# but we can actually define separate steps in the pipeline to treat different columns in different ways
# here we define the transformers and pass them to a column transformer object
# we then make a pipeline consisting of the transformer object and the model

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


# Toy dataframe (borrowed from the sklearn docs): "city" is handled as a category,
# "title" as free text, the two ratings as numbers, and "click" is the target
df = pd.DataFrame({
    "city": ["London", "London", "Paris", "Sallisaw"],
    "title": [
        "His Last Bow",
        "How Watson Learned the Trick",
        "A Moveable Feast",
        "The Grapes of Wrath",
    ],
    "expert_rating": [5, 3, 4, 5],
    "user_rating": [4, 5, 4, 3],
    "click": ["yes", "no", "no", "yes"],
})
# pop() takes the target column out of df, so what is left in df is the feature table
y = df.pop("click")
X = df

# One preprocessor per kind of column
categorical_preproc = OneHotEncoder()
text_preproc = TfidfVectorizer()
numerical_preprocessing = make_pipeline(SimpleImputer(), StandardScaler())

# Route each group of columns to its preprocessor.
# Note the text column is given as a bare string ("title"), the others as lists.
preproc = ColumnTransformer(
    [
        ("cat_transform", categorical_preproc, ["city"]),
        ("text_transform", text_preproc, "title"),
        ("num_transform", numerical_preprocessing, ["expert_rating", "user_rating"]),
    ],
    remainder="drop",
)

# Preprocessing and model chained into a single estimator, then fitted in one call
pipe = make_pipeline(preproc, LogisticRegression())
pipe.fit(X, y)

#### Column transformer and pipeline in example

In [None]:
## in the exercise 10 example, we do something similar:
## note that this is in the run.py script in the random_forest component

# Each (name, transformer, columns) triple routes one group of features.
# The transformers and feature lists are not defined in this notebook; they
# belong to the run.py script this excerpt was taken from.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("nlp1", nlp_transformer, nlp_features),
    ],
    remainder="drop",  # columns that we do not transform are dropped
)

# Preprocessing followed by the random forest, whose hyperparameters come
# from the model_config mapping
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(**model_config)),
    ]
)

#### Model config
* One thing that confused me is how the model config (for the random forest) is passed in.
* The answer: from the hydra config.yml file
* As the config needs to go to the random_forest component, it is a little confusing!

In [None]:
## in the run.py script in the random_forest component, we have:

# Get the configuration for the model.
# NOTE: excerpt from run.py, dedented to column 0 so that this cell parses on
# its own (the indented original raised "unexpected indent"). `args`, `json`
# and `wandb` are not defined in this notebook; they come from run.py.
with open(args.model_config) as fp:
    # args.model_config is a path to a JSON file holding the hyperparameters
    model_config = json.load(fp)
# Add it to the W&B configuration so the values for the hyperparams
# are tracked
wandb.config.update(model_config)

In [None]:
## and in the MLProject file for this component, we have the args.model_config ref:

# MLproject file of the component: declares the parameters of the entry point
# and the command line they are substituted into.
# NOTE(review): the name below (and the model_config description) say
# "decision tree", while these notes call the component random_forest --
# confirm which one is intended.
name: decision_tree
conda_env: conda.yml

entry_points:
  main:
    parameters:
      train_data:
        description: Fully-qualified name for the training data artifact
        type: str
      # Reaches run.py as --model_config, i.e. the args.model_config that
      # run.py opens as a JSON file
      model_config:
        description: JSON blurb containing the configuration for the decision tree
        type: str
    # {train_data} and {model_config} stand for the two parameters declared above
    command: >-
      python run.py --train_data {train_data} \
                    --model_config {model_config}


In [None]:
## zooming out to the main.py in the main component:

## note that here we are creating a file and then writing config["random_forest"] to it
## we then use this file to pass in the model parameters for the mlflow.run which calls the component.
## interesting that we have to do this, and can't just pass the config entries straight to the model

## NOTE: excerpt from main.py, dedented to column 0 so that this cell parses on its own
## (the indented original raised "unexpected indent"); `os`, `json`, `mlflow`, `config`
## and `root_path` are not defined in this notebook, they come from main.py

# Serialize random forest configuration: the component receives the *path* of
# this JSON file, not the hyperparameters themselves
model_config = os.path.abspath("random_forest_config.json")

with open(model_config, "w+") as fp:
    # dict(...) turns the config section into a plain dict before dumping it
    json.dump(dict(config["random_forest"]), fp)

# Run the "main" entry point of the random_forest component
_ = mlflow.run(
    os.path.join(root_path, "random_forest"),
    "main",
    parameters={
        "train_data": config["data"]["train_data"],
        "model_config": model_config
    },
)

In [None]:
## for reference, here is the config.yml file:

# Configuration read by main.py; the sections used in the excerpts of these
# notes are config["data"]["train_data"] and config["random_forest"].
main:
  project_name: exercise_10
  experiment_name: dev
data:
  # Handed to the random_forest component as its train_data parameter
  train_data: "exercise_6/data_train.csv:latest"
# Everything under random_forest is dumped to a JSON file by main.py and ends
# up as RandomForestClassifier(**model_config) in run.py, so every key here
# has to be an argument that the classifier accepts.
# NOTE(review): min_impurity_split and max_features: 'auto' are, as far as I
# know, no longer accepted by recent scikit-learn releases -- confirm against
# the version pinned in the component's conda.yml.
random_forest:
  n_estimators: 100
  criterion: 'gini'
  max_depth: null
  min_samples_split: 2
  min_samples_leaf: 1
  min_weight_fraction_leaf: 0.0
  max_features: 'auto'
  max_leaf_nodes: null
  min_impurity_decrease: 0.0
  min_impurity_split: null
  bootstrap: true
  oob_score: false
  n_jobs: null
  random_state: null
  verbose: 0
  warm_start: false
  class_weight: null
  ccp_alpha: 0.0
  max_samples: null

### Pipeline implementation in PyTorch

In [None]:
## example

import torch
from torchvision import transforms
from torch.nn import Sequential, Softmax
from PIL import Image
import numpy as np

# Pre-trained ResNet-18 from torch hub, switched to eval mode straight away
# (eval() hands back the module it was called on)
model = torch.hub.load("pytorch/vision:v0.9.0", "resnet18", pretrained=True).eval()

# The inference pipeline: preprocessing, the network and a softmax over
# dimension 1, all wrapped in a single module
# NOTE: for the pipeline to be scriptable with torch.jit.script, sizes must be
# given as lists ([256, 256]) instead of just one number (256)
pipe = Sequential(
    transforms.Resize([256, 256]),
    transforms.CenterCrop([224, 224]),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    model,
    Softmax(dim=1),
)

# Script the whole pipeline and write it to disk as the inference artifact
scripted = torch.jit.script(pipe)
scripted.save("inference_artifact.pt")

# NOTE: normally we would upload it to the artifact store

# Read the inference artifact back from disk
pipe_reload = torch.jit.load("inference_artifact.pt")

# Load one example
# NOTE: these operations are usually taken care by the inference
# engine
# convert("RGB") guarantees three channels, which the 3-element mean/std of the
# Normalize step in the pipeline requires (a grayscale or CMYK file would
# otherwise fail there); the context manager closes the image file.
with Image.open("dog.jpg") as img_file:
    img = img_file.convert("RGB")
# Make into a batch of 1 element
data = transforms.ToTensor()(np.asarray(img, dtype="uint8").copy()).unsqueeze(0)

# Perform inference
# NOTE: the pipeline ends with a Softmax, so despite its name `logits` holds
# class probabilities. No detach() is needed: nothing is tracked under no_grad().
with torch.no_grad():
    logits = pipe_reload(data)

# Scores of the single image in the batch
proba = logits[0]

# Transform to class and print answer
with open("imagenet_classes.txt", "r") as f:
    classes = [line.strip() for line in f]
# int(...) turns the 0-d argmax tensor into a plain Python list index
print(f"Classification: {classes[int(proba.argmax())]}")