[Reference](https://betterprogramming.pub/making-a-ml-models-production-ready-with-modelkit-our-mlops-python-library-af877cda0ca1)

# Initialisation


In [1]:
# python libraries
!pip install modelkit spacy scikit-learn tensorflow numpy
# spacy model
!python -m spacy download en_core_web_sm
# imdb sentiment archive
!curl https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz --output imdb.tar.gz
!tar -xvf imdb.tar.gz 
!rm -rf aclImdb/train/unsup

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
aclImdb/train/unsup/44983_0.txt
aclImdb/train/unsup/44982_0.txt
aclImdb/train/unsup/44981_0.txt
aclImdb/train/unsup/44980_0.txt
aclImdb/train/unsup/44979_0.txt
aclImdb/train/unsup/44978_0.txt
aclImdb/train/unsup/44977_0.txt
aclImdb/train/unsup/44976_0.txt
aclImdb/train/unsup/44975_0.txt
aclImdb/train/unsup/44974_0.txt
aclImdb/train/unsup/44973_0.txt
aclImdb/train/unsup/44972_0.txt
aclImdb/train/unsup/44971_0.txt
aclImdb/train/unsup/44970_0.txt
aclImdb/train/unsup/44969_0.txt
aclImdb/train/unsup/44968_0.txt
aclImdb/train/unsup/44967_0.txt
aclImdb/train/unsup/44966_0.txt
aclImdb/train/unsup/44965_0.txt
aclImdb/train/unsup/44964_0.txt
aclImdb/train/unsup/44963_0.txt
aclImdb/train/unsup/44962_0.txt
aclImdb/train/unsup/44961_0.txt
aclImdb/train/unsup/44960_0.txt
aclImdb/train/unsup/44959_0.txt
aclImdb/train/unsup/44958_0.txt
aclImdb/train/unsup/44957_0.txt
aclImdb/train/unsup/44956_0.txt
aclImdb/train/unsup/44955_0.txt
aclImdb

# A first model


In [6]:
import modelkit
import spacy


class Tokenizer(modelkit.Model):
    """Minimal modelkit model turning one raw review string into filtered, lowercased tokens."""

    def _load(self):
        """Load the small English spaCy pipeline with the listed components disabled."""
        disabled_components = [
            "parser",
            "ner",
            "tagger",
            "lemmatizer",
            "tok2vec",
            "attribute_ruler",
        ]
        self.nlp = spacy.load("en_core_web_sm", disable=disabled_components)

    def _predict(self, text):
        """Return the lowercased tokens of ``text`` that survive the filters below."""
        # Remove the "<br" and "/>" fragments, then collapse every whitespace run
        # into a single space.
        without_breaks = text.replace("<br", "").replace("/>", "")
        cleaned = " ".join(without_breaks.split())

        kept = []
        for token in self.nlp(cleaned):
            # Keep only ASCII tokens longer than one character...
            if not token.is_ascii or len(token) <= 1:
                continue
            # ...that are neither punctuation, stop words nor digits.
            if token.is_punct or token.is_stop or token.is_digit:
                continue
            kept.append(token.lower_)
        return kept

# Instantiate the model directly and tokenize one sample sentence; as the last
# expression of the cell, the result is what the notebook displays.
tokenizer = Tokenizer()
tokenizer.predict("spaCy is a great lib for NLP 😀") # expected (per the original note): ['spacy', 'great', 'lib', 'nlp']

# A complete model


In [5]:
from typing import List
import modelkit
import spacy

class Tokenizer(modelkit.Model[str, List[str]]):
    """Batch tokenizer: maps each input string to a list of filtered, lowercased tokens.

    ``CONFIGURATIONS`` declares the model under the name ``imdb_tokenizer`` and
    ``TEST_CASES`` lists item/result pairs describing the expected behaviour.
    """

    CONFIGURATIONS = {"imdb_tokenizer": {}}
    TEST_CASES = [
        {"item": "NLP 101", "result": ["nlp"]},
        {
            "item": "I'm loving the spaCy 101 course !!!",
            "result": ["loving", "spacy", "course"],
        },
        {
            "item": "<br/>prepare things for IMDB<br/>",
            "result": ["prepare", "things", "imdb"],
        },
    ]

    def _load(self):
        """Load the small English spaCy pipeline with the listed components disabled."""
        disabled_components = [
            "parser",
            "ner",
            "tagger",
            "lemmatizer",
            "tok2vec",
            "attribute_ruler",
        ]
        self.nlp = spacy.load("en_core_web_sm", disable=disabled_components)

    def _predict_batch(self, texts):
        """Tokenize a batch of strings and return one token list per input string."""
        # Remove the "<br" and "/>" fragments and normalise whitespace first.
        cleaned_texts = []
        for raw_text in texts:
            without_breaks = raw_text.replace("<br", "").replace("/>", "")
            cleaned_texts.append(" ".join(without_breaks.split()))

        def keep(token):
            # ASCII, longer than one character, and not punctuation / stop word / digit.
            if not token.is_ascii or len(token) <= 1:
                return False
            return not (token.is_punct or token.is_stop or token.is_digit)

        # The whole batch is handed to spaCy as a single pipe batch.
        docs = self.nlp.pipe(cleaned_texts, batch_size=len(cleaned_texts))
        return [[token.lower_ for token in doc if keep(token)] for doc in docs]
        
tokenizer = Tokenizer()
# A notebook cell only displays its last expression, so each result is printed explicitly.
print(tokenizer.predict("spaCy is a great lib for NLP 😀"))  # ['spacy', 'great', 'lib', 'nlp']
print(tokenizer.predict_batch(["spaCy is a great lib for NLP 😀", ""]))  # [['spacy', 'great', 'lib', 'nlp'], []]
# predict_gen returns a lazy generator: it has to be consumed (here with list()) for
# any prediction to be computed; the items should match the predict_batch output above.
print(list(tokenizer.predict_gen(["spaCy is a great lib for NLP 😀", ""])))

# 2. Implementing a Vectorizer leveraging Scikit-Learn


In [4]:
import glob
import itertools
import os
from typing import Generator
from sklearn.feature_extraction.text import TfidfVectorizer

def read_dataset(path: str) -> Generator[str, None, None]:
    """Lazily yield the content of every ``*.txt`` file directly inside ``path``.

    Files are visited in sorted filename order so that repeated runs see the
    reviews in the same order (``glob.glob`` alone returns them in an arbitrary,
    filesystem-dependent order).

    Args:
        path: directory containing the review files.

    Yields:
        The full text of each file, decoded as UTF-8. Nothing is yielded when
        the directory holds no ``*.txt`` file.
    """
    for review in sorted(glob.glob(os.path.join(path, "*.txt"))):
        with open(review, "r", encoding="utf-8") as f:
            yield f.read()

# Stream the positive then the negative training reviews without loading them all in memory.
training_set = itertools.chain(
    read_dataset(os.path.join("aclImdb", "train", "pos")),
    read_dataset(os.path.join("aclImdb", "train", "neg")),
)

# Lazily tokenize the reviews, 64 at a time.
tokenized_set = Tokenizer().predict_gen(training_set, batch_size=64)

# The documents are already token lists, so the vectorizer's tokenizer is the
# identity and lowercasing is disabled. token_pattern=None avoids scikit-learn's
# warning that the default pattern is unused when a custom tokenizer is given.
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    token_pattern=None,
    lowercase=False,
    max_df=0.95,
    min_df=0.01,
).fit(tokenized_set)

# we only keep strings from the vocabulary
# we will be using our own str -> int mapping
# (tokens are ordered by the index scikit-learn assigned to them)
vocabulary = tuple(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))
with open("vocabulary.txt", "w", encoding="utf-8") as f:
    f.writelines(token + "\n" for token in vocabulary)

In [7]:
import modelkit
import numpy as np
from typing import List


class Vectorizer(modelkit.Model[List[str], List[int]]):
    """Map a list of tokens to a list of integer ids using the saved vocabulary.

    Id convention: 0 is padding, 1 is the out-of-vocabulary id, and the token
    found on line ``i`` of the vocabulary file gets id ``i + 2``.
    """

    CONFIGURATIONS = {"imdb_vectorizer": {"asset": "vocabulary.txt"}}
    # NOTE(review): the expected ids below depend on the content of the generated
    # vocabulary.txt; confirm them against the vocabulary actually produced.
    TEST_CASES = [
        {"item": ["movie"], "result": [888]},
        {"item": ["movie", "unknown_token", "scenes"], "result": [888, 1156]},
        {
            "item": ["movie", "unknown_token", "scenes"],
            "keyword_args": {"drop_oov": False},
            "result": [888, 1, 1156],
        },
        {
            "item": ["movie", "unknown_token", "scenes"],
            "keyword_args": {"length": 10, "drop_oov": False},
            "result": [888, 1, 1156, 0, 0, 0, 0, 0, 0, 0],
        },
    ]

    def _load(self):
        """Read the vocabulary file (one token per line) into a token -> id mapping."""
        self.vocabulary = {}
        with open(self.asset_path, "r", encoding="utf-8") as f:
            for i, k in enumerate(f):
                # ids 0 and 1 are reserved for padding and out-of-vocabulary tokens
                self.vocabulary[k.strip()] = i + 2
        self._vectorizer = np.vectorize(lambda x: self.vocabulary.get(x, 1))

    def _predict(self, tokens, length=None, drop_oov=True):
        """Vectorize ``tokens``.

        Args:
            tokens: list of string tokens.
            length: when truthy, the result is truncated or zero-padded to exactly
                this many ids; otherwise the ids are returned as they are.
            drop_oov: when true, out-of-vocabulary tokens (id 1) are removed
                before any padding or truncation.

        Returns:
            A list of integer ids.
        """
        # np.vectorize cannot infer an output type from an empty input, hence the guard.
        # `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
        vectorized = (
            np.array(self._vectorizer(tokens), dtype=int)
            if tokens
            else np.array([], dtype=int)
        )
        if drop_oov and len(vectorized):
            vectorized = vectorized[vectorized != 1]
        if not length:
            return vectorized.tolist()
        # Integer buffer so the padded output is a list of ints, as declared by the model type.
        result = np.zeros(length, dtype=int)
        vectorized = vectorized[:length]
        result[: len(vectorized)] = vectorized
        return result.tolist()

# 3. Building a simple Classifier leveraging Keras


In [10]:
import tensorflow as tf

# Build a library from the two model classes, then fetch the instances by the
# names used in their CONFIGURATIONS ("imdb_tokenizer" / "imdb_vectorizer").
model_library = modelkit.ModelLibrary(models=[Vectorizer, Tokenizer])
tokenizer = model_library.get("imdb_tokenizer")
vectorizer = model_library.get("imdb_vectorizer")
    
def process(path, tokenizer, vectorizer, length=64, batch_size=64):
    """Yield ``(vectorized_review, label)`` pairs, alternating positive and negative reviews.

    Implementation written along the lines of the tutorial at
    https://cornerstone-ondemand.github.io/modelkit/examples/nlp_sentiment/classifier/

    Args:
        path: dataset directory containing the ``pos`` and ``neg`` sub-directories.
        tokenizer: model whose ``predict_gen`` turns review strings into token lists.
        vectorizer: model whose ``predict_gen`` turns token lists into id lists.
        length: fixed length requested from the vectorizer for every review.
        batch_size: batch size used by both ``predict_gen`` calls.

    Yields:
        ``(ids, 1)`` for a positive review followed by ``(ids, 0)`` for a negative
        one, until either stream is exhausted.
    """

    def vectorized_reviews(sentiment):
        # read -> tokenize -> vectorize, all lazily
        reviews = read_dataset(os.path.join(path, sentiment))
        tokens = tokenizer.predict_gen(reviews, batch_size=batch_size)
        return vectorizer.predict_gen(tokens, length=length, batch_size=batch_size)

    positives = vectorized_reviews("pos")
    negatives = vectorized_reviews("neg")
    # zip stops as soon as the shorter of the two streams runs out
    for positive, negative in zip(positives, negatives):
        yield positive, 1
        yield negative, 0
        
def build_set(data_type):
    """Build an endlessly repeating, batched ``tf.data.Dataset`` for one split.

    Args:
        data_type: sub-directory of ``aclImdb`` to read (``"train"`` or ``"test"``
            in this notebook).

    Returns:
        A dataset of ``(ids, label)`` batches of 64 examples, both cast to int16,
        generated by ``process`` with the module-level tokenizer and vectorizer.
    """
    # The whole chain is wrapped in parentheses so that the .batch()/.repeat()
    # calls can sit on their own lines.
    return (
        tf.data.Dataset.from_generator(
            lambda: process(
                os.path.join("aclImdb", data_type),
                tokenizer,
                vectorizer,
            ),
            output_types=(tf.int16, tf.int16),
        )
        .batch(64)
        .repeat()
    )

# Datasets for the two splits of the archive.
training_set, validation_set = build_set("train"), build_set("test")

# Embedding -> sum over the sequence axis -> single sigmoid unit.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        # + 2 accounts for the two ids the vectorizer reserves (0 and 1)
        input_dim=len(vectorizer.vocabulary) + 2,
        output_dim=64,
        input_length=64,
    )
)
model.add(tf.keras.layers.Lambda(lambda x: tf.reduce_sum(x, axis=1)))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.binary_accuracy],
)
model.build()

# The datasets repeat forever, so the number of steps per epoch is given explicitly.
model.fit(
    training_set,
    validation_data=validation_set,
    epochs=10,
    steps_per_epoch=100,
    validation_steps=10,
)

# Save the trained network to the file the classifier declares as its asset.
model.save(
    "imdb_model.h5",
    include_optimizer=False,
    save_format="h5",
    save_traces=False,
)

In [None]:
from typing import List, Optional

import modelkit
import pydantic
import tensorflow as tf

class MovieReviewItem(pydantic.BaseModel):
    """Input item of the classifier: the review text plus an optional rating."""

    text: str
    rating: Optional[float] = None  # could be useful in the future ? but not mandatory


class MovieSentimentItem(pydantic.BaseModel):
    """Output item of the classifier: a sentiment label and the score it was derived from."""

    label: str
    score: float


class Classifier(modelkit.Model[MovieReviewItem, MovieSentimentItem]):
    """Sentiment classifier chaining the tokenizer, the vectorizer and the saved Keras model."""

    CONFIGURATIONS = {
        "imdb_classifier": {
            "asset": "imdb_model.h5",
            "model_dependencies": {
                "tokenizer": "imdb_tokenizer",
                "vectorizer": "imdb_vectorizer",
            },
        },
    }
    # NOTE(review): the expected scores are tied to one particular trained model;
    # they will presumably differ after retraining — confirm before relying on them.
    TEST_CASES = [
        {
            "item": {"text": "i love this film, it's the best I've ever seen"},
            "result": {"score": 0.8441019058227539, "label": "good"},
        },
        {
            "item": {"text": "this movie sucks, it's the worst I have ever seen"},
            "result": {"score": 0.1625385582447052, "label": "bad"},
        },
    ]

    def _load(self):
        """Load the Keras model from the asset and keep handles on both dependencies."""
        self.model = tf.keras.models.load_model(self.asset_path)
        self.tokenizer = self.model_dependencies["tokenizer"]
        self.vectorizer = self.model_dependencies["vectorizer"]

    def _predict_batch(self, reviews):
        """Score a batch of reviews.

        Args:
            reviews: items exposing a ``text`` attribute.

        Returns:
            One ``{"score": float, "label": "good" | "bad"}`` dict per review; the
            label is "good" when the score is at least 0.5.
        """
        texts = [review.text for review in reviews]
        tokenized_reviews = self.tokenizer.predict_batch(texts)
        # length=64 matches the input_length the network was built with
        vectorized_reviews = self.vectorizer.predict_batch(tokenized_reviews, length=64)
        predictions_scores = self.model.predict(vectorized_reviews)
        # Flatten the model output (presumably shape (n, 1), one sigmoid unit per
        # review) into plain Python floats, so neither the comparison nor the
        # "score" field relies on implicit size-1-array -> scalar conversion.
        return [
            {"score": score, "label": "good" if score >= 0.5 else "bad"}
            for score in predictions_scores.reshape(-1).tolist()
        ]


# Register the three model classes and fetch the classifier by its configuration name.
model_library = modelkit.ModelLibrary(models=[Tokenizer, Vectorizer, Classifier])
classifier = model_library.get("imdb_classifier")
# A plain dict is passed as the item.
# NOTE(review): presumably modelkit converts it to MovieReviewItem and returns a
# MovieSentimentItem, since `.label` is read below — confirm.
prediction = classifier.predict({"text": "I love the main character"})
print(prediction.label)

In [11]:
import modelkit
import tensorflow as tf
from typing import List, Optional

class MovieReviewItem(pydantic.BaseModel):
    """Input item of the classifier: the review text plus an optional rating."""

    text: str
    rating: Optional[float] = None  # could be useful in the future ? but not mandatory


class MovieSentimentItem(pydantic.BaseModel):
    """Output item of the classifier: a sentiment label and the score it was derived from."""

    label: str
    score: float


class Classifier(modelkit.Model[MovieReviewItem, MovieSentimentItem]):
    """Sentiment classifier chaining the tokenizer, the vectorizer and the saved Keras model."""

    CONFIGURATIONS = {
        "imdb_classifier": {
            "asset": "imdb_model.h5",
            "model_dependencies": {
                "tokenizer": "imdb_tokenizer",
                "vectorizer": "imdb_vectorizer",
            },
        },
    }
    # NOTE(review): the expected scores are tied to one particular trained model;
    # they will presumably differ after retraining — confirm before relying on them.
    TEST_CASES = [
        {
            "item": {"text": "i love this film, it's the best I've ever seen"},
            "result": {"score": 0.8441019058227539, "label": "good"},
        },
        {
            "item": {"text": "this movie sucks, it's the worst I have ever seen"},
            "result": {"score": 0.1625385582447052, "label": "bad"},
        },
    ]

    def _load(self):
        """Load the Keras model from the asset and keep handles on both dependencies."""
        self.model = tf.keras.models.load_model(self.asset_path)
        self.tokenizer = self.model_dependencies["tokenizer"]
        self.vectorizer = self.model_dependencies["vectorizer"]

    def _predict_batch(self, reviews):
        """Score a batch of reviews.

        Args:
            reviews: items exposing a ``text`` attribute.

        Returns:
            One ``{"score": float, "label": "good" | "bad"}`` dict per review; the
            label is "good" when the score is at least 0.5.
        """
        texts = [review.text for review in reviews]
        tokenized_reviews = self.tokenizer.predict_batch(texts)
        # length=64 matches the input_length the network was built with
        vectorized_reviews = self.vectorizer.predict_batch(tokenized_reviews, length=64)
        predictions_scores = self.model.predict(vectorized_reviews)
        # Flatten the model output (presumably shape (n, 1), one sigmoid unit per
        # review) into plain Python floats, so neither the comparison nor the
        # "score" field relies on implicit size-1-array -> scalar conversion.
        return [
            {"score": score, "label": "good" if score >= 0.5 else "bad"}
            for score in predictions_scores.reshape(-1).tolist()
        ]


# Register the three model classes and fetch the classifier by its configuration name.
model_library = modelkit.ModelLibrary(models=[Tokenizer, Vectorizer, Classifier])
classifier = model_library.get("imdb_classifier")
# A plain dict is passed as the item.
# NOTE(review): presumably modelkit converts it to MovieReviewItem and returns a
# MovieSentimentItem, since `.label` is read below — confirm.
prediction = classifier.predict({"text": "I love the main character"})
print(prediction.label)

# 4. Pushing our model's scalability and robustness


In [12]:
from modelkit.core.profilers.simple import SimpleProfiler
# Attach a profiler to the classifier before running a prediction.
profiler = SimpleProfiler(classifier)
print(classifier.predict({"text": "I love the main character"}))
# NOTE(review): with print_table=True, summary() presumably prints the table itself,
# and the outer print() then shows its return value — confirm.
print(profiler.summary(print_table=True))

In [13]:
import fastapi

# Create the web application and store one ModelLibrary on the application
# state so request handlers can reach it through `request.app.state.lib`.
app = fastapi.FastAPI()
model_library = modelkit.ModelLibrary(models=[Classifier, Vectorizer, Tokenizer])
app.state.lib = model_library

@app.post("/classifier/", response_model=MovieSentimentItem)
def classifier_endpoint(request: fastapi.Request, item: MovieReviewItem):
    """Classify the posted review with the ``imdb_classifier`` model of the shared library."""
    library = request.app.state.lib
    return library.get("imdb_classifier").predict(item)

@app.on_event("shutdown")
async def shutdown_event():
    """Close the model library stored on the application state when the app shuts down."""
    await app.state.lib.aclose()