# | default_exp core

In [None]:
# | hide
import dagshub
import mlflow
import nbdev
from nbdev.showdoc import *

# | export
def foo():
    """Placeholder that performs no work and returns ``None``."""
    return None

In [None]:
# | hide
# this function allows us to get the experiment ID from an experiment name
def get_experiment_id(name):
    """Return the MLflow experiment ID for ``name``, creating the experiment if needed.

    Args:
        name: Experiment name passed to ``mlflow.get_experiment_by_name``.

    Returns:
        The ``experiment_id`` of the matching experiment, or the value returned
        by ``mlflow.create_experiment(name)`` when the lookup yields ``None``.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    # Nothing is registered under this name yet: create it and hand back its ID.
    return mlflow.create_experiment(name)

In [None]:
# | hide
# Run nbdev's export step (presumably writes the cells tagged `# | export`
# to the module named by `default_exp` — confirm against the nbdev docs).
nbdev.nbdev_export()

## Stages of Pipeline Deployment

For LLMs, this is a data augmentation pipeline: raw data is augmented to compute one or more new columns. The pipeline needs to go through the familiar stages of Development, Staging, and Production.

### Development

LLMOps goals for Development/Evaluation are

1. track what is being done carefully for later auditing and reproducibility
2. package models or pipelines in a format which will make future deployment easier. 

We will:
* Load data
* Build an LLM pipeline
* Test applying the pipeline to data and log queries and results to MLflow Tracking
* Log the pipeline to the MLflow tracking server as an MLflow model

The EDA/desired transformations are not really done in this step. The example video mentions that the processing is done during the **course** and not in the LLMOps video. The video starts the workflow focusing on tracking.

### Staging

LLMOps goals for staging/testing/QA are
1. track the LLM's progress through testing and towards production
2. work programmatically to demonstrate the APIs needed for future CI/CD automation

We will:
* Register the pipeline to the MLflow Model Registry
* Test the pipeline on sample data
* Promote the registered model (pipeline) to production

### Production

LLMOps goals for production are 
1. write scale-out code that can meet scaling demands in the future
2. simplify deployment by using MLflow to write model-agnostic deployment code

We will:
1. Load the latest production LLM pipeline from the Model Registry
2. Apply the pipeline to an Apache Spark Dataframe
3. Append the results to a Delta Lake Table


## Notes about this workflow

### Notebook vs modular scripts
For a demo, everything in the workflow is divided into notebook sections, but in practice this should be split into separate notebooks or scripts.

### Models vs code
Since the path here is tracked via MLflow Model Registry, this workflow promotes models over code. See "The Big Book of MLOps" for more discussion of the distinction (one difference is Model Registry vs Git).

# | Below this are blocks to use DagsHub with MLflow

In [None]:
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen"  #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu"  #@param {type:"string"}

#@markdown Enter the repo name 
# NOTE: left empty here; it has to be filled in before the cells below run.
DAGSHUB_REPO_NAME = ""  #@param {type:"string"}

#@markdown Enter the name of the branch you are working on 
# NOTE(review): BRANCH is not referenced in the visible part of this notebook.
BRANCH = ""  #@param {type:"string"}

# Initialise DagsHub for the owner/repository configured above.
dagshub.init(repo_owner=DAGSHUB_USER_NAME, repo_name=DAGSHUB_REPO_NAME)


In [None]:
# Point MLflow at the `.mlflow` endpoint of the configured DagsHub repository.
tracking_uri = f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow'
mlflow.set_tracking_uri(tracking_uri)

# Experiment naming: the git branch is a possible starting point, but more
# specificity is needed, so an explicit test name is combined with the email.
DAGSHUB_TEST_NAME = "stanza_quadgrams_small_set_v1"  #@param {type:"string"}
experiment_name = f"{DAGSHUB_EMAIL}/{DAGSHUB_TEST_NAME}"

# Look up the experiment by name (it is created if it does not exist yet).
mlflow_exp_id = get_experiment_id(experiment_name)

## DEVELOPMENT

In [None]:
# import necessary libraries to handle raw data
import dill as pickle
import dvc.api
import pandas as pd
from sklearn.feature_extraction.text import (
    CountVectorizer
    , TfidfTransformer
    , TfidfVectorizer
    ,
)
from src.custom_stanza_mlflow import StanzaWrapper
import src.dataframe_preprocessor as dfpp
import tqdm

### Prepare data

In [None]:
# Load the raw recipe JSON through the DVC API, then parse it into a DataFrame.
# The keyword arguments must be comma-separated; without the comma after
# `path=...` this cell does not parse.
data = dvc.api.read(
    path='../data/raw/recipes-en-201706/epicurious-recipes_m2.json',
    mode='r',
)
raw_df = pd.read_json(data)

In [None]:
# Development subset: keep only the first 50 rows of the raw data.
dev_df = raw_df[:50]

# pre_proc_df is the cleaned version of the development subset.
pre_proc_df = dfpp.preprocess_dataframe(dev_df)

### Convert data to Delta format?

In [None]:
import pyspark.pandas as ps

# Save the preprocessed dataframe as a Delta table, then log it to MLflow.
prod_data_path = "../../data/processed/prod_data"
test_spark_dataset = ps.from_pandas(pre_proc_df)
# NOTE(review): `to_delta` documents `index_col`, not `index`, as the keyword
# for persisting the index; `index='id'` is presumably forwarded as a plain
# writer option — confirm the intent before changing it.
test_spark_dataset.to_delta(
    path=prod_data_path,
    mode='overwrite',
    index='id',
)

# Log inside an explicit run: calling `mlflow.log_artifacts` with no active
# run starts an implicit one that stays active, which makes the later
# `mlflow.start_run(...)` fail. The context manager closes the run on exit.
with mlflow.start_run(experiment_id=mlflow_exp_id):
    # Reuse the path variable so the written and logged locations cannot drift.
    mlflow.log_artifacts(prod_data_path)

### Develop the pipeline

In [None]:
# Build the pipeline configuration for the library in use.
# The MLflow example uses HuggingFace; below is the MeaLeon variant built on
# Stanza and an sklearn NLP pipeline.

# NOTE(review): `nlp` is not defined in the visible part of this notebook —
# presumably a Stanza pipeline created elsewhere; confirm before running.
stanza_analyzer = StanzaWrapper().stanza_analyzer(
    stanza_pipeline=nlp,
    minNgramLength=1,
    maxNgramLength=4,
)

# cv_params: keyword arguments for the sklearn CountVectorizer or TfidfVectorizer
cv_params = dict(
    strip_accents="unicode",
    lowercase=True,
    analyzer=stanza_analyzer,
    min_df=10,
)

# pipeline_params: what gets logged to MLflow Tracking. It is a superset of the
# library parameters: pipeline-level settings first, then the vectorizer
# settings unpacked after them.
pipeline_params = {
    'stanza_model': 'en',
    'language': 'english',
    'sklearn-transformer': 'TfidfVectorizer',
    **cv_params,
}

# Fit the TF-IDF vectorizer on the development data inside one MLflow run,
# logging the parameters and the pickled vectorizer / word matrix as artifacts.
with mlflow.start_run(experiment_id=mlflow_exp_id):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts

    # LOG MODEL
    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # Instantiate sklearn TFIDFVectorizer
    tfidf_vectorizer_model = TfidfVectorizer(**cv_params)

    # Do fit transform on data.
    # `tqdm` is imported as a module (`import tqdm`), so the progress-bar
    # wrapper is `tqdm.tqdm`; calling the module itself raises TypeError.
    test_tfidf_transform = tfidf_vectorizer_model.fit_transform(
        tqdm.tqdm(pre_proc_df["ingredients"])
    )

    # NOTE(review): `ps` is pyspark.pandas, so this is a pandas-on-Spark frame;
    # pickling it below may not work — confirm whether `pd.DataFrame` was meant.
    word_matrix = ps.DataFrame(
        test_tfidf_transform.toarray(),
        columns=tfidf_vectorizer_model.get_feature_names_out(),
        index=pre_proc_df.index,
    )

    # `pickle` here is dill (imported as `pickle`).
    # Each file is logged only after its `with` block has closed it, so the
    # uploaded artifact is the complete, flushed pickle.
    vectorizer_pkl_path = "../joblib/tfidf_transformer_small_test.pkl"
    with open(vectorizer_pkl_path, "wb") as fo:
        pickle.dump(tfidf_vectorizer_model, fo)
    mlflow.log_artifact(vectorizer_pkl_path, artifact_path="sklearn_dill_pkls")

    word_matrix_pkl_path = "../joblib/database_word_matrix_small_test.pkl"
    with open(word_matrix_pkl_path, "wb") as fo:
        pickle.dump(word_matrix, fo)
    mlflow.log_artifact(word_matrix_pkl_path, artifact_path="sklearn_dill_pkls")
