# | default_exp core

In [None]:
# | hide
import dagshub
import mlflow
import nbdev
from nbdev.showdoc import *

# | export
def foo():
    """Placeholder function: takes no arguments, does nothing, returns None."""
    return None

In [None]:
# | hide
# this function allows us to get the experiment ID from an experiment name
def get_experiment_id(name):
    """Return the MLflow experiment ID for the experiment called `name`.

    Looks the experiment up by name first. When MLflow reports no such
    experiment, a new one is created under that name and the value returned
    by `mlflow.create_experiment` is handed back instead.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    # Nothing registered under this name yet: create it and return its ID.
    return mlflow.create_experiment(name)

In [None]:
# | hide
# Run nbdev's export step from inside the notebook.
# NOTE(review): presumably this writes the exported cells out to the library
# module named by the `default_exp` directive; confirm against the nbdev docs.
nbdev.nbdev_export()

# | Below this are blocks to use DagsHub with MLflow

In [None]:
# DagsHub connection settings. The `#@markdown` / `#@param` annotations are
# notebook form hints and are kept as written.

#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

#@markdown Enter the repo name
DAGSHUB_REPO_NAME = "MeaLeon"

#@markdown Enter the name of the branch you are working on
BRANCH = "STANZA-1/refactor-nltk-stanza"

# Initialise the dagshub client for the owner/repository configured above.
dagshub.init(
    repo_owner=DAGSHUB_USER_NAME,
    repo_name=DAGSHUB_REPO_NAME,
)


In [None]:
# Point MLflow at the tracking endpoint of the DagsHub repository.
# The URI is built from DAGSHUB_REPO_NAME instead of repeating the repository
# name as a literal, so it always matches the repository given to dagshub.init().
mlflow.set_tracking_uri(
    f"https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow"
)

# starter idea for making an experiment name can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/stanza_quadgrams_small_set_v1"

# Look up the experiment by name, creating it on first use.
mlflow_exp_id = get_experiment_id(experiment_name)

In [None]:
# create pipelines relevant to library used
# MLflow example uses HuggingFace
# below is example for MeaLeon with Stanza and sklearn NLP pipeline

# StanzaWrapper is needed to build cv_params, so it is imported here, ahead of
# its first use; otherwise this cell raises NameError on a fresh kernel.
from src.custom_stanza_mlflow import StanzaWrapper

# NOTE(review): `nlp` (the Stanza pipeline handed to the analyzer) is not
# defined anywhere in the visible notebook; it must exist before this cell runs.

# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
cv_params = {
    'strip_accents': "unicode",
    'lowercase': True,
    # custom analyzer built from the Stanza pipeline, producing 1- to 4-grams
    'analyzer': StanzaWrapper().stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4,
    ),
    'min_df': 10,
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'language': 'english',
    'sklearn-transformer': 'TfidfVectorizer',
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(cv_params)

# import necessary libraries to handle raw data
# (done before the run is opened, so a missing package fails before any
# MLflow run has been started)
import io

import dill as pickle
import dvc.api
import pandas as pd
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
import tqdm

from src.custom_stanza_mlflow import StanzaWrapper
import src.dataframe_preprocessor as dfpp

# Artifact locations, named once so the path written and the path logged
# cannot drift apart.
preprocessed_json = '../data/processed/preprocessed_dataframe.json'
preprocessed_subset_json = '../data/processed/preprocessed_subset_dataframe.json'
vectorizer_pkl = "../joblib/tfidf_transformer_small_test.pkl"
word_matrix_pkl = "../joblib/database_word_matrix_small_test.pkl"

with mlflow.start_run(experiment_id=mlflow_exp_id):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts

    # load raw data and preprocess/clean
    # (the original call was missing the comma between `path` and `mode`,
    # which is a SyntaxError)
    data = dvc.api.read(
        path='../data/raw/recipes-en-201706/epicurious-recipes_m2.json',
        mode='r',
    )
    # Wrap the JSON text in a file-like object: passing a literal JSON string
    # to read_json is deprecated in recent pandas releases.
    raw_df = pd.read_json(io.StringIO(data))

    # pre_proc_df is cleaned dataframe
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)

    # create subset for dev purposes
    to_nlp_df = pre_proc_df[0:50]

    # save and log preprocessed dataframe(s)
    pre_proc_df.to_json(preprocessed_json)
    mlflow.log_artifact(preprocessed_json, artifact_path="preprocessed_dataframes")

    to_nlp_df.to_json(preprocessed_subset_json)
    mlflow.log_artifact(preprocessed_subset_json, artifact_path="preprocessed_dataframes")

    # LOG MODEL
    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # Instantiate sklearn TFIDFVectorizer
    tfidf_vectorizer_model = TfidfVectorizer(**cv_params)

    # Do fit transform on data
    # `tqdm` is imported as a module, so the progress-bar callable is
    # `tqdm.tqdm`; calling the module itself raises TypeError.
    test_tfidf_transform = tfidf_vectorizer_model.fit_transform(
        tqdm.tqdm(to_nlp_df["ingredients"])
    )

    # Dense document-term matrix: one row per recipe in the subset, one
    # column per feature name reported by the fitted vectorizer.
    word_matrix = pd.DataFrame(
        test_tfidf_transform.toarray(),
        columns=tfidf_vectorizer_model.get_feature_names_out(),
        index=to_nlp_df.index,
    )

    # Each pickle is logged only AFTER its `with` block has closed the file,
    # so the bytes are flushed to disk before MLflow reads them.
    with open(vectorizer_pkl, "wb") as fo:
        pickle.dump(tfidf_vectorizer_model, fo)
    mlflow.log_artifact(vectorizer_pkl, artifact_path="sklearn_dill_pkls")

    with open(word_matrix_pkl, "wb") as fo:
        pickle.dump(word_matrix, fo)
    mlflow.log_artifact(word_matrix_pkl, artifact_path="sklearn_dill_pkls")
