# | default_exp core

In [None]:
# | hide
import dagshub
import mlflow
import nbdev
from nbdev.showdoc import *

# | export
def foo():
    """No-op stub: takes no arguments, does nothing, and returns None."""
    return None

In [None]:
# | hide
# this function allows us to get the experiment ID from an experiment name
def get_experiment_id(name):
    """Return the MLflow experiment ID registered under `name`.

    The experiment is looked up with `mlflow.get_experiment_by_name`. When the
    lookup returns None, a new experiment is created with
    `mlflow.create_experiment` and the ID it returns is handed back instead.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    # Nothing is registered under this name yet, so create it on the spot.
    return mlflow.create_experiment(name)

In [None]:
# | hide
# Run nbdev's notebook-to-module export step.
nbdev.nbdev_export()

# | Below this are blocks to use DagsHub with MLflow

In [None]:
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

#@markdown Enter the repo name
DAGSHUB_REPO_NAME = "MeaLeon"

#@markdown Enter the name of the branch you are working on
BRANCH = "STANZA-1/refactor-nltk-stanza"

# Initialise the DagsHub client for the repository identified by owner and name.
dagshub.init(
    repo_owner=DAGSHUB_USER_NAME,
    repo_name=DAGSHUB_REPO_NAME,
)


Token Dagshub OAuth token, valid until 2023-12-02 04:53:30+00:00 does not exist in the storage




Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=5a600d08-8162-43a0-a644-b3f2a53c89c0&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=ac2d0d3591d0ecabd5d482d46fe51d517cf674717987c5cfbd1e46ae6342a1fc




Output()

In [None]:
import os

# Point MLflow at the DagsHub-hosted tracking server for this repository.
# The repo segment comes from DAGSHUB_REPO_NAME so it stays in sync with dagshub.init.
mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# MLflow reads this setting from the process environment. The previous code
# rebound the attribute on `mlflow.environment_variables` to a string holding
# its own name, which replaces MLflow's variable object and configures nothing.
# 'true' keeps the artifact progress bar enabled.
os.environ['MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR'] = 'true'

# starter idea for making an experiment name can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/stanza_quadgrams_small_set_v1"
# Look the experiment up by name, creating it on first use.
mlflow_exp_id = get_experiment_id(experiment_name)

In [None]:
# create pipelines relevant to library used
# MLflow example uses HuggingFace
# below is example for MeaLeon with Stanza and sklearn NLP pipeline

# Imports sit above the run context so that a missing dependency fails fast
# instead of opening an MLflow run that is immediately marked as failed.
from io import StringIO

import dill as pickle  # NOTE: `pickle` below refers to dill, not the stdlib module
import dvc.api
import pandas as pd
import stanza
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from tqdm import tqdm

import src.dataframe_preprocessor as dfpp
from src.custom_stanza_mlflow import StanzaWrapper

with mlflow.start_run(experiment_id=mlflow_exp_id):
    # Fetch the English Stanza resources and build the pipeline used by the analyzer.
    stanza.download('en')
    nlp = stanza.Pipeline('en')

    # cv_params are parameters for the sklearn CountVectorizer or TfidfVectorizer
    # NOTE(review): scikit-learn documents that a callable `analyzer` receives the
    # raw input, so `strip_accents` / `lowercase` may not be applied here - confirm.
    cv_params = {
        'strip_accents': "unicode",
        'lowercase': True,
        'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
        'min_df': 10,
    }

    # pipeline_params are parameters that will be logged in MLflow and are a superset of library parameters
    pipeline_params = {
        'stanza_model': 'en',
        'language': 'english',
        'sklearn-transformer': 'TfidfVectorizer',
    }

    # update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
    pipeline_params.update(cv_params)

    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)
    print('\n')
    print('--------------')
    print("Parameters Logged in MLflow")

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts

    # load raw data and preprocess/clean
    data = dvc.api.read(
        path='../data/recipes-en-201706/epicurious-recipes_m2.json',
        mode='r',
    )
    # Wrap the JSON text in a file-like object: passing a literal JSON string
    # to pd.read_json is deprecated in recent pandas releases.
    raw_df = pd.read_json(StringIO(data))
    print('\n')
    print('--------------')
    print('Raw Dataframe:')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print('Preprocessed Dataframe:')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # create subset for dev purposes (first 50 rows)
    to_nlp_df = pre_proc_df[0:50]

    # save and log preprocessed dataframe(s)
    pre_proc_df.to_json('../data/processed/preprocessed_dataframe.json')
    mlflow.log_artifact('../data/processed/preprocessed_dataframe.json', artifact_path="preprocessed_dataframes")

    to_nlp_df.to_json('../data/processed/preprocessed_subset_dataframe.json')
    mlflow.log_artifact('../data/processed/preprocessed_subset_dataframe.json', artifact_path="preprocessed_dataframes")

    print('\n')
    print('--------------')
    print('Dataframes logged as MLflow artifacts')

    # LOG MODEL
    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # Instantiate sklearn TfidfVectorizer
    tfidf_vectorizer_model = TfidfVectorizer(**cv_params)

    # Do fit transform on data (tqdm only wraps the iterable to show progress)
    test_tfidf_transform = tfidf_vectorizer_model.fit_transform(tqdm(to_nlp_df["ingredients"]))

    # Dense document-term table: one row per recipe, one column per extracted feature.
    word_matrix = pd.DataFrame(
        test_tfidf_transform.toarray(),
        columns=tfidf_vectorizer_model.get_feature_names_out(),
        index=to_nlp_df.index,
    )

    # Each pickle is uploaded only after its `with open(...)` block has exited,
    # so the file is closed and fully flushed before MLflow reads it from disk.
    vectorizer_pkl_path = "../joblib/tfidf_transformer_small_test.pkl"
    with open(vectorizer_pkl_path, "wb") as fo:
        pickle.dump(tfidf_vectorizer_model, fo)
    mlflow.log_artifact(vectorizer_pkl_path, artifact_path="sklearn_dill_pkls")

    word_matrix_pkl_path = "../joblib/database_word_matrix_small_test.pkl"
    with open(word_matrix_pkl_path, "wb") as fo:
        pickle.dump(word_matrix, fo)
    mlflow.log_artifact(word_matrix_pkl_path, artifact_path="sklearn_dill_pkls")

    print('\n')
    print('--------------')
    print('sklearn dill/pkls logged as MLflow artifacts')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-01 23:28:04 INFO: Downloading default packages for language: en (English) ...
2023-12-01 23:28:05 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2023-12-01 23:28:08 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2023-12-01 23:28:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-01 23:28:09 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-12-01 23:28:09 INFO: Using device: cuda
2023-12-01 23:28:09 INFO: Loading: tokenize
2023-12-01 23:28:09 INFO: Loading: pos
2023-12-01 23:28:09 INFO: Loading: lemma
2023-12-01 23:28:09 INFO: Loading: constituency
2023-12-01 23:28:10 INFO: Loading: depparse
2023-12-01 23:28:10 INFO: Loading: sentiment
2023-12-01 23:28:10 INFO: Loading: ner
2023-12-01 23:28:11 INFO: Done loading processors!




--------------
Parameters Logged in MLflow


--------------
Raw Dataframe:
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09

100%|██████████| 50/50 [00:30<00:00,  1.66it/s]




--------------
sklearn dill/pkls logged as MLflow artifacts


In [None]:
import os

# Point MLflow at the DagsHub-hosted tracking server for this repository.
# The repo segment comes from DAGSHUB_REPO_NAME so it stays in sync with dagshub.init.
mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# MLflow reads this setting from the process environment. The previous code
# rebound the attribute on `mlflow.environment_variables` to a string holding
# its own name, which replaces MLflow's variable object and configures nothing.
# 'true' keeps the artifact progress bar enabled.
os.environ['MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR'] = 'true'

# starter idea for making an experiment name can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/stanza_quadgrams_full_set_v1"
# Look the experiment up by name, creating it on first use.
mlflow_exp_id = get_experiment_id(experiment_name)

In [None]:
# create pipelines relevant to library used
# MLflow example uses HuggingFace
# below is example for MeaLeon with Stanza and sklearn NLP pipeline

# Imports sit above the run context so that a missing dependency fails fast
# instead of opening an MLflow run that is immediately marked as failed.
from io import StringIO

import dill as pickle  # NOTE: `pickle` below refers to dill, not the stdlib module
import dvc.api
import pandas as pd
import stanza
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from tqdm import tqdm

import src.dataframe_preprocessor as dfpp
from src.custom_stanza_mlflow import StanzaWrapper

with mlflow.start_run(experiment_id=mlflow_exp_id):
    # Fetch the English Stanza resources and build the pipeline used by the analyzer.
    stanza.download('en')
    nlp = stanza.Pipeline('en')

    # cv_params are parameters for the sklearn CountVectorizer or TfidfVectorizer
    # NOTE(review): scikit-learn documents that a callable `analyzer` receives the
    # raw input, so `strip_accents` / `lowercase` may not be applied here - confirm.
    cv_params = {
        'strip_accents': "unicode",
        'lowercase': True,
        'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
        'min_df': 10,
    }

    # pipeline_params are parameters that will be logged in MLflow and are a superset of library parameters
    pipeline_params = {
        'stanza_model': 'en',
        'language': 'english',
        'sklearn-transformer': 'TfidfVectorizer',
    }

    # update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
    pipeline_params.update(cv_params)

    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)
    print('\n')
    print('--------------')
    print("Parameters Logged in MLflow")

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts

    # load raw data and preprocess/clean
    data = dvc.api.read(
        path='../data/recipes-en-201706/epicurious-recipes_m2.json',
        mode='r',
    )
    # Wrap the JSON text in a file-like object: passing a literal JSON string
    # to pd.read_json is deprecated in recent pandas releases.
    raw_df = pd.read_json(StringIO(data))
    print('\n')
    print('--------------')
    print('Raw Dataframe:')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print('Preprocessed Dataframe:')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # This run vectorises the full preprocessed dataframe (no development subset).

    # save and log preprocessed dataframe(s)
    pre_proc_df.to_json('../data/processed/preprocessed_dataframe.json')
    mlflow.log_artifact('../data/processed/preprocessed_dataframe.json', artifact_path="preprocessed_dataframes")

    print('\n')
    print('--------------')
    print('Dataframes logged as MLflow artifacts')

    # LOG MODEL
    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # Instantiate sklearn TfidfVectorizer
    tfidf_vectorizer_model = TfidfVectorizer(**cv_params)

    # Do fit transform on data (tqdm only wraps the iterable to show progress)
    test_tfidf_transform = tfidf_vectorizer_model.fit_transform(tqdm(pre_proc_df["ingredients"]))

    # Dense document-term table: one row per recipe, one column per extracted feature.
    # NOTE(review): .toarray() densifies the whole matrix; on the full corpus this
    # may be memory-heavy - confirm this is acceptable.
    word_matrix = pd.DataFrame(
        test_tfidf_transform.toarray(),
        columns=tfidf_vectorizer_model.get_feature_names_out(),
        index=pre_proc_df.index,
    )

    print('\n')
    print('--------------')
    print('Word Matrix:')
    print(word_matrix.head())

    # Each pickle is uploaded only after its `with open(...)` block has exited,
    # so the file is closed and fully flushed before MLflow reads it from disk.
    vectorizer_pkl_path = "../joblib/tfidf_transformer.pkl"
    with open(vectorizer_pkl_path, "wb") as fo:
        pickle.dump(tfidf_vectorizer_model, fo)
    mlflow.log_artifact(vectorizer_pkl_path, artifact_path="sklearn_dill_pkls")

    word_matrix_pkl_path = "../joblib/database_word_matrix.pkl"
    with open(word_matrix_pkl_path, "wb") as fo:
        pickle.dump(word_matrix, fo)
    mlflow.log_artifact(word_matrix_pkl_path, artifact_path="sklearn_dill_pkls")

    print('\n')
    print('--------------')
    print('sklearn dill/pkls logged as MLflow artifacts')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-02 21:59:14 INFO: Downloading default packages for language: en (English) ...
2023-12-02 21:59:16 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2023-12-02 21:59:20 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2023-12-02 21:59:20 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-02 21:59:21 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-12-02 21:59:21 INFO: Using device: cuda
2023-12-02 21:59:21 INFO: Loading: tokenize
2023-12-02 21:59:25 INFO: Loading: pos
2023-12-02 21:59:26 INFO: Loading: lemma
2023-12-02 21:59:26 INFO: Loading: constituency
2023-12-02 21:59:26 INFO: Loading: depparse
2023-12-02 21:59:26 INFO: Loading: sentiment
2023-12-02 21:59:27 INFO: Loading: ner
2023-12-02 21:59:27 INFO: Done loading processors!




--------------
Parameters Logged in MLflow


--------------
Raw Dataframe:
(34756, 15)


--------------
Preprocessed Dataframe:
(34656, 13)


--------------
Dataframes logged as MLflow artifacts


100%|██████████| 34656/34656 [4:27:23<00:00,  2.16it/s]   




--------------
Word Matrix:


--------------
sklearn dill/pkls logged as MLflow artifacts
