# | default_exp core

In [None]:
# | hide
from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer
import dagshub
from datetime import datetime
import dill as pickle
import dvc.api
from hdbscan import HDBSCAN
from itertools import tee, islice
import mlflow
import nbdev
from nbdev.showdoc import *
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import (
    CountVectorizer
    , TfidfTransformer
    , TfidfVectorizer
    ,
)
from sklearn.pipeline import make_pipeline
from src.custom_stanza_mlflow import StanzaWrapper
import src.dataframe_preprocessor as dfpp
import stanza
import tqdm
from umap import UMAP

In [None]:
!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128'

# | export
def foo():
    """Placeholder function: does nothing and returns None."""
    return None

In [None]:
# | hide
# this function allows us to get the experiment ID from an experiment name
def get_experiment_id(name):
    """Return the MLflow experiment ID registered under `name`.

    If no experiment with that name exists yet, one is created and the ID of
    the newly created experiment is returned instead.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    return mlflow.create_experiment(name)

In [None]:
def _iter_ngrams(terms, minNgramLength, maxNgramLength):
    """Yield every space-joined n-gram of `terms`, shortest lengths first.

    Each length from `minNgramLength` to `maxNgramLength` (inclusive) is
    produced; a length larger than `len(terms)` contributes nothing.
    """
    for ngramLength in range(minNgramLength, maxNgramLength + 1):
        # tee() hands out ngramLength independent iterators over the terms and
        # islice() starts the i-th one i items in, so zip() walks a sliding
        # window without building one shifted copy of the list per offset.
        shifted = [islice(seq, i, None) for i, seq in enumerate(tee(terms, ngramLength))]
        for ngram in zip(*shifted):
            yield " ".join(ngram)


def custom_analyzer(step_list, stanza_pipeline, minNgramLength, maxNgramLength):
    """Turn one recipe's text fragments into a single n-gram document string.

    The fragments are lower-cased and sent through `stanza_pipeline` in one
    call (joined with a " brk " sentinel so they can be separated again
    afterwards). Each fragment is then reduced to its purely alphabetic words
    of at least two letters, and every n-gram of those words is collected.

    Parameters
    ----------
    step_list : iterable of str, str, or None
        The fragments (e.g. preparation steps or ingredient lines) of one
        recipe. A plain string is treated as a single fragment instead of
        being iterated character by character. `None` entries are skipped.
    stanza_pipeline : callable
        Called with the joined text; the result must expose
        `.sentences[*].words[*].text`.
    minNgramLength, maxNgramLength : int
        Inclusive range of n-gram lengths to generate.

    Returns
    -------
    str
        All n-grams, space-separated, ordered fragment by fragment and, within
        a fragment, by increasing n-gram length. An empty string when there is
        nothing to analyze (never `None`).
    """
    if step_list is None:
        return ""
    if isinstance(step_list, str):
        step_list = [step_list]

    lowered = " brk ".join(str(step) for step in step_list if step is not None).lower()
    if not lowered.strip():
        # Nothing to analyze; avoid calling the pipeline on empty text.
        return ""

    preproc = stanza_pipeline(lowered)

    # NOTE(review): this keeps the surface form (`word.text`), not `word.lemma`,
    # so no lemmatization happens here -- confirm that is intended.
    tokenized = " ".join(
        str(word.text)
        for sent in preproc.sentences
        for word in sent.words
        if word is not None
    )

    # Raw string is required: in a normal string literal "\b" is a backspace
    # character, not the regex word-boundary assertion.
    at_least_two_english_characters_whole_words = re.compile(r"(?u)\b[a-zA-Z]{2,}\b")

    ngrams = []
    # analyze each fragment of the input separately so that no n-gram spans
    # two fragments
    for ln in tokenized.split(" brk "):
        # findall() returns the matching words themselves; split() would
        # return the text *between* them.
        terms = at_least_two_english_characters_whole_words.findall(ln)
        ngrams.extend(_iter_ngrams(terms, minNgramLength, maxNgramLength))

    return " ".join(ngrams)


In [None]:
# | hide
# Run nbdev's export step so the exported cells are written out to the
# package module.
nbdev.nbdev_export()

# | Below this are blocks to use DagsHub with MLflow

In [None]:
# DagsHub connection settings used by the MLflow cells below.
# The `#@markdown` / `#@param` comments are Colab form annotations; keep them
# on the same lines as the values they describe.

#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

#@markdown Enter the repo name 
DAGSHUB_REPO_NAME="MeaLeon"

#@markdown Enter the name of the branch you are working on 
BRANCH="STANZA-2/investigate_bertopic_compatibility"

# Initialise the DagsHub client for <DAGSHUB_USER_NAME>/<DAGSHUB_REPO_NAME>
dagshub.init(repo_name=DAGSHUB_REPO_NAME
             , repo_owner=DAGSHUB_USER_NAME)


In [None]:
# Fetch the English Stanza models, then build the pipeline object (`nlp`)
# that the analyzer cells below share. The pipeline is forced onto the CPU.
stanza.download('en')

stanza_pipeline_options = {
    'depparse_batch_size': 50,
    'depparse_min_length_to_batch_separately': 50,
    'verbose': True,
    'use_gpu': False,
    # 'batch_size': 100,
}
nlp = stanza.Pipeline('en', **stanza_pipeline_options)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-12 15:13:48 INFO: Downloading default packages for language: en (English) ...
2023-12-12 15:13:50 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2023-12-12 15:13:54 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2023-12-12 15:13:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-12 15:13:55 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-12-12 15:13:55 INFO: Using device: cpu
2023-12-12 15:13:55 INFO: Loading: tokenize
2023-12-12 15:13:55 INFO: Loading: pos
2023-12-12 15:13:55 INFO: Loading: lemma
2023-12-12 15:13:55 INFO: Loading: constituency
2023-12-12 15:13:56 INFO: Loading: depparse
2023-12-12 15:13:56 INFO: Loading: sentiment
2023-12-12 15:13:56 INFO: Loading: ner
2023-12-12 15:13:57 INFO: Done loading processors!


In [None]:
# Point MLflow at this repo's DagsHub-hosted tracking server. The repo name
# comes from the config cell instead of being repeated as a literal.
mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# starter idea for making an experiment name can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/bertopic_stanza_small_set_v1"
# Look the experiment up by name, creating it on first use
mlflow_exp_id = get_experiment_id(experiment_name)

# run_id of an earlier run that has the logged info needed
mlflow_run_id = 'c6fbcf396af34ee3aade5503ee01c2bb'

In [None]:
# Client for this repo's DagsHub-hosted MLflow tracking server
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# cv_params are keyword arguments for the count vectorizer that re-labels the
# topics after fitting (see update_topics below)
cv_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df':3,
}

# bertopic_params are keyword arguments for the BERTopic constructor;
# they are kept separate from cv_params (the two are only merged for logging)
bertopic_params = {
    'top_n_words':20,
    'min_topic_size':5,
    'nr_topics':'auto',
    'verbose':True,
    'low_memory':True,
    'calculate_probabilities':True
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
# NOTE(review): 'sklearn-transformer' is logged as 'TfidfVectorizer' although
# this cell only instantiates OnlineCountVectorizer -- confirm the intended value.
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TfidfVectorizer'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(cv_params)
pipeline_params.update(bertopic_params)

with mlflow.start_run(experiment_id=mlflow_exp_id):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # load the DVC-tracked raw data and preprocess/clean
    data = dvc.api.read(
           path='../data/recipes-en-201706/epicurious-recipes_m2.json'
           , mode='r')
    raw_df = pd.read_json(data)
    print('\n')
    print('--------------')
    print('Raw Dataframe:')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    # (alternative: read the cached copy with mlflow.artifacts.download_artifacts(
    #  run_id=mlflow_run_id,
    #  artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json'))
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print('Preprocessed Dataframe:')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # create subset for dev purposes
    to_nlp_df = pre_proc_df[0:50]
    print('\n')
    print('-' * 80)
    print('Subset Dataframe:')
    print(to_nlp_df.head())
    print(to_nlp_df.shape)

    # LOG MODEL
    # Instantiate BERTopic
    topic_model = BERTopic(
        **bertopic_params,
    )

    # Build one document string per recipe with the notebook-level
    # custom_analyzer. It is intentionally not redefined here: a second copy
    # inside this cell would rebind the global name for every later cell.
    analyzer_kwargs = {'stanza_pipeline': nlp
                       , 'minNgramLength': 1
                       , 'maxNgramLength': 4}

    recipe_steps = to_nlp_df["prepSteps"].apply(custom_analyzer, **analyzer_kwargs)

    print('\n')
    print('-' * 80)
    print('Recipe steps:')
    print(recipe_steps)

    # train on the recipes' steps
    topics, probs = topic_model.fit_transform(recipe_steps)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # BERTopic's OnlineCountVectorizer is used here in place of sklearn's CountVectorizer
    steps_vectorizer_model = OnlineCountVectorizer(**cv_params)

    # Recompute the topic representations with the Stanza-based vectorizer
    topic_model.update_topics(
        recipe_steps
        , vectorizer_model=steps_vectorizer_model
    )

    # Display topic model results
    print('\n')
    print('-' * 80)
    print('BERTopic Model Dataframe:')
    print(topic_model.get_topic_info())

    print('\n')
    print('-' * 80)
    print('BERTopic Model Representations:')
    print(topic_model.get_topic_info()['Representation'])

    print('\n')
    print('-' * 80)
    # This section prints the representative documents, so label it as such
    print('BERTopic Model Representative Docs:')
    print(topic_model.get_topic_info()['Representative_Docs'])

    # Save and log the topic model dataframe
    topic_model.get_topic_info().to_json('../data/processed/bertopic_model_small_set_df.json')
    mlflow.log_artifact('../data/processed/bertopic_model_small_set_df.json',
                        artifact_path='bertopic_models')


In [None]:
# Client for this repo's DagsHub-hosted MLflow tracking server
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# cv_params are keyword arguments for the count vectorizer that re-labels the
# topics after fitting (see update_topics below)
cv_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df':10,
}

# bertopic_params are keyword arguments for the BERTopic constructor;
# they are kept separate from cv_params (the two are only merged for logging)
bertopic_params = {
    'top_n_words':20,
    'min_topic_size':10,
    'nr_topics':'auto',
    'verbose':True,
    'low_memory':True,
    'calculate_probabilities':True
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
# NOTE(review): 'sklearn-transformer' is logged as 'TfidfVectorizer' although
# this cell only instantiates OnlineCountVectorizer -- confirm the intended value.
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TfidfVectorizer'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(cv_params)
pipeline_params.update(bertopic_params)

with mlflow.start_run(experiment_id=get_experiment_id(f"{DAGSHUB_EMAIL}/bertopic_stanza_ingreds_full_set_v1")):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # load the DVC-tracked raw data and preprocess/clean
    data = dvc.api.read(
           path='../data/recipes-en-201706/epicurious-recipes_m2.json'
           , mode='r')
    raw_df = pd.read_json(data)
    print('\n')
    print('--------------')
    print('Raw Dataframe:')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    # (alternative: read the cached copy with mlflow.artifacts.download_artifacts(
    #  run_id=mlflow_run_id,
    #  artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json'))
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print('Preprocessed Dataframe:')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # This run uses the full preprocessed dataframe (no dev subset)

    # LOG MODEL
    # Instantiate BERTopic
    topic_model = BERTopic(
        **bertopic_params,
    )

    # Build one document string per recipe with the notebook-level
    # custom_analyzer. It is intentionally not redefined here: a second copy
    # inside this cell would rebind the global name for every later cell.
    analyzer_kwargs = {'stanza_pipeline': nlp
                       , 'minNgramLength': 1
                       , 'maxNgramLength': 4}

    recipe_ingreds = pre_proc_df["ingredients"].apply(custom_analyzer, **analyzer_kwargs)

    print('\n')
    print('-' * 80)
    print('Recipe ingredients:')
    print(recipe_ingreds)

    # train on the recipes' ingredients
    topics, probs = topic_model.fit_transform(recipe_ingreds)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # BERTopic's OnlineCountVectorizer is used here in place of sklearn's CountVectorizer
    steps_vectorizer_model = OnlineCountVectorizer(**cv_params)

    # Recompute the topic representations with the Stanza-based vectorizer
    topic_model.update_topics(
        recipe_ingreds
        , vectorizer_model=steps_vectorizer_model
    )

    # Display topic model results
    print('\n')
    print('-' * 80)
    print('BERTopic Model Dataframe:')
    print(topic_model.get_topic_info())

    print('\n')
    print('-' * 80)
    print('BERTopic Model Representations:')
    print(topic_model.get_topic_info()['Representation'])

    print('\n')
    print('-' * 80)
    # This section prints the representative documents, so label it as such
    print('BERTopic Model Representative Docs:')
    print(topic_model.get_topic_info()['Representative_Docs'])

    # Save and log the topic model dataframe
    topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')
    mlflow.log_artifact('../data/processed/bertopic_model_ingreds_full_set_df.json',
                        artifact_path='bertopic_models')


### Attempt run with lighter weight configuration
#### This attempt will still use Stanza processing on the ingredients 

In [None]:
# Client for this repo's DagsHub-hosted MLflow tracking server
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# sklearn_nlp_params are keyword arguments for the TfidfVectorizer that
# produces the document embeddings in this lightweight configuration
sklearn_nlp_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df':10,
}

# bertopic_params are keyword arguments for the BERTopic constructor;
# they are kept separate from sklearn_nlp_params (only merged for logging)
bertopic_params = {
    'top_n_words':20,
    'min_topic_size':10,
    'nr_topics':50,
    'verbose':True,
    'low_memory':True,
    'calculate_probabilities':True,
    # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TfidfVectorizer'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(sklearn_nlp_params)
pipeline_params.update(bertopic_params)

with mlflow.start_run(experiment_id=get_experiment_id(f"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1")):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # load the DVC-tracked raw data and preprocess/clean
    data = dvc.api.read(
           path='../data/recipes-en-201706/epicurious-recipes_m2.json'
           , mode='r')
    raw_df = pd.read_json(data)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Raw Dataframe: ')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    # (alternative: read the cached copy with mlflow.artifacts.download_artifacts(
    #  run_id=mlflow_run_id,
    #  artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json'))
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Preprocessed Dataframe:')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # create subset for dev purposes
    to_nlp_df = pre_proc_df[0:100]
    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, Subset Dataframe:')
    print(to_nlp_df.head())
    print(to_nlp_df.shape)

    # LOG MODEL
    # Instantiate BERTopic
    topic_model = BERTopic(
        **bertopic_params
    )

    analyzer_kwargs = {'stanza_pipeline': nlp
                       , 'minNgramLength': 1
                       , 'maxNgramLength': 4}

    # One document string per recipe, built from its ingredient lines
    recipe_ingreds = to_nlp_df["ingredients"].apply(custom_analyzer, **analyzer_kwargs)

    # Create TF-IDF embeddings (used instead of BERTopic's default embedding model)
    vectorizer = TfidfVectorizer(**sklearn_nlp_params)
    embeddings = vectorizer.fit_transform(recipe_ingreds)

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, Recipe ingredients:')
    print(recipe_ingreds)

    # train on the recipes' ingredients, passing the precomputed embeddings
    topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # Instantiate sklearn CountVectorizer (min_df is left at its default here)
    sklearn_cv_params = {
        'strip_accents':"unicode",
        'lowercase':True,
        'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
        # 'min_df':10,
    }
    steps_vectorizer_model = CountVectorizer(**sklearn_cv_params)

    # Recompute the topic representations with the Stanza-based vectorizer
    topic_model.update_topics(
        recipe_ingreds
        , vectorizer_model=steps_vectorizer_model
    )

    # Display topic model results
    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Dataframe:')
    print(topic_model.get_topic_info())

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representations:')
    print(topic_model.get_topic_info()['Representation'])

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representative Docs:')
    print(topic_model.get_topic_info()['Representative_Docs'])

    # Save and log the topic model dataframe. This is a 100-recipe
    # lightweight run, so it gets its own file name instead of overwriting
    # the full-set output written by the full-set ingredients run.
    topic_info_path = '../data/processed/bertopic_model_lightweight_ingreds_small_set_df.json'
    topic_model.get_topic_info().to_json(topic_info_path)
    mlflow.log_artifact(topic_info_path,
                        artifact_path='bertopic_models')




--------------
2023-12-13 16:53:21.010671, Raw Dataframe: 
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000

2023-12-13 17:20:01,611 - BERTopic - Reduced dimensionality
2023-12-13 17:20:01,626 - BERTopic - Clustered reduced embeddings
2023-12-13 17:20:01,637 - BERTopic - Reduced number of topics from 5 to 5




--------------------------------------------------------------------------------
2023-12-13 17:29:06.479388, BERTopic Model Dataframe:
   Topic  Count                                               Name  \
0     -1      5  -1_o None and None e None None p None u None r...   
1      0     19  0_None_3 None None None 2 None None c None u N...   
2      1     51  1_for_1 None None t None a None billion None N...   
3      2     12  2_3 None None p None o None u None and None wo...   
4      3     13  3_1 None None None None None c None u None p N...   

                                      Representation  \
0  [o None and None e None None p None u None r N...   
1  [None, 3 None None None 2 None None c None u N...   
2  [for, 1 None None t None a None billion None N...   
3  [3 None None p None o None u None and None wou...   
4  [1 None None None None None c None u None p No...   

                                 Representative_Docs  
0  [one purchased 9 - inch angel food cake, 1 pou.

In [None]:
topic_model.get_topic_info()['Representation']

0    [o None and None e None None p None u None r N...
1    [None, 3 None None None 2 None None c None u N...
2    [for, 1 None None t None a None billion None N...
3    [3 None None p None o None u None and None wou...
4    [1 None None None None None c None u None p No...
Name: Representation, dtype: object

In [None]:
topic_model.get_topic_info()['Representation'][0]

['o None and None e None None p None u None r None c None h None a None s None e None would None None 9 None None None I None and None c None h None None a None and None gram None e None None None None for None o None o None would None None c None a None thousand None e None None 1 None 4 None None t None s None p None None s None a None for None for None r None o None and None None t None h None r None e None a None would None s None None 2 None 3 None None o None z None None would None r None I None e None would None None million None o None r None e None None None None o None r None None p None o None r None c None I None and None I None None million None u None s None h None r None o None o None million None s None None 1 None None p None o None u None and None would None None e None a None c None h None None s None with None e None e None t None None a None and None would None None h None o None t None None I None t None a None None None I None a None and None None s None a None u

In [None]:
# Client for this repo's DagsHub-hosted MLflow tracking server
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# sklearn_nlp_params are keyword arguments for the TfidfVectorizer that
# produces the document embeddings in this lightweight configuration
sklearn_nlp_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df':10,
}

# bertopic_params are keyword arguments for the BERTopic constructor;
# they are kept separate from sklearn_nlp_params (only merged for logging)
bertopic_params = {
    'top_n_words':20,
    'min_topic_size':10,
    'nr_topics':50,
    'verbose':True,
    'low_memory':True,
    'calculate_probabilities':True,
    # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TfidfVectorizer'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(sklearn_nlp_params)
pipeline_params.update(bertopic_params)

with mlflow.start_run(experiment_id=get_experiment_id(f"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1")):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # load the DVC-tracked raw data and preprocess/clean
    data = dvc.api.read(
           path='../data/recipes-en-201706/epicurious-recipes_m2.json'
           , mode='r')
    raw_df = pd.read_json(data)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Raw Dataframe: ')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    # (alternative: read the cached copy with mlflow.artifacts.download_artifacts(
    #  run_id=mlflow_run_id,
    #  artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json'))
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Preprocessed Dataframe:')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # create subset for dev purposes
    to_nlp_df = pre_proc_df[0:100]
    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, Subset Dataframe:')
    print(to_nlp_df.head())
    print(to_nlp_df.shape)

    # LOG MODEL
    # Instantiate BERTopic
    topic_model = BERTopic(
        **bertopic_params
    )

    analyzer_kwargs = {'stanza_pipeline': nlp
                       , 'minNgramLength': 1
                       , 'maxNgramLength': 4}

    # One document string per recipe, built from its ingredient lines
    recipe_ingreds = to_nlp_df["ingredients"].apply(custom_analyzer, **analyzer_kwargs)

    # Create TF-IDF embeddings (used instead of BERTopic's default embedding model)
    vectorizer = TfidfVectorizer(**sklearn_nlp_params)
    embeddings = vectorizer.fit_transform(recipe_ingreds)

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, Recipe ingredients:')
    print(recipe_ingreds)

    # train on the recipes' ingredients, passing the precomputed embeddings
    topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)

    # Instantiate sklearn CountVectorizer with its default analyzer. The
    # Stanza analyzer is deliberately left out for the topic representations:
    # with it, the previous run's representations came out as single
    # characters interleaved with 'None'.
    sklearn_cv_params = {
        'strip_accents':"unicode",
        'lowercase':True,
        # 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
        # 'min_df':10,
    }
    steps_vectorizer_model = CountVectorizer(**sklearn_cv_params)

    # Recompute the topic representations with the plain count vectorizer
    topic_model.update_topics(
        recipe_ingreds
        , vectorizer_model=steps_vectorizer_model
    )

    # Display topic model results
    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Dataframe:')
    print(topic_model.get_topic_info())

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representations:')
    print(topic_model.get_topic_info()['Representation'])

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representative Docs:')
    print(topic_model.get_topic_info()['Representative_Docs'])

    # Save and log the topic model dataframe. This is a 100-recipe
    # lightweight run, so it gets its own file name instead of overwriting
    # the full-set output written by the full-set ingredients run.
    topic_info_path = '../data/processed/bertopic_model_lightweight_ingreds_small_set_df.json'
    topic_model.get_topic_info().to_json(topic_info_path)
    mlflow.log_artifact(topic_info_path,
                        artifact_path='bertopic_models')




--------------
2023-12-13 17:55:54.774069, Raw Dataframe: 
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000

2023-12-13 18:16:51,525 - BERTopic - Reduced dimensionality
2023-12-13 18:16:51,537 - BERTopic - Clustered reduced embeddings
2023-12-13 18:16:51,549 - BERTopic - Reduced number of topics from 3 to 3




--------------------------------------------------------------------------------
2023-12-13 18:16:51.565220, BERTopic Model Dataframe:
   Topic  Count                          Name  \
0      0     19        0_12_all_purpose_flour   
1      1     52  1_tablespoons_or_dried_about   
2      2     29        2_oil_12_cups_nonstick   

                                      Representation  \
0  [12, all, purpose, flour, cups, pounds, in, 14...   
1  [tablespoons, or, dried, about, large, and, di...   
2  [oil, 12, cups, nonstick, eggs, spray, vegetab...   

                                 Representative_Docs  
0  [2 1/4 cups all purpose flour, 2 1/2 cups all ...  
1  [4 large dried guajillo chiles or dried new me...  
2  [nonstick vegetable oil spray, nonstick vegeta...  


--------------------------------------------------------------------------------
2023-12-13 18:16:51.576763, BERTopic Model Representations:
0    [12, all, purpose, flour, cups, pounds, in, 14...
1    [tablespoons, or, 

In [None]:
# Small-set (first 100 recipes) BERTopic run on recipe ingredients, tracked in MLflow.
#
# NOTE(review): the saved output of this cell ends with
#   "ValueError: empty vocabulary; perhaps the documents only contain stop words"
# right after BERTopic's "Reduced number of topics" log line. The raising call is
# not visible in the saved output -- investigate before relying on this run.

# Client for the DagsHub-hosted MLflow tracking server
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')

# Keyword arguments for the TfidfVectorizer that builds the document embeddings below
sklearn_nlp_params = {
    'strip_accents': "unicode",
    'lowercase': True,
    'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df': 10,
}

# Alternative (disabled): sklearn pipeline as in BERTopic's lightweight configuration
# pipe = make_pipeline(
#     TfidfVectorizer(**sklearn_nlp_params),
#     TruncatedSVD(100)
# )

# Keyword arguments for the BERTopic constructor
bertopic_params = {
    # 'embedding_model': TfidfVectorizer(**sklearn_nlp_params),
    'top_n_words': 20,
    'min_topic_size': 10,
    'nr_topics': 50,
    'verbose': True,
    'low_memory': True,
    'calculate_probabilities': True,
    # 'min_cluster_size': 10 # Possibly only works if modifying individual HDBSCAN component of BERTopic
}

# pipeline_params is what gets logged to MLflow: pipeline-level labels plus every
# library-specific setting, so that they all show up in MLflow Tracking.
# NOTE(review): the 'analyzer' entry is a Python object rather than a plain value;
# check how it renders in the MLflow UI.
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TfidfVectorizer'
}
pipeline_params.update(sklearn_nlp_params)
pipeline_params.update(bertopic_params)

with mlflow.start_run(experiment_id=get_experiment_id(f"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_small_set_v1.01")):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts

    # load raw data (DVC-tracked file, read in text mode) and preprocess/clean
    data = dvc.api.read(
        path='../data/recipes-en-201706/epicurious-recipes_m2.json',
        mode='r')
    raw_df = pd.read_json(data)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Raw Dataframe: ', end='\n')
    print(raw_df.head())
    print(raw_df.shape)

    # pre_proc_df is cleaned dataframe
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Preprocessed Dataframe:', end='\n')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # Alternative (disabled): reuse a preprocessed dataframe stored as an MLflow artifact
    # pre_proc_df = pd.read_json(
    #     mlflow.artifacts.download_artifacts(
    #         run_id=mlflow_run_id,
    #         artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json',
    #         # tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'
    #     )
    # )

    # create subset for dev purposes
    to_nlp_df = pre_proc_df[0:100]
    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, Subset Dataframe:', end='\n')
    print(to_nlp_df.head())
    print(to_nlp_df.shape)

    # LOG MODEL
    # Instantiate BERTopic
    topic_model = BERTopic(**bertopic_params)

    analyzer_kwargs = {
        'stanza_pipeline': nlp,
        'minNgramLength': 1,
        'maxNgramLength': 4,
    }

    # Run the custom analyzer over each recipe's ingredient list
    recipe_ingreds = to_nlp_df["ingredients"].apply(custom_analyzer, **analyzer_kwargs)

    # Create TF-IDF embeddings
    vectorizer = TfidfVectorizer(**sklearn_nlp_params)
    embeddings = vectorizer.fit_transform(recipe_ingreds)

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, Recipe ingredients:', end='\n')
    print(recipe_ingreds)

    # train on the recipes' ingredients, using the precomputed TF-IDF embeddings
    topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # Instantiate sklearn CountVectorizer used to re-derive the topic representations
    sklearn_cv_params = {
        'strip_accents': "unicode",
        'lowercase': True,
        # 'analyzer': StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
        # 'min_df':10,
        # 'token_pattern': "(?u)\b[a-zA-Z]{2,}\b"
    }
    steps_vectorizer_model = CountVectorizer(**sklearn_cv_params)

    # May need to use BERTopic's OnlineCountVectorizer
    # steps_vectorizer_model = OnlineCountVectorizer(**sklearn_nlp_params)

    topic_model.update_topics(
        recipe_ingreds,
        vectorizer_model=steps_vectorizer_model
    )

    # Build the topic summary once; it is displayed three ways and then saved
    topic_info_df = topic_model.get_topic_info()

    # Display topic model results
    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Dataframe:', end='\n')
    print(topic_info_df)

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representations:', end='\n')
    print(topic_info_df['Representation'])

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representative Docs:', end='\n')
    print(topic_info_df['Representative_Docs'])

    # Save and log the topic model dataframe.
    # This is the 100-row small-set run, so it gets its own file name instead of
    # overwriting the full-set run's 'bertopic_model_ingreds_full_set_df.json'.
    topic_info_path = '../data/processed/bertopic_model_ingreds_small_set_df.json'
    topic_info_df.to_json(topic_info_path)
    mlflow.log_artifact(topic_info_path,
                        artifact_path='bertopic_models')




--------------
2023-12-13 22:25:51.013932, Raw Dataframe: 
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000

2023-12-13 22:37:46,398 - BERTopic - Reduced dimensionality
2023-12-13 22:37:46,404 - BERTopic - Clustered reduced embeddings
2023-12-13 22:37:46,413 - BERTopic - Reduced number of topics from 4 to 4


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
# Full-set BERTopic run on recipe ingredients, tracked in MLflow.

# Tracking client pointed at the project's DagsHub-hosted MLflow server
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow'
)

# Settings handed to the TfidfVectorizer that produces the document embeddings
sklearn_nlp_params = dict(
    strip_accents="unicode",
    lowercase=True,
    analyzer=StanzaWrapper().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    min_df=10,
)

# Disabled alternative, kept for reference: BERTopic's lightweight configuration as
# make_pipeline(TfidfVectorizer(**sklearn_nlp_params), TruncatedSVD(100)).

# Settings handed to the BERTopic constructor.
# Disabled options kept for reference:
#   'embedding_model': TfidfVectorizer(**sklearn_nlp_params)
#   'min_cluster_size': 10  (possibly only works when modifying BERTopic's HDBSCAN component)
bertopic_params = dict(
    top_n_words=20,
    min_topic_size=10,
    nr_topics=50,
    verbose=True,
    low_memory=True,
    calculate_probabilities=True,
)

# Everything logged to MLflow Tracking: two pipeline-level labels followed by the
# vectorizer settings and then the BERTopic settings.
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TfidfVectorizer',
    **sklearn_nlp_params,
    **bertopic_params,
}

with mlflow.start_run(
    experiment_id=get_experiment_id(f"{DAGSHUB_EMAIL}/bertopic_lightweight_stanza_ingreds_full_set_v1.00")
):
    # --- parameters ---
    mlflow.log_params(pipeline_params)

    # --- inputs ---
    # (the MLflow example uses a list of strings or a list of str->str dicts)

    # Read the DVC-tracked recipe dump in text mode and parse it into a dataframe.
    # NOTE(review): recent pandas releases deprecate passing a JSON string directly
    # to read_json; wrap it in io.StringIO if that warning appears -- confirm
    # against the installed version.
    data = dvc.api.read(path='../data/recipes-en-201706/epicurious-recipes_m2.json', mode='r')
    raw_df = pd.read_json(data)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Raw Dataframe: ')
    print(raw_df.head())
    print(raw_df.shape)

    # Clean the raw frame with the project preprocessor
    pre_proc_df = dfpp.preprocess_dataframe(raw_df)
    print('\n')
    print('--------------')
    print(f'{datetime.now()}, Preprocessed Dataframe:')
    print(pre_proc_df.head())
    print(pre_proc_df.shape)

    # Disabled alternative, kept for reference: load the preprocessed frame from MLflow
    # with pd.read_json(mlflow.artifacts.download_artifacts(run_id=mlflow_run_id,
    # artifact_path='artifacts/preprocessed_dataframes/preprocessed_dataframe.json')).

    # Full-set run: no development subset is taken here; every preprocessed row is analysed.

    # --- model ---
    topic_model = BERTopic(**bertopic_params)

    analyzer_kwargs = dict(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4)

    # Apply the custom analyzer to each recipe's ingredient list
    recipe_ingreds = pre_proc_df["ingredients"].apply(custom_analyzer, **analyzer_kwargs)

    # TF-IDF matrix used as the precomputed embeddings for BERTopic
    vectorizer = TfidfVectorizer(**sklearn_nlp_params)
    embeddings = vectorizer.fit_transform(recipe_ingreds)

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, Recipe ingredients:')
    print(recipe_ingreds)

    # Fit the topic model on the recipes' ingredients
    topics, probs = topic_model.fit_transform(recipe_ingreds, embeddings)

    # Author's note: because a custom Stanza analyzer is involved, logging the model
    # itself requires a custom mlflow.Pyfunc.PythonModel.
    # Plain CountVectorizer settings for the topic representations.
    # Disabled options kept for reference: the Stanza 'analyzer', 'min_df': 10, and
    # 'token_pattern': "(?u)\b[a-zA-Z]{2,}\b".
    sklearn_cv_params = dict(strip_accents="unicode", lowercase=True)
    ingreds_vectorizer_model = CountVectorizer(**sklearn_cv_params)

    # BERTopic's OnlineCountVectorizer(**sklearn_nlp_params) may be needed here instead.

    # Recompute the topic representations with the plain CountVectorizer
    topic_model.update_topics(recipe_ingreds, vectorizer_model=ingreds_vectorizer_model)

    # --- results ---
    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Dataframe:')
    print(topic_model.get_topic_info())

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representations:')
    print(topic_model.get_topic_info()['Representation'])

    print('\n')
    print('-' * 80)
    print(f'{datetime.now()}, BERTopic Model Representative Docs:')
    print(topic_model.get_topic_info()['Representative_Docs'])

    # Persist the topic summary locally, then attach it to the MLflow run
    topic_model.get_topic_info().to_json('../data/processed/bertopic_model_ingreds_full_set_df.json')
    mlflow.log_artifact(
        '../data/processed/bertopic_model_ingreds_full_set_df.json',
        artifact_path='bertopic_models',
    )




--------------
2023-12-13 22:45:20.202117, Raw Dataframe: 
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000

: 