# core

> test

# | default_exp core

In [None]:
# !spacy download en_core_web_sm

In [None]:
# | hide
from nbdev.showdoc import *
import project_path

from bertopic import BERTopic
import dagshub
# from datetime import datetime
from hdbscan import HDBSCAN
import joblib 
import json
# import matplotlib.pyplot as plt
import mlflow
from mlflow.models.signature import infer_signature
import numpy as np
import pandas as pd
# from sklearn.base import TransformerMixin
# from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import (
    CountVectorizer,
    # TfidfTransformer,
    # TfidfVectorizer,
)
from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
import spacy
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
# from spacy.lemmatizer import Lemmatizer
from tqdm import tqdm
from typing import Any
from umap import UMAP

# import local scripts
# import src.nlp_processor as nlpp

# Load the small English spaCy pipeline once; custom_preprocessor relies on
# this module-level object.
nlp = spacy.load("en_core_web_sm")
# Raise the pipeline's max_length limit so long texts are not rejected.
nlp.max_length = 9_000_000

In [None]:
def custom_lemmatizer(ingredients: list) -> list:
    """Return the lemmas of the tokens worth keeping.

    Despite the parameter name, this expects already-tokenized input: every
    element must expose spaCy-style ``is_alpha``, ``pos_`` and ``lemma_``
    attributes (for example, the tokens of a spaCy ``Doc``).

    A token is kept only when all of the following hold:

    * it is flagged as alphabetic (``is_alpha``),
    * its part-of-speech tag is neither ``"PRON"`` nor ``"VERB"``,
    * its lemma is longer than one character.

    Args:
        ingredients: iterable of token-like objects.

    Returns:
        The lemma strings of the surviving tokens, in input order.
    """
    excluded_pos = ("PRON", "VERB")
    return [
        token.lemma_
        for token in ingredients
        if token.is_alpha
        and token.pos_ not in excluded_pos
        and len(token.lemma_) > 1
    ]

In [None]:
def custom_preprocessor(recipe_ingreds: str) -> list:
    """Run the module-level spaCy pipeline on one recipe's ingredient text.

    Written as a replacement for sklearn CountVectorizer's default
    preprocessor, which only strips accents and lowercases. Note that this
    function does neither of those: it returns the spaCy tokens as produced
    by the pipeline.

    Args:
        recipe_ingreds: the ingredient text of a single recipe.

    Returns:
        A list of the tokens yielded by ``nlp(recipe_ingreds)``, in document
        order.
    """
    return list(nlp(recipe_ingreds))

In [None]:
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

DAGSHUB_REPO_NAME = "MeaLeon"
BRANCH = "venv4/add-try-mlflow"

# Initialise DAGsHub from the constants above rather than repeating the
# repository and owner literals.
dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)


In [None]:
def get_experiment_id(name):
    """Return the MLflow experiment id for ``name``, creating it if needed.

    Args:
        name: experiment name to look up.

    Returns:
        The id of the existing experiment, or the id returned by
        ``mlflow.create_experiment`` when no experiment has that name yet.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    # No experiment with this name yet: create one and hand back its id.
    return mlflow.create_experiment(name)

In [None]:
# Input files (relative paths).
raw_data_path = "../data/recipes-en-201706/epicurious-recipes_m2.json"
food_stopwords_path = "../food_stopwords.csv"

# All joblib artifacts below share this dated directory.
joblib_basepath = "../joblib/2022.08.23/"

cv_path = f"{joblib_basepath}countvec.joblib"
tfidf_path = f"{joblib_basepath}tfidf.joblib"
full_df_path = f"{joblib_basepath}recipes_with_cv.joblib"
reduced_df_path = f"{joblib_basepath}reduced_df.joblib"

In [None]:

# "flushtrated" is a pun: it holds every stopword to exclude, i.e. the words
# being flushed down the drain.

# Iterating a DataFrame yields its column labels, so this set is built from
# the CSV's header row only.
# NOTE(review): presumably the stopwords file is a single comma-separated line
# (so the header *is* the word list) -- confirm against the file.
flushtrated = set(pd.read_csv(food_stopwords_path).columns)
# Hand-curated extra stopwords (colours, prep verbs, brand names, units and
# other noise terms) to drop in addition to the CSV and spaCy lists.
# The duplicate "trim" entry has been removed; set contents are unchanged.
additional_to_exclude = {
    "red",
    "green",
    "black",
    "yellow",
    "white",
    "inch",
    "mince",
    "chop",
    "fry",
    "trim",
    "flat",
    "beat",
    "brown",
    "golden",
    "balsamic",
    "halve",
    "blue",
    "divide",
    "unbleache",
    "granulate",
    # NOTE(review): capitalised twin of "frank" further down; likely redundant
    # if tokens are lowercased before stopword filtering -- confirm.
    "Frank",
    "alternative",
    "american",
    "annie",
    "asian",
    "balance",
    "band",
    "barrel",
    "bay",
    "bayou",
    "beam",
    "beard",
    "bell",
    "betty",
    "bird",
    "blast",
    "bob",
    "bone",
    "breyers",
    "calore",
    "carb",
    "card",
    "chachere",
    "change",
    "circle",
    "coffee",
    "coil",
    "country",
    "cow",
    "crack",
    "cracker",
    "crocker",
    "crystal",
    "dean",
    "degree",
    "deluxe",
    "direction",
    "duncan",
    "earth",
    "eggland",
    "ener",
    "envelope",
    "eye",
    "fantastic",
    "far",
    "fat",
    "feather",
    "flake",
    "foot",
    "fourth",
    "frank",
    "french",
    "fusion",
    "genoa",
    "genovese",
    "germain",
    "giada",
    "gold",
    "granule",
    "greek",
    "hamburger",
    "helper",
    "herbe",
    "hines",
    "hodgson",
    "hunt",
    "instruction",
    "interval",
    "italianstyle",
    "jim",
    "jimmy",
    "kellogg",
    "lagrille",
    "lake",
    "land",
    "laurentiis",
    "lawry",
    "lipton",
    "litre",
    "ll",
    "maid",
    "malt",
    "mate",
    "mayer",
    "meal",
    "medal",
    "medallion",
    "member",
    "mexicanstyle",
    "monte",
    "mori",
    "nest",
    "nu",
    "oounce",
    "oscar",
    "ox",
    "paso",
    "pasta",
    "patty",
    "petal",
    "pinche",
    "preserve",
    "quartere",
    "ranch",
    "ranchstyle",
    "rasher",
    "redhot",
    "resemble",
    "rice",
    "ro",
    "roni",
    "scissor",
    "scrap",
    "secret",
    "semicircle",
    "shard",
    "shear",
    "sixth",
    "sliver",
    "smucker",
    "snicker",
    "source",
    "spot",
    "state",
    "strand",
    "sun",
    "supreme",
    "tablepoon",
    "tail",
    "target",
    "tm",
    "tong",
    "toothpick",
    "triangle",
    "trimming",
    "tweezer",
    "valley",
    "vay",
    "wise",
    "wishbone",
    "wrapper",
    "yoplait",
    "ziploc",
}

# Fold spaCy's STOP_WORDS and the hand-curated extras into the stopword set
# in a single union call.
flushtrated = flushtrated.union(STOP_WORDS, additional_to_exclude)
# List form of the merged set (element order is arbitrary).
flushtrated_list = list(flushtrated)

In [None]:
# Load the previously saved recipe DataFrame from its joblib file and display it.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# from a trusted source.
full_df = joblib.load(full_df_path)
full_df

Unnamed: 0,id,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,cuisine_name,photo_filename,...,zest pith,zest vegetable,zinfandel,ziti,zucchini,zucchini blossom,zucchini crookneck,zucchini squash,árbol,árbol pepper
0,54a2b6b019925f464b373351,How does fried chicken achieve No. 1 status? B...,Pickle-Brined Fried Chicken,3.11,"[1 tablespoons yellow mustard seeds, 1 tablesp...",[Toast mustard and coriander seeds in a dry me...,7,100,Missing Cuisine,51247610_fried-chicken_1x1.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,54a408a019925f464b3733bc,Spinaci all'Ebraica,Spinach Jewish Style,3.22,"[3 pounds small-leaved bulk spinach, Salt, 1/2...",[Remove the stems and roots from the spinach. ...,5,80,Italian,EP_12162015_placeholders_rustic.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,54a408a26529d92b2c003631,"This majestic, moist, and richly spiced honey ...",New Year’s Honey Cake,3.62,"[3 1/2 cups all-purpose flour, 1 tablespoon ba...",[I like this cake best baked in a 9-inch angel...,105,88,Kosher,EP_09022015_honeycake-2.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,54a408a66529d92b2c003638,The idea for this sandwich came to me when my ...,The B.L.A.Bagel with Lox and Avocado,4.00,"[1 small ripe avocado, preferably Hass (see No...","[A short time before serving, mash avocado and...",7,100,Kosher,EP_12162015_placeholders_casual.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,54a408a719925f464b3733cc,"In 1930, Simon Agranat, the chief justice of t...",Shakshuka a la Doktor Shakshuka,2.71,"[2 pounds fresh tomatoes, unpeeled and cut in ...","[1. Place the tomatoes, garlic, salt, paprika,...",7,83,Kosher,EP_12162015_placeholders_formal.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34751,59541a31bff3052847ae2107,Buttering the bread before you waffle it ensur...,Waffled Ham and Cheese Melt with Maple Butter,0.00,"[1 tablespoon unsalted butter, at room tempera...","[Preheat the waffle iron on low., Spread a thi...",0,0,Missing Cuisine,waffle-ham-and-cheese-melt-062817.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34752,5954233ad52ca90dc28200e7,"Spread this easy compound butter on waffles, p...",Maple Butter,0.00,"[8 tablespoons (1 stick) salted butter, at roo...",[Combine the ingredients in a medium-size bowl...,0,0,Missing Cuisine,EP_12162015_placeholders_bright.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34753,595424c2109c972493636f83,Leftover mac and cheese is not exactly one of ...,Waffled Macaroni and Cheese,0.00,"[3 tablespoons unsalted butter, plus more for ...",[Preheat the oven to 375°F. Butter a 9x5-inch ...,0,0,Missing Cuisine,waffle-mac-n-cheese-062816.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34754,5956638625dc3d1d829b7166,A classic Mexican beer cocktail you can sip al...,Classic Michelada,0.00,"[Coarse salt, 2 lime wedges, 2 ounces tomato j...",[Place about 1/4 cup salt on a small plate. Ru...,0,0,Missing Cuisine,Classic Michelada 07292017.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Inspect the raw preparation-steps column.
full_df['prepSteps']

0        [Toast mustard and coriander seeds in a dry me...
1        [Remove the stems and roots from the spinach. ...
2        [I like this cake best baked in a 9-inch angel...
3        [A short time before serving, mash avocado and...
4        [1. Place the tomatoes, garlic, salt, paprika,...
                               ...                        
34751    [Preheat the waffle iron on low., Spread a thi...
34752    [Combine the ingredients in a medium-size bowl...
34753    [Preheat the oven to 375°F. Butter a 9x5-inch ...
34754    [Place about 1/4 cup salt on a small plate. Ru...
34755    [Combine the water, honey, rosemary, and grape...
Name: prepSteps, Length: 34656, dtype: object

In [None]:
# Look at the entry labelled 0 to see how one recipe's steps are stored.
full_df['prepSteps'][0]

['Toast mustard and coriander seeds in a dry medium saucepan over medium heat, tossing often, until mustard seeds begin to pop, about 3 minutes. Add vinegar, salt, and sugar and bring to a boil. Reduce heat and simmer, stirring often, until salt and sugar are dissolved, about 4 minutes. Remove from heat; stir in dill and 4 cups water. Let cool.',
 'Place chicken and brine in a large resealable plastic bag; chill 3 hours. Remove chicken from brine, scraping off seeds, cover, and chill until ready to fry.',
 'Fit a large pot with thermometer and pour in oil to measure 2". Heat over medium-high heat until thermometer registers 350°F.',
 'Meanwhile, place buttermilk in a large bowl. Place flour in another large bowl; season with kosher salt. Working in batches, coat chicken in buttermilk, then dredge in flour, dipping your fingers in buttermilk as you pack flour on to help create moistened, shaggy bits (the makings of a super-crisp crust); transfer to a baking sheet.',
 'Working in batches

In [None]:
# Join each recipe's step strings with single spaces and lowercase the result,
# so every recipe becomes one text document.
recipe_steps = full_df['prepSteps'].apply(" ".join).str.lower()

In [None]:
# Display the joined, lowercased documents as a sanity check.
recipe_steps

0        toast mustard and coriander seeds in a dry med...
1        remove the stems and roots from the spinach. r...
2        i like this cake best baked in a 9-inch angel ...
3        a short time before serving, mash avocado and ...
4        1. place the tomatoes, garlic, salt, paprika, ...
                               ...                        
34751    preheat the waffle iron on low. spread a thin,...
34752    combine the ingredients in a medium-size bowl ...
34753    preheat the oven to 375°f. butter a 9x5-inch l...
34754    place about 1/4 cup salt on a small plate. rub...
34755    combine the water, honey, rosemary, and grapef...
Name: prepSteps, Length: 34656, dtype: object

In [None]:
# pre-calculating sentence embeddings
# embedding_model_params = {'embedding_model': 'all-MiniLM-L6-v2'}
# embedding_model = SentenceTransformer(embedding_model_params['embedding_model'])
# embeddings = embedding_model.encode(recipe_steps, show_progress_bar=True)

In [None]:
# specify UMAP dimensionality reductions
# umap_model_params = {'n_neighbors':15, 'n_components':10, 'random_state':200}
# umap_model = UMAP(**umap_model_params)

In [None]:
# cluster with HDBSCAN
# hdbscan_model_params = {'min_cluster_size':200, 'prediction_data':True}
# hdbscan_model = HDBSCAN(**hdbscan_model_params)

In [None]:
# Keyword arguments for the custom CountVectorizer.
cv_params = {
    "strip_accents": "unicode",
    "lowercase": True,
    # spaCy-based hooks, currently disabled (the tokenizer ran out of memory):
    # "preprocessor": custom_preprocessor,
    # "tokenizer": custom_lemmatizer,
    "stop_words": flushtrated_list,
    # A token is a run of two or more ASCII letters.
    "token_pattern": r"(?u)\b[a-zA-Z]{2,}\b",
    "ngram_range": (1, 4),
    "min_df": 10,
}

vectorizer_model = CountVectorizer(**cv_params)

SyntaxError: incomplete input (347404520.py, line 13)

In [None]:
# # with open('../data/processed/bertopic_params.joblib', 'w') as fp:
# pipeline_params = {
#     'embedding':{'pretrained_sentence_embeddings': embedding_model_params},
#     'dimension_reduction': {'UMAP': umap_model_params},
#     'clustering': {'HDBSCAN': hdbscan_model_params},
#     'vectorizer': {'sklearn_countvectorizer': cv_params},
# }
# joblib.dump(pipeline_params, '../data/processed/bertopic_params.joblib')

['../data/processed/bertopic_params.joblib']

In [None]:
with mlflow.start_run(experiment_id=get_experiment_id("initial_explicit_spec_run_2")):
    # BERTopic constructor settings; logged to MLflow before the model is built.
    pipeline_params = {
        'language':'english',
        'top_n_words':20,
        'n_gram_range':(1, 4),
        'min_topic_size':200,
        'nr_topics':'auto',
        'verbose':True,
        'low_memory':True,
        'calculate_probabilities':True
    }
    mlflow.log_params(pipeline_params)

    # The comma after **pipeline_params is required; without it this call is
    # a SyntaxError and the cell cannot run.
    topic_model = BERTopic(
        **pipeline_params,
        vectorizer_model=vectorizer_model,
    )
    # NOTE(review): the huggingface/tokenizers fork warning suggests setting
    # the TOKENIZERS_PARALLELISM environment variable explicitly -- consider
    # doing so before fitting.
    # TOKENIZERS_PARALLELISM=False

    topics, probs = topic_model.fit_transform(recipe_steps)

    # Fetch the topic summary once and reuse it for both the JSON export and
    # the printout.
    topic_info = topic_model.get_topic_info()
    topic_info.to_json('../data/processed/topic_model_df.json')

    # mlflow.log_artifact('../data/processed/bertopic_params.joblib')
    mlflow.log_artifact('../data/processed/topic_model_df.json')

    print(topic_info)

Batches:   0%|          | 0/1083 [00:00<?, ?it/s]

2023-07-13 18:32:34,064 - BERTopic - Transformed documents to Embeddings
2023-07-13 18:32:58,935 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-07-13 18:33:03,405 - BERTopic - Clustered reduced embeddings
2023-07-13 18:34:15,747 - BERTopic - Reduced number of topics from 30 to 30


    Topic  Count                          Name  \
0      -1   8301              -1_and_to_the_in   
1       0  12316               0_and_the_in_to   
2       1   1682          1_chicken_and_to_the   
3       2   1674            2_and_in_bowl_with   
4       3   1128          3_potatoes_and_to_in   
5       4    751           4_fish_and_the_with   
6       5    699             5_pork_and_to_the   
7       6    534            6_shrimp_and_to_in   
8       7    502          7_bacon_and_to_until   
9       8    490         8_steak_steaks_and_to   
10      9    482              9_soup_and_in_to   
11     10    482           10_lamb_and_to_with   
12     11    469           11_pasta_and_to_add   
13     12    443            12_beef_and_to_the   
14     13    437           13_beans_and_to_the   
15     14    433      14_bread_and_with_slices   
16     15    378            15_and_heat_add_to   
17     16    372        16_salmon_and_with_the   
18     17    317          17_turkey_and_to_the   


In [None]:
# topics, probs = topic_model.fit_transform(recipe_steps)

In [None]:
# Render the whole topic table with to_string() and print it.
print(topic_model.get_topic_info().to_string())

    Topic  Count                          Name                                                                                                                                                         Representation                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [None]:
def print_full(x):
    """Print ``x`` with every row shown.

    ``display.max_rows`` is raised to ``len(x)`` only for the duration of the
    print. Using ``pd.option_context`` restores the caller's previous setting
    afterwards, even if printing raises, instead of resetting the option to
    the pandas default.

    Args:
        x: a pandas object (or anything with ``len``) to print.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)

In [None]:
# Print the topic table with the row limit lifted via print_full.
print_full(topic_model.get_topic_info())

    Topic  Count                          Name  \
0      -1   8301              -1_and_to_the_in   
1       0  12316               0_and_the_in_to   
2       1   1682          1_chicken_and_to_the   
3       2   1674            2_and_in_bowl_with   
4       3   1128          3_potatoes_and_to_in   
5       4    751           4_fish_and_the_with   
6       5    699             5_pork_and_to_the   
7       6    534            6_shrimp_and_to_in   
8       7    502          7_bacon_and_to_until   
9       8    490         8_steak_steaks_and_to   
10      9    482              9_soup_and_in_to   
11     10    482           10_lamb_and_to_with   
12     11    469           11_pasta_and_to_add   
13     12    443            12_beef_and_to_the   
14     13    437           13_beans_and_to_the   
15     14    433      14_bread_and_with_slices   
16     15    378            15_and_heat_add_to   
17     16    372        16_salmon_and_with_the   
18     17    317          17_turkey_and_to_the   


In [None]:
# Temporarily lift both the row and column display limits while printing the
# topic table; the options revert when the block exits.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(topic_model.get_topic_info())

    Topic  Count                          Name  \
0      -1   8301              -1_and_to_the_in   
1       0  12316               0_and_the_in_to   
2       1   1682          1_chicken_and_to_the   
3       2   1674            2_and_in_bowl_with   
4       3   1128          3_potatoes_and_to_in   
5       4    751           4_fish_and_the_with   
6       5    699             5_pork_and_to_the   
7       6    534            6_shrimp_and_to_in   
8       7    502          7_bacon_and_to_until   
9       8    490         8_steak_steaks_and_to   
10      9    482              9_soup_and_in_to   
11     10    482           10_lamb_and_to_with   
12     11    469           11_pasta_and_to_add   
13     12    443            12_beef_and_to_the   
14     13    437           13_beans_and_to_the   
15     14    433      14_bread_and_with_slices   
16     15    378            15_and_heat_add_to   
17     16    372        16_salmon_and_with_the   
18     17    317          17_turkey_and_to_the   


# | export
def foo():
    """Do nothing and return ``None`` (no-op placeholder)."""
    return None

In [None]:
# | hide
# import nbdev

# nbdev.nbdev_export()