# core

> test

# | default_exp core

In [1]:
# !spacy download en_core_web_sm

In [None]:
# | hide
from nbdev.showdoc import *
import project_path

from bertopic import BERTopic
import dagshub
# from datetime import datetime
from hdbscan import HDBSCAN
import joblib 
import json
# import matplotlib.pyplot as plt
import mlflow
from mlflow.models.signature import infer_signature
import numpy as np
import pandas as pd
# from sklearn.base import TransformerMixin
# from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import (
    CountVectorizer,
    # TfidfTransformer,
    # TfidfVectorizer,
)
from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
import spacy
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
# from spacy.lemmatizer import Lemmatizer
from tqdm import tqdm
from typing import Any
from umap import UMAP

# import local scripts
# import src.nlp_processor as nlpp

import os

# Load the small English spaCy pipeline used by the preprocessing helpers.
nlp = spacy.load("en_core_web_sm")
# Allow very long documents (up to 10,000,000 characters) through the pipeline.
nlp.max_length = 10000000

# Hugging Face tokenizers reads this setting from the process environment;
# assigning a plain Python variable has no effect on the library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Kept so any code that referenced the old module-level name still works.
TOKENIZERS_PARALLELISM = False

In [None]:
def custom_lemmatizer(ingredients: list) -> Any:  # list of lemma strings
    """Reduce a sequence of spaCy-style tokens to their lemmas.

    A token is kept only when all of the following hold:
      * ``token.is_alpha`` is truthy (drops punctuation, numbers, symbols),
      * ``token.pos_`` is neither ``"PRON"`` nor ``"VERB"``,
      * its lemma is longer than one character.

    Args:
        ingredients: iterable of token objects exposing ``lemma_``, ``pos_``
            and ``is_alpha`` (e.g. the output of ``custom_preprocessor``).

    Returns:
        A list with the ``lemma_`` string of every token that passed the
        filter, in input order.
    """
    excluded_pos = ["PRON", "VERB"]

    def _is_wanted(tok) -> bool:
        # Same checks, same order, as the filter has always applied.
        return tok.is_alpha and tok.pos_ not in excluded_pos and len(tok.lemma_) > 1

    return [tok.lemma_ for tok in ingredients if _is_wanted(tok)]

In [None]:
def custom_preprocessor(recipe_ingreds: str) -> list:
    """Run the module-level spaCy pipeline over a recipe string.

    Intended as a replacement for sklearn CountVectorizer's default
    preprocessor, which only removes accents and lowercases.

    Args:
        recipe_ingreds: text of one recipe to be processed.

    Returns:
        A list holding every item produced by iterating ``nlp(recipe_ingreds)``
        (spaCy tokens, not strings); no lowercasing or accent stripping is
        applied here.

    NOTE(review): when passed as ``CountVectorizer(preprocessor=...)``, sklearn
    expects a string back so its regex tokenizer can run; a list of tokens is
    only usable together with a custom tokenizer such as ``custom_lemmatizer``.
    Confirm against how the vectorizer is configured.
    """
    return list(nlp(recipe_ingreds))

In [None]:
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
# NOTE(review): a personal email is committed here; consider reading it from
# an environment variable instead.
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

DAGSHUB_REPO_NAME = "MeaLeon"
BRANCH = "venv4/add-try-mlflow"

# Use the constants above so the repository coordinates live in one place.
dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)


In [None]:
def get_experiment_id(name):
    """Return the MLflow experiment id for ``name``, creating it if needed.

    Args:
        name: experiment name to look up.

    Returns:
        The ``experiment_id`` of the existing experiment, or the id returned
        by ``mlflow.create_experiment`` when no experiment has that name.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    return mlflow.create_experiment(name)

In [None]:
# Input files, relative to the notebook's working directory.
raw_data_path = '../data/recipes-en-201706/epicurious-recipes_m2.json'
food_stopwords_path = "../food_stopwords.csv"

# Directory holding the 2022.08.23 joblib files; the paths below hang off it.
joblib_basepath = '../joblib/2022.08.23/'

cv_path = f'{joblib_basepath}countvec.joblib'
tfidf_path = f'{joblib_basepath}tfidf.joblib'
full_df_path = f'{joblib_basepath}recipes_with_cv.joblib'
reduced_df_path = f'{joblib_basepath}reduced_df.joblib'

In [None]:

# `flushtrated` collects every stopword to exclude; the name is a pun, since
# these words get flushed down the drain.
# NOTE(review): iterating a DataFrame yields its column labels, so this only
# picks up the CSV's header row. Confirm the file stores the stopwords there
# (a single comma-separated line) rather than one word per row.
flushtrated = set(pd.read_csv(food_stopwords_path))
# Hand-picked extra tokens to exclude, merged further down with the CSV
# stopwords and spaCy's STOP_WORDS. By the look of the entries these are mostly
# colours, preparation verbs, brand or proper names, and truncated forms such
# as "unbleache" / "quartere" (presumably lemmatizer output; TODO confirm).
# NOTE(review): "trim" is listed twice and both "Frank" and "frank" appear.
# Duplicates are harmless in a set, but confirm the capitalised entry is still
# needed given that the recipe text is lowercased before modelling.
additional_to_exclude = {
    "red",
    "green",
    "black",
    "yellow",
    "white",
    "inch",
    "mince",
    "chop",
    "fry",
    "trim",
    "flat",
    "beat",
    "brown",
    "golden",
    "balsamic",
    "halve",
    "blue",
    "divide",
    "trim",
    "unbleache",
    "granulate",
    "Frank",
    "alternative",
    "american",
    "annie",
    "asian",
    "balance",
    "band",
    "barrel",
    "bay",
    "bayou",
    "beam",
    "beard",
    "bell",
    "betty",
    "bird",
    "blast",
    "bob",
    "bone",
    "breyers",
    "calore",
    "carb",
    "card",
    "chachere",
    "change",
    "circle",
    "coffee",
    "coil",
    "country",
    "cow",
    "crack",
    "cracker",
    "crocker",
    "crystal",
    "dean",
    "degree",
    "deluxe",
    "direction",
    "duncan",
    "earth",
    "eggland",
    "ener",
    "envelope",
    "eye",
    "fantastic",
    "far",
    "fat",
    "feather",
    "flake",
    "foot",
    "fourth",
    "frank",
    "french",
    "fusion",
    "genoa",
    "genovese",
    "germain",
    "giada",
    "gold",
    "granule",
    "greek",
    "hamburger",
    "helper",
    "herbe",
    "hines",
    "hodgson",
    "hunt",
    "instruction",
    "interval",
    "italianstyle",
    "jim",
    "jimmy",
    "kellogg",
    "lagrille",
    "lake",
    "land",
    "laurentiis",
    "lawry",
    "lipton",
    "litre",
    "ll",
    "maid",
    "malt",
    "mate",
    "mayer",
    "meal",
    "medal",
    "medallion",
    "member",
    "mexicanstyle",
    "monte",
    "mori",
    "nest",
    "nu",
    "oounce",
    "oscar",
    "ox",
    "paso",
    "pasta",
    "patty",
    "petal",
    "pinche",
    "preserve",
    "quartere",
    "ranch",
    "ranchstyle",
    "rasher",
    "redhot",
    "resemble",
    "rice",
    "ro",
    "roni",
    "scissor",
    "scrap",
    "secret",
    "semicircle",
    "shard",
    "shear",
    "sixth",
    "sliver",
    "smucker",
    "snicker",
    "source",
    "spot",
    "state",
    "strand",
    "sun",
    "supreme",
    "tablepoon",
    "tail",
    "target",
    "tm",
    "tong",
    "toothpick",
    "triangle",
    "trimming",
    "tweezer",
    "valley",
    "vay",
    "wise",
    "wishbone",
    "wrapper",
    "yoplait",
    "ziploc",
}

# Fold spaCy's built-in STOP_WORDS and the manual additions into the set, then
# materialise a list for use as the vectorizer's `stop_words`.
flushtrated = flushtrated.union(STOP_WORDS, additional_to_exclude)
flushtrated_list = list(flushtrated)

In [None]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
full_df = joblib.load(full_df_path)
full_df

In [None]:
full_df['prepSteps']

In [None]:
# Look at a single entry (key 0) to see what one recipe's steps look like.
full_df['prepSteps'][0]

In [None]:
# Join each recipe's steps with spaces and lowercase the result, giving one
# string per recipe. `" ".join` needs every prepSteps entry to be an iterable
# of strings, presumably a list of step sentences (TODO confirm).
recipe_steps = full_df['prepSteps'].apply(" ".join).str.lower()

In [None]:
recipe_steps

In [None]:
# Sentence-embedding model that is handed to BERTopic further down.
embedding_model_params = {
    'embedding_model': 'all-MiniLM-L6-v2',
}
embedding_model = SentenceTransformer(embedding_model_params['embedding_model'])
# Embeddings are not pre-computed here; to do so, run:
# embeddings = embedding_model.encode(recipe_steps, show_progress_bar=True)

In [None]:
# UMAP settings for the dimensionality-reduction stage; random_state is fixed.
umap_model_params = {
    'n_neighbors': 15,
    'n_components': 10,
    'random_state': 200,
}
umap_model = UMAP(**umap_model_params)

In [None]:
# HDBSCAN settings for the clustering stage.
hdbscan_model_params = {
    'min_cluster_size': 200,
    'prediction_data': True,
}
hdbscan_model = HDBSCAN(**hdbscan_model_params)

In [None]:
# Custom count vectorization passed to BERTopic as its vectorizer model.
cv_params = {
    'strip_accents': "unicode",
    'lowercase': True,
    # 'preprocessor': custom_preprocessor,
    #   Disabled: sklearn's `preprocessor` must return a string, but
    #   custom_preprocessor returns a list of spaCy tokens, which the regex
    #   tokenizer built from `token_pattern` cannot consume (TypeError at fit
    #   time). Supplying a preprocessor also switches off `strip_accents` and
    #   `lowercase`. Re-enable it only together with the custom tokenizer.
    # 'tokenizer': custom_lemmatizer,  # out of memory
    'stop_words': flushtrated_list,
    # Alphabetic tokens of at least two characters.
    'token_pattern': r"(?u)\b[a-zA-Z]{2,}\b",
    'ngram_range': (1, 4),
    'min_df': 10,
}

vectorizer_model = CountVectorizer(**cv_params)

In [None]:
# Gather every stage's settings in one mapping and persist it with joblib so
# the configuration can be attached to the MLflow run as an artifact.
bertopic_params_path = '../data/processed/bertopic_params.joblib'
pipeline_params = {
    'embedding': {'pretrained_sentence_embeddings': embedding_model_params},
    'dimension_reduction': {'UMAP': umap_model_params},
    'clustering': {'HDBSCAN': hdbscan_model_params},
    'vectorizer': {'sklearn_countvectorizer': cv_params},
}
joblib.dump(pipeline_params, bertopic_params_path)

In [None]:
experiment_id = get_experiment_id("initial_explicit_spec_run")

with mlflow.start_run(experiment_id=experiment_id):
    # Parameters are attached as a joblib artifact below rather than through
    # mlflow.log_params(pipeline_params).

    # Assemble BERTopic from the explicitly configured sub-models.
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        top_n_words=20,
        verbose=True,
    )

    # Fit on the joined, lowercased recipe steps.
    topics, probs = topic_model.fit_transform(recipe_steps)

    # Write the topic overview table to disk so it can be logged.
    topic_info_path = '../data/processed/topic_model_df.json'
    topic_model.get_topic_info().to_json(topic_info_path)

    # Attach both the saved parameters and the topic table to this run.
    mlflow.log_artifact('../data/processed/bertopic_params.joblib')
    mlflow.log_artifact(topic_info_path)

In [None]:
topics, probs = topic_model.fit_transform(recipe_steps)

In [None]:
topic_model.get_topic_info()

# | export
def foo():
    """Placeholder function; does nothing and returns None."""
    return None

In [None]:
# | hide
import nbdev

# Run nbdev's export step (presumably writes the cells marked `# | export` to
# the module named by `default_exp`; see the nbdev docs).
nbdev.nbdev_export()