# Testing Stanza

# | default_exp core

In [None]:
# !spacy download en_core_web_sm

In [None]:
# | hide
from nbdev.showdoc import *
import project_path

# from bertopic import BERTopic
import dagshub
dagshub.init(repo_name='MeaLeon', repo_owner='AaronWChen')
# from datetime import datetime
# from hdbscan import HDBSCAN
from itertools import tee, islice
import joblib 
import json
# import matplotlib.pyplot as plt
import mlflow
from mlflow.models.signature import infer_signature
import numpy as np
import pandas as pd
import re
# from sklearn.base import TransformerMixin
# from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.pipeline import Pipeline
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
# import spacy
# import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
# from spacy.lemmatizer import Lemmatizer
import stanza
from tqdm import tqdm
from typing import Any
# from umap import UMAP

# import local scripts
# import src.nlp_processor as nlpp

stanza.download('en')
nlp = stanza.Pipeline('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-08 21:29:37 INFO: Downloading default packages for language: en (English) ...
2023-11-08 21:29:38 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2023-11-08 21:29:41 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2023-11-08 21:29:41 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-08 21:29:41 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-11-08 21:29:41 INFO: Using device: cuda
2023-11-08 21:29:41 INFO: Loading: tokenize
2023-11-08 21:29:44 INFO: Loading: pos
2023-11-08 21:29:45 INFO: Loading: lemma
2023-11-08 21:29:45 INFO: Loading: constituency
2023-11-08 21:29:45 INFO: Loading: depparse
2023-11-08 21:29:45 INFO: Loading: sentiment
2023-11-08 21:29:45 INFO: Loading: ner
2023-11-08 21:29:46 INFO: Done loading processors!


In [None]:
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

DAGSHUB_REPO_NAME="MeaLeon"
BRANCH="STANZA-1/refactor-nltk-stanza"
dagshub.init(repo_name='MeaLeon', repo_owner='AaronWChen')


In [None]:
def get_experiment_id(name):
    exp = mlflow.get_experiment_by_name(name)
    if exp is None:
      exp_id = mlflow.create_experiment(name)
      return exp_id
    return exp.experiment_id

In [None]:
def stanza_preprocessor(stanza_pipeline, ingredients_list):
    # This function takes in a Stanza pipeline and a recipe's ingredients in list form and returns a Stanza transformed document to be used in the lemmatizer
    lowered = " brk ".join(ingredients_list).lower()
    # lowered = ingredients_list.apply(" brk ".join).str.lower()
    print(type(stanza_pipeline(lowered)))
    print(stanza_pipeline(lowered))
    return stanza_pipeline(lowered)

# print(*[f'word: {word.text+" "}\t \t lemma: {word.lemma}, \t \t upos: {word.upos}' for sent in doc2.sentences for word in sent.words], sep='\n')

# This will be the tokenizer?
# lemma_test_recipe_2 = " ".join([word.lemma for sent in doc2.sentences for word in sent.words if (
#     word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
# )])

In [None]:
def stanza_lemmatizer(stanza_preprocessed):
    # This function takes in the preprocessed Stanza document from preprocessor and performs lemmatization and filtering
    
    return " ".join([word.lemma 
                      for sent in stanza_preprocessed.sentences 
                      for word in sent.words if (
                          word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                          )
                    ])
            

In [None]:
# custom ngram analyzer function, matching only ngrams that belong to the same line
def gen_analyzer(minNgramLength, maxNgramLength):
    def ngrams_per_line(doc):

        # analyze each line of the input string seperately
        for ln in doc.split('brk'):

            # tokenize the input string (customize the regex as desired)
            terms = re.findall(u'(?u)\\b\\w+\\b', ln)

            # loop ngram creation for every number between min and max ngram length
            for ngramLength in range(minNgramLength, maxNgramLength+1):

                # find and return all ngrams
                # for ngram in zip(*[terms[i:] for i in range(3)]): <-- solution without a generator (works the same but has higher memory usage)
                for ngram in zip(*[islice(seq, i, len(terms)) for i, seq in enumerate(tee(terms, ngramLength))]): # <-- solution using a generator
                    ngram = ' '.join(ngram)
                    yield ngram
    return ngrams_per_line

In [None]:
raw_data_path = '../data/recipes-en-201706/epicurious-recipes_m2.json'
food_stopwords_path = "../food_stopwords.csv"

joblib_basepath = '../joblib/2022.08.23/'

cv_path = joblib_basepath + 'countvec.joblib'
tfidf_path = joblib_basepath + 'tfidf.joblib'
full_df_path = joblib_basepath + 'recipes_with_cv.joblib'
reduced_df_path = joblib_basepath + 'reduced_df.joblib'

In [None]:
# this is a redeem for variable naming mixed with a free pun-ish me daddy, flushtrated will be the list of all stopword to exclude so named because we're throwing these words down the drain

flushtrated = {x for x in pd.read_csv(food_stopwords_path)}
additional_to_exclude = {
    "red",
    "green",
    "black",
    "yellow",
    "white",
    "inch",
    "mince",
    "chop",
    "fry",
    "trim",
    "flat",
    "beat",
    "brown",
    "golden",
    "balsamic",
    "halve",
    "blue",
    "divide",
    "trim",
    "unbleache",
    "granulate",
    "Frank",
    "alternative",
    "american",
    "annie",
    "asian",
    "balance",
    "band",
    "barrel",
    "bay",
    "bayou",
    "beam",
    "beard",
    "bell",
    "betty",
    "bird",
    "blast",
    "bob",
    "bone",
    "breyers",
    "calore",
    "carb",
    "card",
    "chachere",
    "change",
    "circle",
    "coffee",
    "coil",
    "country",
    "cow",
    "crack",
    "cracker",
    "crocker",
    "crystal",
    "dean",
    "degree",
    "deluxe",
    "direction",
    "duncan",
    "earth",
    "eggland",
    "ener",
    "envelope",
    "eye",
    "fantastic",
    "far",
    "fat",
    "feather",
    "flake",
    "foot",
    "fourth",
    "frank",
    "french",
    "fusion",
    "genoa",
    "genovese",
    "germain",
    "giada",
    "gold",
    "granule",
    "greek",
    "hamburger",
    "helper",
    "herbe",
    "hines",
    "hodgson",
    "hunt",
    "instruction",
    "interval",
    "italianstyle",
    "jim",
    "jimmy",
    "kellogg",
    "lagrille",
    "lake",
    "land",
    "laurentiis",
    "lawry",
    "lipton",
    "litre",
    "ll",
    "maid",
    "malt",
    "mate",
    "mayer",
    "meal",
    "medal",
    "medallion",
    "member",
    "mexicanstyle",
    "monte",
    "mori",
    "nest",
    "nu",
    "oounce",
    "oscar",
    "ox",
    "paso",
    "pasta",
    "patty",
    "petal",
    "pinche",
    "preserve",
    "quartere",
    "ranch",
    "ranchstyle",
    "rasher",
    "redhot",
    "resemble",
    "rice",
    "ro",
    "roni",
    "scissor",
    "scrap",
    "secret",
    "semicircle",
    "shard",
    "shear",
    "sixth",
    "sliver",
    "smucker",
    "snicker",
    "source",
    "spot",
    "state",
    "strand",
    "sun",
    "supreme",
    "tablepoon",
    "tail",
    "target",
    "tm",
    "tong",
    "toothpick",
    "triangle",
    "trimming",
    "tweezer",
    "valley",
    "vay",
    "wise",
    "wishbone",
    "wrapper",
    "yoplait",
    "ziploc",
}

flushtrated = flushtrated.union(STOP_WORDS)
flushtrated = flushtrated.union(additional_to_exclude)
flushtrated_list = list(flushtrated)

In [None]:
full_df = joblib.load(full_df_path)
full_df

Unnamed: 0,id,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,cuisine_name,photo_filename,...,zest pith,zest vegetable,zinfandel,ziti,zucchini,zucchini blossom,zucchini crookneck,zucchini squash,árbol,árbol pepper
0,54a2b6b019925f464b373351,How does fried chicken achieve No. 1 status? B...,Pickle-Brined Fried Chicken,3.11,"[1 tablespoons yellow mustard seeds, 1 tablesp...",[Toast mustard and coriander seeds in a dry me...,7,100,Missing Cuisine,51247610_fried-chicken_1x1.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,54a408a019925f464b3733bc,Spinaci all'Ebraica,Spinach Jewish Style,3.22,"[3 pounds small-leaved bulk spinach, Salt, 1/2...",[Remove the stems and roots from the spinach. ...,5,80,Italian,EP_12162015_placeholders_rustic.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,54a408a26529d92b2c003631,"This majestic, moist, and richly spiced honey ...",New Year’s Honey Cake,3.62,"[3 1/2 cups all-purpose flour, 1 tablespoon ba...",[I like this cake best baked in a 9-inch angel...,105,88,Kosher,EP_09022015_honeycake-2.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,54a408a66529d92b2c003638,The idea for this sandwich came to me when my ...,The B.L.A.Bagel with Lox and Avocado,4.00,"[1 small ripe avocado, preferably Hass (see No...","[A short time before serving, mash avocado and...",7,100,Kosher,EP_12162015_placeholders_casual.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,54a408a719925f464b3733cc,"In 1930, Simon Agranat, the chief justice of t...",Shakshuka a la Doktor Shakshuka,2.71,"[2 pounds fresh tomatoes, unpeeled and cut in ...","[1. Place the tomatoes, garlic, salt, paprika,...",7,83,Kosher,EP_12162015_placeholders_formal.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34751,59541a31bff3052847ae2107,Buttering the bread before you waffle it ensur...,Waffled Ham and Cheese Melt with Maple Butter,0.00,"[1 tablespoon unsalted butter, at room tempera...","[Preheat the waffle iron on low., Spread a thi...",0,0,Missing Cuisine,waffle-ham-and-cheese-melt-062817.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34752,5954233ad52ca90dc28200e7,"Spread this easy compound butter on waffles, p...",Maple Butter,0.00,"[8 tablespoons (1 stick) salted butter, at roo...",[Combine the ingredients in a medium-size bowl...,0,0,Missing Cuisine,EP_12162015_placeholders_bright.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34753,595424c2109c972493636f83,Leftover mac and cheese is not exactly one of ...,Waffled Macaroni and Cheese,0.00,"[3 tablespoons unsalted butter, plus more for ...",[Preheat the oven to 375°F. Butter a 9x5-inch ...,0,0,Missing Cuisine,waffle-mac-n-cheese-062816.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34754,5956638625dc3d1d829b7166,A classic Mexican beer cocktail you can sip al...,Classic Michelada,0.00,"[Coarse salt, 2 lime wedges, 2 ounces tomato j...",[Place about 1/4 cup salt on a small plate. Ru...,0,0,Missing Cuisine,Classic Michelada 07292017.jpg,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
full_df['ingredients'][0]

['1 tablespoons yellow mustard seeds',
 '1 tablespoons brown mustard seeds',
 '1 1/2 teaspoons coriander seeds',
 '1 cup apple cider vinegar',
 '2/3 cup kosher salt',
 '1/3 cup sugar',
 '1/4 cup chopped fresh dill',
 '8 skinless, boneless chicken thighs (about 3 pounds), halved, quartered if large',
 'Vegetable oil (for frying; about 10 cups)',
 '2 cups buttermilk',
 '2 cups all-purpose flour',
 'Kosher salt',
 'Honey, flaky sea salt (such as Maldon), toasted benne or sesame seeds, hot sauce (for serving)',
 'A deep-fry thermometer']

In [None]:
test_recipe = ". ".join(full_df['ingredients'][0]).lower()
test_recipe

'1 tablespoons yellow mustard seeds. 1 tablespoons brown mustard seeds. 1 1/2 teaspoons coriander seeds. 1 cup apple cider vinegar. 2/3 cup kosher salt. 1/3 cup sugar. 1/4 cup chopped fresh dill. 8 skinless, boneless chicken thighs (about 3 pounds), halved, quartered if large. vegetable oil (for frying; about 10 cups). 2 cups buttermilk. 2 cups all-purpose flour. kosher salt. honey, flaky sea salt (such as maldon), toasted benne or sesame seeds, hot sauce (for serving). a deep-fry thermometer'

In [None]:
doc = nlp(test_recipe)

In [None]:
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')

id: (1,)	text: 1
id: (2,)	text: tablespoons
id: (3,)	text: yellow
id: (4,)	text: mustard
id: (5,)	text: seeds
id: (6,)	text: .
id: (1,)	text: 1
id: (2,)	text: tablespoons
id: (3,)	text: brown
id: (4,)	text: mustard
id: (5,)	text: seeds
id: (6,)	text: .
id: (1,)	text: 1
id: (2,)	text: 1/2
id: (3,)	text: teaspoons
id: (4,)	text: coriander
id: (5,)	text: seeds
id: (6,)	text: .
id: (1,)	text: 1
id: (2,)	text: cup
id: (3,)	text: apple
id: (4,)	text: cider
id: (5,)	text: vinegar
id: (6,)	text: .
id: (1,)	text: 2/3
id: (2,)	text: cup
id: (3,)	text: kosher
id: (4,)	text: salt
id: (5,)	text: .
id: (1,)	text: 1/3
id: (2,)	text: cup
id: (3,)	text: sugar
id: (4,)	text: .
id: (1,)	text: 1/4
id: (2,)	text: cup
id: (3,)	text: chopped
id: (4,)	text: fresh
id: (5,)	text: dill
id: (6,)	text: .
id: (1,)	text: 8
id: (2,)	text: skinless
id: (3,)	text: ,
id: (4,)	text: boneless
id: (5,)	text: chicken
id: (6,)	text: thighs
id: (7,)	text: (
id: (8,)	text: about
id: (9,)	text: 3
id: (10,)	text: pounds
id: (11,

In [None]:
print([sentence.text for sentence in doc.sentences])


['1 tablespoons yellow mustard seeds.', '1 tablespoons brown mustard seeds.', '1 1/2 teaspoons coriander seeds.', '1 cup apple cider vinegar.', '2/3 cup kosher salt.', '1/3 cup sugar.', '1/4 cup chopped fresh dill.', '8 skinless, boneless chicken thighs (about 3 pounds), halved, quartered if large.', 'vegetable oil (for frying; about 10 cups).', '2 cups buttermilk.', '2 cups all-purpose flour.', 'kosher salt.', 'honey, flaky sea salt (such as maldon), toasted benne or sesame seeds, hot sauce (for serving).', 'a deep-fry thermometer']


In [None]:
print(*[f'word: {word.text+" "}\t \t lemma: {word.lemma}, \t \t upos: {word.upos}' for sent in doc.sentences for word in sent.words], sep='\n')


word: 1 	 	 lemma: 1, 	 	 upos: NUM
word: tablespoons 	 	 lemma: tablespoon, 	 	 upos: NOUN
word: yellow 	 	 lemma: yellow, 	 	 upos: ADJ
word: mustard 	 	 lemma: mustard, 	 	 upos: NOUN
word: seeds 	 	 lemma: seed, 	 	 upos: NOUN
word: . 	 	 lemma: ., 	 	 upos: PUNCT
word: 1 	 	 lemma: 1, 	 	 upos: NUM
word: tablespoons 	 	 lemma: tablespoon, 	 	 upos: NOUN
word: brown 	 	 lemma: brown, 	 	 upos: ADJ
word: mustard 	 	 lemma: mustard, 	 	 upos: NOUN
word: seeds 	 	 lemma: seed, 	 	 upos: NOUN
word: . 	 	 lemma: ., 	 	 upos: PUNCT
word: 1 	 	 lemma: 1, 	 	 upos: NUM
word: 1/2 	 	 lemma: 1/2, 	 	 upos: NUM
word: teaspoons 	 	 lemma: teaspoon, 	 	 upos: NOUN
word: coriander 	 	 lemma: coriander, 	 	 upos: NOUN
word: seeds 	 	 lemma: seed, 	 	 upos: NOUN
word: . 	 	 lemma: ., 	 	 upos: PUNCT
word: 1 	 	 lemma: 1, 	 	 upos: NUM
word: cup 	 	 lemma: cup, 	 	 upos: NOUN
word: apple 	 	 lemma: apple, 	 	 upos: NOUN
word: cider 	 	 lemma: cider, 	 	 upos: NOUN
word: vinegar 	 	 lemma: vinegar, 

In [None]:
print(*[f'lemma: {word.lemma}' for sent in doc.sentences for word in sent.words if (
    word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "PUNCT", "SCONJ"]
)], sep='\n')


lemma: tablespoon
lemma: yellow
lemma: mustard
lemma: seed
lemma: tablespoon
lemma: brown
lemma: mustard
lemma: seed
lemma: teaspoon
lemma: coriander
lemma: seed
lemma: cup
lemma: apple
lemma: cider
lemma: vinegar
lemma: cup
lemma: kosher
lemma: salt
lemma: cup
lemma: sugar
lemma: cup
lemma: chop
lemma: fresh
lemma: dill
lemma: skinless
lemma: boneless
lemma: chicken
lemma: thigh
lemma: pound
lemma: halve
lemma: quarter
lemma: large
lemma: vegetable
lemma: oil
lemma: frying
lemma: cup
lemma: cup
lemma: buttermilk
lemma: cup
lemma: purpose
lemma: flour
lemma: kosher
lemma: salt
lemma: honey
lemma: flaky
lemma: sea
lemma: salt
lemma: such
lemma: maldon
lemma: toast
lemma: benne
lemma: sesame
lemma: seed
lemma: hot
lemma: sauce
lemma: serving
lemma: deep
lemma: fry
lemma: thermometer


In [None]:
[word.lemma for sent in doc.sentences for word in sent.words if (
    word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "PUNCT", "SCONJ"]
)]

['tablespoon',
 'yellow',
 'mustard',
 'seed',
 'tablespoon',
 'brown',
 'mustard',
 'seed',
 'teaspoon',
 'coriander',
 'seed',
 'cup',
 'apple',
 'cider',
 'vinegar',
 'cup',
 'kosher',
 'salt',
 'cup',
 'sugar',
 'cup',
 'chop',
 'fresh',
 'dill',
 'skinless',
 'boneless',
 'chicken',
 'thigh',
 'pound',
 'halve',
 'quarter',
 'large',
 'vegetable',
 'oil',
 'frying',
 'cup',
 'cup',
 'buttermilk',
 'cup',
 'purpose',
 'flour',
 'kosher',
 'salt',
 'honey',
 'flaky',
 'sea',
 'salt',
 'such',
 'maldon',
 'toast',
 'benne',
 'sesame',
 'seed',
 'hot',
 'sauce',
 'serving',
 'deep',
 'fry',
 'thermometer']

In [None]:
recipe = []
for sent in doc.sentences:
    ingredients = []
    for word in sent.words:
        if word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "PUNCT", "SCONJ"]:
            ingredients.append(word.lemma)
        else:
            pass
    recipe.append(ingredients)

recipe

[['tablespoon', 'yellow', 'mustard', 'seed'],
 ['tablespoon', 'brown', 'mustard', 'seed'],
 ['teaspoon', 'coriander', 'seed'],
 ['cup', 'apple', 'cider', 'vinegar'],
 ['cup', 'kosher', 'salt'],
 ['cup', 'sugar'],
 ['cup', 'chop', 'fresh', 'dill'],
 ['skinless',
  'boneless',
  'chicken',
  'thigh',
  'pound',
  'halve',
  'quarter',
  'large'],
 ['vegetable', 'oil', 'frying', 'cup'],
 ['cup', 'buttermilk'],
 ['cup', 'purpose', 'flour'],
 ['kosher', 'salt'],
 ['honey',
  'flaky',
  'sea',
  'salt',
  'such',
  'maldon',
  'toast',
  'benne',
  'sesame',
  'seed',
  'hot',
  'sauce',
  'serving'],
 ['deep', 'fry', 'thermometer']]

Found [this resource](https://stackoverflow.com/questions/26907309/create-ngrams-only-for-words-on-the-same-line-disregarding-line-breaks-with-sc), trying custom analyzer

In [None]:
# this is probably going to be the preprocessor
test_recipe_2 = " brk ".join(full_df['ingredients'][0]).lower()
doc2 = nlp(test_recipe_2)

# print(*[f'word: {word.text+" "}\t \t lemma: {word.lemma}, \t \t upos: {word.upos}' for sent in doc2.sentences for word in sent.words], sep='\n')

# This will be the tokenizer?
# lemma_test_recipe_2 = " ".join([word.lemma for sent in doc2.sentences for word in sent.words if (
#     word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
# )])
lemma_test_recipe_2 = [" ".join([word.lemma for sent in doc2.sentences for word in sent.words if (
    word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
)])]
lemma_test_recipe_2

['tablespoon yellow mustard seed brk tablespoon brown mustard seed brk teaspoon coriander seed brk cup apple cider vinegar brk cup kosher salt brk cup sugar brk cup chop fresh dill brk skinless , boneless chicken thigh ( pound ) , halve , quarter large brk vegetable oil ( frying ; cup ) brk cup buttermilk brk cup - purpose flour brk kosher salt brk honey , flaky sea salt ( such maldon ) , toast benne sesame seed , hot sauce ( serving ) brk deep - fry thermometer']

In [None]:
cv = CountVectorizer(analyzer=gen_analyzer(1, 4))
cv.fit(lemma_test_recipe_2)

In [None]:
X = cv.fit_transform(lemma_test_recipe_2)
cv.get_feature_names_out()

array(['apple', 'apple cider', 'apple cider vinegar', 'benne',
       'benne sesame', 'benne sesame seed', 'benne sesame seed hot',
       'boneless', 'boneless chicken', 'boneless chicken thigh',
       'boneless chicken thigh pound', 'brown', 'brown mustard',
       'brown mustard seed', 'buttermilk', 'chicken', 'chicken thigh',
       'chicken thigh pound', 'chicken thigh pound halve', 'chop',
       'chop fresh', 'chop fresh dill', 'cider', 'cider vinegar',
       'coriander', 'coriander seed', 'cup', 'cup apple',
       'cup apple cider', 'cup apple cider vinegar', 'cup buttermilk',
       'cup chop', 'cup chop fresh', 'cup chop fresh dill', 'cup kosher',
       'cup kosher salt', 'cup purpose', 'cup purpose flour', 'cup sugar',
       'deep', 'deep fry', 'deep fry thermometer', 'dill', 'flaky',
       'flaky sea', 'flaky sea salt', 'flaky sea salt such', 'flour',
       'fresh', 'fresh dill', 'fry', 'fry thermometer', 'frying',
       'frying cup', 'halve', 'halve quarter', 'halv

In [None]:
test_recipe_pipeline = full_df['ingredients'][0]

cv_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    'preprocessor':stanza_preprocessor,
    'tokenizer':stanza_lemmatizer, # out of memory 
    # 'stop_words':flushtrated_list,
    'analyzer': gen_analyzer(1, 4),
    'min_df':10,
}

vectorizer_model = CountVectorizer(**cv_params)

test_transform = vectorizer_model.fit_transform(test_recipe_pipeline)



ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [None]:
test_recipe_pipeline = full_df['ingredients'][0]

test_recipe_preproc = stanza_preprocessor(nlp, test_recipe_pipeline)

<class 'stanza.models.common.doc.Document'>
[
  [
    {
      "id": 1,
      "text": "1",
      "lemma": "1",
      "upos": "NUM",
      "xpos": "CD",
      "feats": "NumForm=Digit|NumType=Card",
      "head": 2,
      "deprel": "nummod",
      "start_char": 0,
      "end_char": 1,
      "ner": "S-CARDINAL",
      "multi_ner": [
        "S-CARDINAL"
      ]
    },
    {
      "id": 2,
      "text": "tablespoons",
      "lemma": "tablespoon",
      "upos": "NOUN",
      "xpos": "NNS",
      "feats": "Number=Plur",
      "head": 3,
      "deprel": "obl:npmod",
      "start_char": 2,
      "end_char": 13,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "yellow",
      "lemma": "yellow",
      "upos": "ADJ",
      "xpos": "JJ",
      "feats": "Degree=Pos",
      "head": 5,
      "deprel": "amod",
      "start_char": 14,
      "end_char": 20,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "tex

In [None]:
test_recipe_lemma = stanza_lemmatizer(test_recipe_preproc)
test_recipe_lemma

'tablespoon yellow mustard seed brk tablespoon brown mustard seed brk teaspoon coriander seed brk cup apple cider vinegar brk cup kosher salt brk cup sugar brk cup chop fresh dill brk skinless , boneless chicken thigh ( pound ) , halve , quarter large brk vegetable oil ( frying ; cup ) brk cup buttermilk brk cup - purpose flour brk kosher salt brk honey , flaky sea salt ( such maldon ) , toast benne sesame seed , hot sauce ( serving ) brk deep - fry thermometer'

In [None]:
for ln in test_recipe_lemma.split('brk'):

    # tokenize the input string (customize the regex as desired)
    terms = re.findall(u'(?u)\\b\\w+\\b', ln)

    # loop ngram creation for every number between min and max ngram length
    for ngramLength in range(1, 5):

        # find and return all ngrams
        for ngram in zip(*[terms[i:] for i in range(3)]): #<-- solution without a generator (works the same but has higher memory usage)
        # for ngram in zip(*[islice(seq, i, len(terms)) for i, seq in enumerate(tee(terms, ngramLength))]): # <-- solution using a generator
            ngram = ' '.join(ngram)
            # yield ngram
ngram


'yellow mustard seed'

In [None]:
test_recipe_lemma

'tablespoon yellow mustard seed brk tablespoon brown mustard seed brk teaspoon coriander seed brk cup apple cider vinegar brk cup kosher salt brk cup sugar brk cup chop fresh dill brk skinless , boneless chicken thigh ( pound ) , halve , quarter large brk vegetable oil ( frying ; cup ) brk cup buttermilk brk cup - purpose flour brk kosher salt brk honey , flaky sea salt ( such maldon ) , toast benne sesame seed , hot sauce ( serving ) brk deep - fry thermometer'

In [None]:
test_recipe_lemma.split(' brk ')

['tablespoon yellow mustard seed',
 'tablespoon brown mustard seed',
 'teaspoon coriander seed',
 'cup apple cider vinegar',
 'cup kosher salt',
 'cup sugar',
 'cup chop fresh dill',
 'skinless , boneless chicken thigh ( pound ) , halve , quarter large',
 'vegetable oil ( frying ; cup )',
 'cup buttermilk',
 'cup - purpose flour',
 'kosher salt',
 'honey , flaky sea salt ( such maldon ) , toast benne sesame seed , hot sauce ( serving )',
 'deep - fry thermometer']

In [None]:
gen_analyzer_tester(1,4)

<function __main__.gen_analyzer_tester.<locals>.ngrams_per_line(doc=['tablespoon yellow mustard seed brk tablespoon brown mustard seed brk teaspoon coriander seed brk cup apple cider vinegar brk cup kosher salt brk cup sugar brk cup chop fresh dill brk skinless , boneless chicken thigh ( pound ) , halve , quarter large brk vegetable oil ( frying ; cup ) brk cup buttermilk brk cup - purpose flour brk kosher salt brk honey , flaky sea salt ( such maldon ) , toast benne sesame seed , hot sauce ( serving ) brk deep - fry thermometer'])>

In [None]:
temp = full_df["ingredients"][0:50].apply("|".join)


cv_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    'preprocessor':stanza_preprocessor,
    'tokenizer':stanza_lemmatizer, # out of memory 
    # 'stop_words':flushtrated_list,
    'analyzer': gen_analyzer(1, 4),
    'min_df':10,
}

vectorizer_model = CountVectorizer(**cv_params)

test_transform = vectorizer_model.fit_transform(temp)#:5])
vectorizer_model.get_feature_names_out()



array(['1', '1 1', '1 1 2', '1 1 2 cups', '1 2', '1 2 cup', '1 2 cups',
       '1 2 teaspoon', '1 3', '1 4', '1 4 cup', '1 4 teaspoon', '1 cup',
       '1 large', '1 tablespoon', '1 teaspoon', '2', '2 cup', '2 cups',
       '2 inch', '2 tablespoons', '2 teaspoon', '2 teaspoons', '3', '3 4',
       '3 4 cup', '3 cups', '4', '4 cup', '4 teaspoon', '5', '6', '8',
       'about', 'all', 'all purpose', 'all purpose flour', 'and', 'black',
       'butter', 'chopped', 'chopped fresh', 'coarse', 'coarse kosher',
       'coarse kosher salt', 'cream', 'cup', 'cups', 'cut', 'cut into',
       'cut into 1', 'divided', 'dried', 'egg', 'extra', 'extra virgin',
       'extra virgin olive', 'extra virgin olive oil', 'finely', 'flour',
       'fresh', 'garlic', 'ground', 'halved', 'inch', 'into', 'into 1',
       'juice', 'kosher', 'kosher salt', 'large', 'lemon', 'of', 'oil',
       'olive', 'olive oil', 'onion', 'or', 'ounce', 'ounces', 'packed',
       'peeled', 'pound', 'pounds', 'purpose', 'purpos

based on the warnings (preprocessor not used since analyzer is callable and tokenizer not used since analyzer is not word), may need to move preprocessing into analyzer. And based on [this](https://stackoverflow.com/questions/63185843/scikit-learn-countvectorizer-customizing-preprocessor-tokenizer-and-analyzer), will need to incorporate stopwords into the analyzer

In [None]:
# custom ngram analyzer function, matching only ngrams that belong to the same line
def gen_analyzer_2(stanza_pipeline, minNgramLength, maxNgramLength):
    def ngrams_per_line(ingredients_list):

        lowered = " brk ".join(ingredients_list).lower()
        preproc = stanza_pipeline(lowered)
        
        lemmad = " ".join([word.lemma 
                      for sent in preproc.sentences 
                      for word in sent.words if (
                          word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                        #   and word not in STOP_WORDS
                          )
                    ])

        # analyze each line of the input string seperately
        for ln in lemmad.split(' brk '):

            # tokenize the input string (customize the regex as desired)
            # terms = re.findall(u'(?u)\\b\\w+\\b', ln)
            # terms = re.findall("(?u)\b[a-zA-Z]{2,}\b", ln)

            # loop ngram creation for every number between min and max ngram length
            for ngramLength in range(minNgramLength, maxNgramLength+1):

                # find and return all ngrams
                # for ngram in zip(*[terms[i:] for i in range(3)]): <-- solution without a generator (works the same but has higher memory usage)
                # for ngram in zip(*[islice(seq, i, len(terms)) for i, seq in enumerate(tee(terms, ngramLength))]): # <-- solution using a generator
                for ngram in zip(*[islice(seq, i, len(ln)) for i, seq in enumerate(tee(ln, ngramLength))]): # <-- solution using a generator
                    ngram = ' '.join(ngram)
                    yield ngram
                    
    return ngrams_per_line

In [None]:
temp = full_df["ingredients"][0:500]#.apply("|".join)

cv_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    'analyzer': gen_analyzer_2(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df':10,
}

vectorizer_model = CountVectorizer(**cv_params)

# test_transform = vectorizer_model.fit_transform(tqdm(temp))
test_transform = vectorizer_model.fit_transform(temp)
vectorizer_model.get_feature_names_out()

TypeError: sequence item 37: expected str instance, NoneType found

In [None]:
# was getting an error (TypeError: sequence item 37: expected str instance, NoneType found) saying that some recipes have NoneType, and wondered if there were recipes with no ingredients for some reason
full_df[full_df["ingredients"].isna()]

Unnamed: 0,id,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,cuisine_name,photo_filename,...,zest pith,zest vegetable,zinfandel,ziti,zucchini,zucchini blossom,zucchini crookneck,zucchini squash,árbol,árbol pepper


In [None]:
subset = full_df['ingredients'][0:5].apply(" brk ".join).str.lower()
subset


0    1 tablespoons yellow mustard seeds brk 1 table...
1    3 pounds small-leaved bulk spinach brk salt br...
2    3 1/2 cups all-purpose flour brk 1 tablespoon ...
3    1 small ripe avocado, preferably hass (see not...
4    2 pounds fresh tomatoes, unpeeled and cut in q...
Name: ingredients, dtype: object

In [None]:
test_preproc = full_df['ingredients'][0:5].apply(stanza_preprocessor, args=(nlp,))
test_preproc

AttributeError: 'Pipeline' object has no attribute 'apply'

In [None]:
preproc = stanza_preprocessor(nlp, full_df['ingredients'][0:5][0])
preproc

[
  [
    {
      "id": 1,
      "text": "1",
      "lemma": "1",
      "upos": "NUM",
      "xpos": "CD",
      "feats": "NumForm=Digit|NumType=Card",
      "head": 2,
      "deprel": "nummod",
      "start_char": 0,
      "end_char": 1,
      "ner": "S-CARDINAL",
      "multi_ner": [
        "S-CARDINAL"
      ]
    },
    {
      "id": 2,
      "text": "tablespoons",
      "lemma": "tablespoon",
      "upos": "NOUN",
      "xpos": "NNS",
      "feats": "Number=Plur",
      "head": 3,
      "deprel": "obl:npmod",
      "start_char": 2,
      "end_char": 13,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "yellow",
      "lemma": "yellow",
      "upos": "ADJ",
      "xpos": "JJ",
      "feats": "Degree=Pos",
      "head": 5,
      "deprel": "amod",
      "start_char": 14,
      "end_char": 20,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "mustard",
      "lemma": "mustard",
   

In [None]:
lemmad = stanza_lemmatizer(preproc)
lemmad

'tablespoon yellow mustard seed brk tablespoon brown mustard seed brk teaspoon coriander seed brk cup apple cider vinegar brk cup kosher salt brk cup sugar brk cup chop fresh dill brk skinless , boneless chicken thigh ( pound ) , halve , quarter large brk vegetable oil ( frying ; cup ) brk cup buttermilk brk cup - purpose flour brk kosher salt brk honey , flaky sea salt ( such maldon ) , toast benne sesame seed , hot sauce ( serving ) brk deep - fry thermometer'

In [None]:
full_df['prepSteps']

In [None]:
full_df['prepSteps'][0]

In [None]:
recipe_steps = full_df['prepSteps'].apply(" ".join).str.lower()

In [None]:
recipe_steps

In [None]:
# pre-calculating sentence mebeddings
# embedding_model_params = {'embedding_model': 'all-MiniLM-L6-v2'}
# embedding_model = SentenceTransformer(embedding_model_params['embedding_model'])
# embeddings = embedding_model.encode(recipe_steps, show_progress_bar=True)

In [None]:
# specify UMAP dimensionality reductions
# umap_model_params = {'n_neighbors':15, 'n_components':10, 'random_state':200}
# umap_model = UMAP(**umap_model_params)

In [None]:
# cluster with HDBSCAN
# hdbscan_model_params = {'min_cluster_size':200, 'prediction_data':True}
# hdbscan_model = HDBSCAN(**hdbscan_model_params)

In [None]:
# adding custom count vectorization
cv_params = {
    'strip_accents':"unicode",
    'lowercase':True,
    # 'preprocessor':custom_preprocessor,
    # 'tokenizer':custom_lemmatizer, # out of memory 
    'stop_words':flushtrated_list,
    'token_pattern':r"(?u)\b[a-zA-Z]{2,}\b",
    'ngram_range':(1, 4),
    'min_df':10,
}

vectorizer_model = CountVectorizer(**cv_params)



AttributeError: 'list' object has no attribute 'lower'

In [None]:
# # with open('../data/processed/bertopic_params.joblib', 'w') as fp:
# pipeline_params = {
#     'embedding':{'pretrained_sentence_embeddings': embedding_model_params},
#     'dimension_reduction': {'UMAP': umap_model_params},
#     'clustering': {'HDBSCAN': hdbscan_model_params},
#     'vectorizer': {'sklearn_countvectorizer': cv_params},
# }
# joblib.dump(pipeline_params, '../data/processed/bertopic_params.joblib')

In [None]:
with mlflow.start_run(experiment_id=get_experiment_id("initial_explicit_spec_run_3")):
    pipeline_params = {
        'language':'english',
        'top_n_words':20,
        'n_gram_range':(1, 4),
        'min_topic_size':500,
        'nr_topics':'auto',
        'verbose':True,
        'low_memory':True,
        'calculate_probabilities':True
    }
    mlflow.log_params(pipeline_params)
    
    topic_model = BERTopic(
        **pipeline_params,
        vectorizer_model=vectorizer_model
    )
    # TOKENIZERS_PARALLELISM=False

    topics, probs = topic_model.fit_transform(recipe_steps)

    topic_model.get_topic_info().to_json('../data/processed/topic_model_df.json')

    # mlflow.log_artifact('../data/processed/bertopic_params.joblib')
    mlflow.log_artifact('../data/processed/topic_model_df.json')

    print(topic_model.get_topic_info())

In [None]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

In [None]:
print_full(topic_model.get_topic_info())

In [None]:
topic_model.get_topic_info()['Representation']

# | export
def foo():
    pass

In [None]:
# | hide
# import nbdev

# nbdev.nbdev_export()