# | default_exp core

In [None]:
# | hide
# from bertopic import BERTopic
# from bertopic.vectorizers import OnlineCountVectorizer
import dagshub
from datetime import datetime
import dill as pickle
import dvc.api
# from hdbscan import HDBSCAN
from itertools import tee, islice, product
import joblib
# import mlflow
# from mlflow.models import infer_signature
import nbdev
from nbdev.showdoc import *
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import (
    CountVectorizer
    , TfidfTransformer
    , TfidfVectorizer
    , 
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer
# from src.custom_stanza_mlflow import CustomSKLearnWrapper
import src.dataframe_preprocessor as dfpp
import stanza
from tqdm import tqdm
# from umap import UMAP

In [None]:
!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128'

# | export

In [None]:
def stanza_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # print("Stanza processing start: " + str(datetime.now()))
            
            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # print("Stanza processing end/lemmatizing start: " + str(datetime.now()))
            
            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            
            # print("Stanza lemmatizing end: " + str(datetime.now()))

            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):

                # tokenize the input string (customize the regex as desired)
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                # loop ngram creation for every number between min and max ngram length
                for ngramLength in range(minNgramLength, maxNgramLength + 1):

                    # find and return all ngrams
                    # for ngram in zip(*[terms[i:] for i in range(3)]):
                    # <-- solution without a generator (works the same but has higher memory usage)
                    for ngram in zip(
                        *[
                            islice(seq, i, len(terms))
                            for i, seq in enumerate(tee(terms, ngramLength))
                        ]
                    ):  # <-- solution using a generator

                        ngram = " ".join(map(str, ngram))
                        yield ngram

        # print("n_gram creation end: " + str(datetime.now()))
        return ngrams_per_line

In [None]:
def groq_mixtral_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):

                # tokenize the input string (customize the regex as desired)
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                # loop ngram creation for every number between min and max ngram length
                for ngram_length in range(minNgramLength, maxNgramLength + 1):
                    ngrams = zip(*[seq[i:] for i in range(ngram_length)])
                    for ngram in map(" ".join, ngrams):
                        yield ngram

        # print("n_gram creation end: " + str(datetime.now()))
        return ngrams_per_line

In [None]:
def groq_llama2_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):
                terms = re.split(r"\b[a-zA-Z]{2,}\b", ln)
                for ngram_length in range(minNgramLength, maxNgramLength + 1):
                    for ngram in product(*[terms[i:] for i in range(ngram_length)]):
                        yield " ".join(map(str, ngram))

        # print("n_gram creation end: " + str(datetime.now()))
        return ngrams_per_line

In [None]:
def claude_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):
                at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                for ngram_length in range(minNgramLength, maxNgramLength + 1):
                    # Use itertools.tee to create multiple independent iterators
                    iterators = tee(terms, ngram_length)

                    # Use itertools.islice to slice each iterator
                    for i, it in enumerate(iterators):
                        iterators[i] = islice(it, i, len(terms))

                    # Yield n-grams of the desired length
                    yield from (" ".join(ngram) for ngram in zip(*iterators))
                    
        return ngrams_per_line

In [None]:
def phind70b_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, lemmad)

                # Loop n-gram creation for every number between min and max n-gram length
                for ngramLength in range(minNgramLength, maxNgramLength + 1):
                    # Use a sliding window approach to generate n-grams
                    for i in range(len(terms) - ngramLength + 1):
                        ngram = " ".join(terms[i:i+ngramLength])
                        yield ngram
                    
        return ngrams_per_line

In [None]:
def generate_ngrams(lemmad, minNgramLength, maxNgramLength):
    # Tokenize the input string
    at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
    terms = re.split(at_least_two_english_characters_whole_words, lemmad)
    print(terms)

    # Loop n-gram creation for every number between min and max n-gram length
    for ngramLength in range(minNgramLength, maxNgramLength + 1):
        # Use a sliding window approach to generate n-grams
        for i in range(len(terms) - ngramLength + 1):
            ngram = " ".join(terms[i:i+ngramLength])
            print(ngram)
            yield ngram


In [None]:
# Example usage
lemmad = "This is a test sentence for n-gram generation."
minNgramLength = 2
maxNgramLength = 3

for ngram in generate_ngrams(lemmad, minNgramLength, maxNgramLength):
    print(ngram)

# didn't print anything

['This is a test sentence for n-gram generation.']


In [None]:
# The print statements will be executed whenever the generator is iterated over. If you just want to see what the iterator prints, you can call it in a list comprehension without saving the result.

lemmad = "This is a test sentence for n-gram generation."
minNgramLength = 1
maxNgramLength = 3

[ngram for ngram in generate_ngrams(lemmad, minNgramLength, maxNgramLength)]
    

['This is a test sentence for n-gram generation.']
This is a test sentence for n-gram generation.


['This is a test sentence for n-gram generation.']

In [None]:
def stanza_analyzer_no_lemma(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            
            # Time O(n)
            # preproc = stanza_pipeline(lowered)

            # print("Stanza processing end/lemmatizing start: " + str(datetime.now()))
            
            # Suspicion that this is O(n^2)
            # lemmad = " ".join(
            #      map(
            #           str, 
            #             [
            #                word.lemma
            #                for sent in preproc.sentences
            #                for word in sent.words
            #                if (
            #                     word.upos
            #                     not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
            #                     and word is not None
            #                     )
            #             ],
            #      )
            # )
                        
            
            # print("Stanza lemmatizing end: " + str(datetime.now()))

            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lowered.split(" brk "):

                # tokenize the input string (customize the regex as desired)
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                # loop ngram creation for every number between min and max ngram length
                for ngramLength in range(minNgramLength, maxNgramLength + 1):

                    # find and return all ngrams
                    # for ngram in zip(*[terms[i:] for i in range(3)]):
                    # <-- solution without a generator (works the same but has higher memory usage)
                    for ngram in zip(
                        *[
                            islice(seq, i, len(terms))
                            for i, seq in enumerate(tee(terms, ngramLength))
                        ]
                    ):  # <-- solution using a generator

                        ngram = " ".join(map(str, ngram))
                        yield ngram

        return ngrams_per_line

In [None]:
# def custom_analyzer(word_list, stanza_pipeline, minNgramLength, maxNgramLength, lemmatize=True):
#     lowered = " brk ".join([word.lower() for word in word_list if word is not None])

#     preproc = stanza_pipeline(lowered)
    
#     if lemmatize:
#         lemmad = " ".join(map(str,
#                             [word.lemma
#                             for sent in preproc.sentences 
#                             for word in sent.words if (
#                                 word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ", "PUNCT"]
#                                 and word is not None
#                             )]
#                         )
#                     )
#     else:
#         lemmad = " ".join(map(str,
#                             [word.text
#                             for sent in preproc.sentences 
#                             for word in sent.words if (
#                                 word is not None
#                             )]
#                         )
#                     )
#     # analyze each line of the input string seperately
#     for ln in lemmad.split(' brk '):
#         # tokenize the input string (customize the regex as desired)
#         at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
#         terms = re.split(at_least_two_english_characters_whole_words, ln)

#         # loop ngram creation for every number between min and max ngram length
#         for ngramLength in range(minNgramLength, maxNgramLength+1):

#             # find and return all ngrams
#             # for ngram in zip(*[terms[i:] for i in range(3)]): 
#                 # <-- solution without a generator (works the same but has higher memory usage)
#             for ngram in zip(*[islice(seq, i, len(terms)) for i, seq in enumerate(tee(terms, ngramLength))]):   # <-- solution using a generator
                
#                 ngram = ' '.join(map(str, ngram))
#                 # yield ngram
#                 return str(ngram)


In [None]:
# new groq/mixtral suggestion by using entire function with external/internal
import more_itertools

def groq_mixtral_itertools_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):

    def ngrams_per_line(ingredients_list):

        lowered = " brk ".join(
                map(str, [ingred for ingred in ingredients_list if ingred is not None])
            ).lower()

        if lowered is None:
            lowered = "Missing ingredients"

        preproc = stanza_pipeline(lowered)

        lemmad = " ".join(
            map(
                str,
                [
                    word.lemma
                    for sent in preproc.sentences
                    for word in sent.words
                    if (
                        word.upos
                        not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                        and word is not None
                    )
                ],
            )
        )

        for ln in lemmad.split(" brk "):

            at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
            terms = [
                term for term in re.split(at_least_two_english_characters_whole_words, ln) if term
            ]

            for i in range(len(terms) - maxNgramLength + 1):
                ngram = terms[i : i + maxNgramLength]
                ngram = " ".join(ngram)
                yield ngram

    return ngrams_per_line

In [None]:
# | hide
# nbdev.nbdev_export()

### Data Preparation

In [None]:
# instantiate stanza pipeline
stanza.download('en')
nlp = stanza.Pipeline('en', 
                    depparse_batch_size=50, 
                    depparse_min_length_to_batch_separately=50,
                    verbose=True,
                    use_gpu=False, # set to true when on cloud/not on streaming computer
                    batch_size=100
                    )

# load raw data and preprocess/clean
data = dvc.api.read(
    path='../data/recipes-en-201706/epicurious-recipes_m2.json'
    , mode='r')
raw_df = pd.read_json(data)
print('\n')
print('--------------')
print('Raw Dataframe:', end='\n')
print(raw_df.head())
print(raw_df.shape)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-03-14 21:35:14 INFO: Downloading default packages for language: en (English) ...


2024-03-14 21:35:15 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2024-03-14 21:35:18 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2024-03-14 21:35:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-03-14 21:35:19 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2024-03-14 21:35:19 INFO: Using device: cpu
2024-03-14 21:35:19 INFO: Loading: tokenize
2024-03-14 21:35:19 INFO: Loading: pos
2024-03-14 21:35:19 INFO: Loading: lemma
2024-03-14 21:35:19 INFO: Loading: constituency
2024-03-14 21:35:19 INFO: Loading: depparse
2024-03-14 21:35:20 INFO: Loading: sentiment
2024-03-14 21:35:20 INFO: Loading: ner
2024-03-14 21:35:20 INFO: Done loading processors!




--------------
Raw Dataframe:
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000Z   

                       

In [None]:
# take sample and train/test split 
subset_df = raw_df.sample(n=100, random_state=45)
train_df, test_df = train_test_split(subset_df,test_size=0.5, random_state=45)

# pre_proc_df is cleaned dataframe
to_nlp_df = dfpp.preprocess_dataframe(train_df)
print('\n')
print('--------------')
print('Preprocessed Dataframe:', end='\n')
print(to_nlp_df.head())
print(to_nlp_df.shape)

# create subset for dev purposes
# to_nlp_df = pre_proc_df
# print('\n')
# print('-' * 80)
# print('Subset Dataframe:', end='\n')
# print(to_nlp_df.head())
# print(to_nlp_df.shape)



--------------
Preprocessed Dataframe:
                                                                        dek  \
id                                                                            
54a4270b19925f464b37c1dc                                                      
54a42cde19925f464b3809d2  Green chiles pickled in soy sauce and vinegar ...   
54a433036529d92b2c015de3  This soup features the flavors of India: aroma...   
54a451926529d92b2c01eda8                                                      
54a430876529d92b2c013e2b  Brown sugar and molasses are balanced by fresh...   

                                                                        hed  \
id                                                                            
54a4270b19925f464b37c1dc  Grilled Hearts of Romaine with Blue Cheese Vin...   
54a42cde19925f464b3809d2                              Soy-Pickled Jalapeños   
54a433036529d92b2c015de3  Curried Potato and Spinach Soup with Onion Sal...   
54a4519265

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=1
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


fit_transform start: 2024-03-14 21:35:22.742604


100%|██████████| 50/50 [01:30<00:00,  1.80s/it]


fit_transform end: 2024-03-14 21:36:52.876082


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer_no_lemma(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':1,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


fit_transform start: 2024-03-14 21:36:52.913984


100%|██████████| 50/50 [00:00<00:00, 12807.04it/s]


fit_transform end: 2024-03-14 21:36:52.925909


In [None]:
transformed_recipe

Unnamed: 0_level_0,Unnamed: 1_level_0,"*available at natural foods stores, juice bars, and some specialty foods shops",*available online and from suppliers of british products.,*pink pickled ginger slices are available at asian markets and also at some specialty foods stores.,"1 (12-inch-long) loaf of italian bread (about 2 to 3 inches in diameter), cut into 20 slices, about 1/2 inch thick","1 (7-ounce) jar roasted red bell peppers, drained, patted dry, and cut into thin strips",1 (8)-ounce package halloumi cheese,"1 1-pound unpeeled eggplant, cut into 3/4-inch cubes","1 1/2 cups (6 oz) pecans, toasted and chopped",1 1/2 cups chopped onion,...,salt,salt and freshly ground black pepper,"salt and freshly ground black pepper, to taste",salt to rub inside chicken,"special equipment: 2 muffin tins, each with 12 (1/2-cup) muffin cups",special equipment: a japanese benriner or other adjustable-blade slicer; a nonstick bakeware liner such as silpat,"special equipment: large ceramic heatproof bowl or 8 (8-ounce) ramekins, parchment paper","unsalted butter, softened, for spreading on the biscuits",vanilla or espresso whipped buttercream,vegetable oil for frying
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54a4270b19925f464b37c1dc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a42cde19925f464b3809d2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a433036529d92b2c015de3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a451926529d92b2c01eda8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.301511,0.0,0.0,0.0,0.0,0.0,0.0
54a430876529d92b2c013e2b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a453df6529d92b2c020687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.270808,0.0,...,0.0,0.0,0.0,0.0,0.270808,0.0,0.0,0.0,0.0,0.0
55b0e7116284773353bf4580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a42bab6529d92b2c00ffa7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0
54a4748f19925f464b399ef2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a4356a19925f464b3875bb,0.399821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.567569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(transformed_recipe.columns)

Index(['',
       '*available at natural foods stores, juice bars, and some specialty foods shops',
       '*available online and from suppliers of british products.',
       '*pink pickled ginger slices are available at asian markets and also at some specialty foods stores.',
       '1 (12-inch-long) loaf of italian bread (about 2 to 3 inches in diameter), cut into 20 slices, about 1/2 inch thick',
       '1 (7-ounce) jar roasted red bell peppers, drained, patted dry, and cut into thin strips',
       '1 (8)-ounce package halloumi cheese',
       '1 1-pound unpeeled eggplant, cut into 3/4-inch cubes',
       '1 1/2 cups (6 oz) pecans, toasted and chopped',
       '1 1/2 cups chopped onion',
       ...
       'salt', 'salt and freshly ground black pepper',
       'salt and freshly ground black pepper, to taste',
       'salt to rub inside chicken',
       'special equipment: 2 muffin tins, each with 12 (1/2-cup) muffin cups',
       'special equipment: a japanese benriner or other adju

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': 'word',
    'ngram_range': (1,1),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients'].apply(" ".join).str.lower()

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

print("sklearn fit transform start: " + str(datetime.now()))
# Do fit transform on data
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("sklearn fit transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


sklearn fit transform start: 2024-03-14 21:36:53.157714


100%|██████████| 50/50 [00:00<00:00, 29094.78it/s]


sklearn fit transform end: 2024-03-14 21:36:53.167581


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=2
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


fit_transform start: 2024-03-14 21:36:53.217712


100%|██████████| 50/50 [01:27<00:00,  1.74s/it]


fit_transform end: 2024-03-14 21:38:20.419104


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer_no_lemma(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=2
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


fit_transform start: 2024-03-14 21:38:20.459487


100%|██████████| 50/50 [00:00<00:00, 11706.78it/s]


fit_transform end: 2024-03-14 21:38:20.474049


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': 'word',
    'ngram_range': (1,2),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients'].apply(" ".join).str.lower()

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

print("sklearn fit transform start: " + str(datetime.now()))

# Do fit transform on data
response = sklearn_transformer.fit_transform(tqdm(model_input)) 

print("sklearn fit transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


sklearn fit transform start: 2024-03-14 21:38:20.585796


100%|██████████| 50/50 [00:00<00:00, 18833.88it/s]


sklearn fit transform end: 2024-03-14 21:38:20.596450


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=3
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


fit_transform start: 2024-03-14 21:38:20.642642


100%|██████████| 50/50 [01:26<00:00,  1.72s/it]


fit_transform end: 2024-03-14 21:39:46.863232


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer_no_lemma(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=3
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)

print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 21:39:46.886571


100%|██████████| 50/50 [00:00<00:00, 15050.61it/s]


fit_transform end: 2024-03-14 21:39:46.896780
                                    1 teaspoon vanilla extract  \
id                                                               
54a4270b19925f464b37c1dc  0.000000                    0.000000   
54a42cde19925f464b3809d2  0.000000                    0.000000   
54a433036529d92b2c015de3  0.000000                    0.000000   
54a451926529d92b2c01eda8  0.000000                    0.000000   
54a430876529d92b2c013e2b  0.000000                    0.000000   
54a453df6529d92b2c020687  0.000000                    0.000000   
55b0e7116284773353bf4580  0.000000                    0.000000   
54a42bab6529d92b2c00ffa7  0.000000                    0.000000   
54a4748f19925f464b399ef2  0.000000                    0.000000   
54a4356a19925f464b3875bb  0.873865                    0.000000   
54a4697e6529d92b2c0279d3  0.000000                    0.000000   
54a45e426529d92b2c02488f  0.000000                    0.000000   
54a452c96529d92b2c01f889  0.00

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': 'word',
    'ngram_range': (1,4),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients'].apply(" ".join).str.lower()

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

print("sklearn fit transform start: " + str(datetime.now()))

# Do fit transform on data
response = sklearn_transformer.fit_transform(tqdm(model_input)) 

print("sklearn fit transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)

print(transformed_recipe)
print(transformed_recipe.columns)

sklearn fit transform start: 2024-03-14 21:39:46.939294


100%|██████████| 50/50 [00:00<00:00, 9618.64it/s]


sklearn fit transform end: 2024-03-14 21:39:46.955361
                                12     about  about ounces  about pound  \
id                                                                        
54a4270b19925f464b37c1dc  0.000000  0.000000      0.000000     0.000000   
54a42cde19925f464b3809d2  0.000000  0.000000      0.000000     0.000000   
54a433036529d92b2c015de3  0.000000  0.130982      0.000000     0.000000   
54a451926529d92b2c01eda8  0.000000  0.000000      0.000000     0.000000   
54a430876529d92b2c013e2b  0.000000  0.000000      0.000000     0.000000   
54a453df6529d92b2c020687  0.105507  0.000000      0.000000     0.000000   
55b0e7116284773353bf4580  0.000000  0.000000      0.000000     0.000000   
54a42bab6529d92b2c00ffa7  0.000000  0.000000      0.000000     0.000000   
54a4748f19925f464b399ef2  0.000000  0.081576      0.000000     0.000000   
54a4356a19925f464b3875bb  0.000000  0.000000      0.000000     0.000000   
54a4697e6529d92b2c0279d3  0.000000  0.000000  

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': groq_llama2_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 21:39:47.059138


100%|██████████| 50/50 [01:30<00:00,  1.82s/it]


fit_transform end: 2024-03-14 21:41:17.884159
                                                                            \
id                                                                           
54a4270b19925f464b37c1dc  0.020953  0.047372  0.092922  0.192221  0.284231   
54a42cde19925f464b3809d2  0.019839  0.045629  0.097211  0.202357  0.355116   
54a433036529d92b2c015de3  0.013378  0.031620  0.060201  0.128002  0.192155   
54a451926529d92b2c01eda8  0.000395  0.001087  0.001838  0.006957  0.008499   
54a430876529d92b2c013e2b  0.004496  0.010598  0.022802  0.054597  0.109997   
54a453df6529d92b2c020687  0.025920  0.052767  0.106459  0.203661  0.335115   
55b0e7116284773353bf4580  0.030114  0.067756  0.143040  0.289845  0.466763   
54a42bab6529d92b2c00ffa7  0.020101  0.048243  0.104528  0.225136  0.389968   
54a4748f19925f464b399ef2  0.019672  0.048713  0.103982  0.226701  0.393447   
54a4356a19925f464b3875bb  0.025478  0.053787  0.110405  0.215149  0.362356   
54a4697e6529d92b2c

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': claude_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 21:41:17.924365


  0%|          | 0/50 [00:01<?, ?it/s]


TypeError: 'tuple' object does not support item assignment

Claude code could maybe be fixable if I can find why it's trying to reassign a tuple

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': phind70b_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':1,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 21:41:44.316464


100%|██████████| 50/50 [01:25<00:00,  1.72s/it]


fit_transform end: 2024-03-14 21:43:10.212200
                          ( inch - long ) loaf italian bread ( inch diameter ) , cut slice , inch thick brk large garlic clove , halve crosswise brk salt brk ground black pepper brk cup extra-virgin olive oil brk ( - pound ) ball fresh , salt mozzarella brk tablespoon prepare basil pesto ( see cook 's note ) brk tablespoon mascarpone , soften brk large pit black olive ( mediterranean - style can california ) , end trim olive cut crosswise third brk ( - ounce ) jar roast red bell pepper , drain , pat dry , cut thin strip  \
id                                                                                                                                                                                                                                                                                                                                                                                                                                       

phind70b code seems to turn each recipe into a column, based on the whole ingredient list

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 21:43:10.323194


100%|██████████| 50/50 [01:29<00:00,  1.78s/it]


fit_transform end: 2024-03-14 21:44:39.420875
                          cup chop onion  cup olive oil  cup sour cream  \
id                                                                        
54a4270b19925f464b37c1dc        0.000000       0.568345        0.000000   
54a42cde19925f464b3809d2        0.000000       0.000000        0.000000   
54a433036529d92b2c015de3        0.000000       0.000000        0.000000   
54a451926529d92b2c01eda8        0.000000       0.000000        0.000000   
54a430876529d92b2c013e2b        0.764889       0.000000        0.000000   
54a453df6529d92b2c020687        0.000000       0.000000        0.743765   
55b0e7116284773353bf4580        0.000000       0.000000        0.000000   
54a42bab6529d92b2c00ffa7        0.000000       0.000000        0.000000   
54a4748f19925f464b399ef2        0.000000       0.626831        0.000000   
54a4356a19925f464b3875bb        0.000000       0.626831        0.000000   
54a4697e6529d92b2c0279d3        0.000000       0.00000

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': groq_mixtral_itertools_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':1,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 21:44:39.466667


100%|██████████| 50/50 [01:28<00:00,  1.76s/it]


ValueError: empty vocabulary; perhaps the documents only contain stop words

Don't think I'm excluding stopwords in this pipeline...

Based on how fast the base sklearn n-gram creator is, it might just be better to do some text preprocessing and just ask sklearn to take over

eg, converting all words to lemmas, then converting/created tokens from the lemmas

example, spark creates n grams by taking things like "olive" and "oil" and making the bigram "olive_oil"

In [None]:
import nltk
def test_nltk_ngrams_per_line(ingredients_list):

    lowered = " brk ".join(
        map(str, [ingred for ingred in ingredients_list if ingred is not None])
    ).lower()

    if lowered is None:
        lowered = "Missing ingredients"

    preproc = nlp(lowered)

    lemmad = " ".join(
        map(
            str,
            [
                word.lemma
                for sent in preproc.sentences
                for word in sent.words
                if (
                    word.upos
                    not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                    and word is not None
                )
            ],
        )
    )

    split = lemmad.split(" brk ")

    yield nltk.everygrams(split, max_len=4)

In [None]:
print("n-gram creation start: " + str(datetime.now()))
small_input = [n_gram for n_gram in test_nltk_ngrams_per_line(model_input)]
print("n-gram creation end: " + str(datetime.now()))

n-gram creation start: 2024-03-14 21:47:20.593932
n-gram creation end: 2024-03-14 21:49:00.252510


In [None]:
print(small_input)

[<generator object everygrams>]


In [None]:
model_input

id
54a4270b19925f464b37c1dc    [1 1/2 cups white wine vinegar, 1/2 cup sugar,...
54a42cde19925f464b3809d2    [3 large fresh jalapeños (4 inches), sliced 1/...
54a433036529d92b2c015de3    [4 cups chopped red onions (about 2 large), 1 ...
54a451926529d92b2c01eda8    [1 pound chicken parts, 2 stalks celery, inclu...
54a430876529d92b2c013e2b    [2 tablespoons olive oil, 1 cup chopped onion,...
54a453df6529d92b2c020687    [3/4 cup granulated sugar, 2 1/2 cups all-purp...
55b0e7116284773353bf4580    [1 1/2 cups packed dark brown sugar, 1 cup kos...
54a42bab6529d92b2c00ffa7    [Organic unsweetened cocoa powder, for dusting...
54a4748f19925f464b399ef2    [1/2 cup olive oil, 6 tablespoons fresh lime j...
54a4356a19925f464b3875bb    [3 tablespoons white wine vinegar, 2 tablespoo...
54a4697e6529d92b2c0279d3    [24 chicken wings, 1/2 cup butter, 1 cup Louis...
54a45e426529d92b2c02488f    [9 cups 1/2- to 3/4-inch pieces French bread c...
54a452c96529d92b2c01f889    [4 teaspoons finely chopped rosem

In [None]:
joined_with_brk = model_input.str.join(" brk ")
joined_with_brk

id
54a4270b19925f464b37c1dc    1 1/2 cups white wine vinegar brk 1/2 cup suga...
54a42cde19925f464b3809d2    3 large fresh jalapeños (4 inches), sliced 1/8...
54a433036529d92b2c015de3    4 cups chopped red onions (about 2 large) brk ...
54a451926529d92b2c01eda8    1 pound chicken parts brk 2 stalks celery, inc...
54a430876529d92b2c013e2b    2 tablespoons olive oil brk 1 cup chopped onio...
54a453df6529d92b2c020687    3/4 cup granulated sugar brk 2 1/2 cups all-pu...
55b0e7116284773353bf4580    1 1/2 cups packed dark brown sugar brk 1 cup k...
54a42bab6529d92b2c00ffa7    Organic unsweetened cocoa powder, for dusting ...
54a4748f19925f464b399ef2    1/2 cup olive oil brk 6 tablespoons fresh lime...
54a4356a19925f464b3875bb    3 tablespoons white wine vinegar brk 2 tablesp...
54a4697e6529d92b2c0279d3    24 chicken wings brk 1/2 cup butter brk 1 cup ...
54a45e426529d92b2c02488f    9 cups 1/2- to 3/4-inch pieces French bread cu...
54a452c96529d92b2c01f889    4 teaspoons finely chopped rosema

In [None]:
joined_with_brk[joined_with_brk.isnull()]

Series([], Name: ingredients, dtype: object)

In [None]:
stanzad = nlp(joined_with_brk.tolist())

ValueError: If neither 'pretokenized' or 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object.  Got <class 'list'>

In [None]:
print([_ for _ in map(nlp, joined_with_brk)])

[[
  [
    {
      "id": 1,
      "text": "1",
      "lemma": "1",
      "upos": "NUM",
      "xpos": "CD",
      "feats": "NumForm=Digit|NumType=Card",
      "head": 3,
      "deprel": "nummod",
      "start_char": 0,
      "end_char": 1,
      "ner": "B-CARDINAL",
      "multi_ner": [
        "B-CARDINAL"
      ]
    },
    {
      "id": 2,
      "text": "1/2",
      "lemma": "1/2",
      "upos": "NUM",
      "xpos": "CD",
      "feats": "NumForm=Word|NumType=Card",
      "head": 1,
      "deprel": "compound",
      "start_char": 2,
      "end_char": 5,
      "ner": "E-CARDINAL",
      "multi_ner": [
        "E-CARDINAL"
      ]
    },
    {
      "id": 3,
      "text": "cups",
      "lemma": "cup",
      "upos": "NOUN",
      "xpos": "NNS",
      "feats": "Number=Plur",
      "head": 15,
      "deprel": "nmod:npmod",
      "start_char": 6,
      "end_char": 10,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "white",
      "lemma"

1. There is a normalize function that is performant on pandas dataframes, just call pandas.Series.str.normalize()
2. Can also lowercase with pandas.Series.str.lower()
Combined with str.join and str.split, may be able to speed things up for text pre processing

However, need to write a custom function that I can use either map or apply with. This custome function can do the Stanza lemmatization

In [None]:
test_transforms = model_input.str.join(" brk ").str.normalize('NFKC').str.lower().fillna("Missing ingredients")

In [None]:
stanza_pipelined = nlp(test_transforms[0])

In [None]:
def mini_stanza_function(row, stanza_pipeline):
    lemmafied = " ".join(str(word.lemma) 
                         for sent in stanza_pipeline(row).sentences
                         for word in sent.words
                         if (word.upos 
                             not in ["NUM", 
                                     "DET", 
                                     "ADV", 
                                     "CCONJ", 
                                     "ADP", 
                                     "SCONJ",
                                     "PUNCT"
                                     ]
                            and word is not None
                            )
                        )
    return lemmafied

In [None]:
mini_stanza_function(test_transforms[0], nlp)

'cup white wine vinegar brk cup sugar brk cup water brk turkish bay leave brk teaspoon dry crush red pepper brk coarse kosher salt brk pound red onion slice brk cup white wine vinegar brk teaspoon dijon mustard brk cup olive oil brk cup crumble blue cheese brk heart romaine quarter lengthwise crumble blue cheese garnish'

In [None]:
lemmafied = test_transforms.apply(lambda x: mini_stanza_function(x, nlp))

In [None]:
lemmafied

id
54a4270b19925f464b37c1dc    cup white wine vinegar brk cup sugar brk cup w...
54a42cde19925f464b3809d2    large fresh jalapeño inch slice inch thick brk...
54a433036529d92b2c015de3    cup chop red onion large brk tablespoon sunflo...
54a451926529d92b2c01eda8    pound chicken part brk stalk celery include le...
54a430876529d92b2c013e2b    tablespoon olive oil brk cup chop onion brk cu...
54a453df6529d92b2c020687    cup granulate sugar brk cup purpose flour brk ...
55b0e7116284773353bf4580    cup pack dark brown sugar brk cup kosher salt ...
54a42bab6529d92b2c00ffa7    organic unsweetened cocoa powder dust pan brk ...
54a4748f19925f464b399ef2    cup olive oil brk tablespoon fresh lime juice ...
54a4356a19925f464b3875bb    tablespoon white wine vinegar brk tablespoon c...
54a4697e6529d92b2c0279d3    chicken wing brk cup butter brk cup louisiana ...
54a45e426529d92b2c02488f    cup inch piece French bread cube crust ounce b...
54a452c96529d92b2c01f889    teaspoon chop rosemary brk teaspo

In [None]:
def test_ngrams_maker(min_ngram_length, max_ngram_length):
    def ngrams_per_line(row):
        for ln in row.split(" brk "):
            at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b"
            terms = re.findall(at_least_two_english_characters_whole_words, ln)
            for ngramLength in range(min_ngram_length, max_ngram_length):

                # find and return all ngrams
                # for ngram in zip(*[terms[i:] for i in range(3)]):
                # <-- solution without a generator (works the same but has higher memory usage)
                for ngram in (word for i in range(len(terms) - ngramLength + 1) for word in (" ".join(terms[i:i+ngramLength]),)):
                    yield ngram
    return ngrams_per_line

In [None]:
ngramfied = lemmafied.apply(lambda x: test_ngrams_maker(x, 1, 4))

In [None]:
ngramfied

id
54a4270b19925f464b37c1dc    <generator object test_ngrams_maker at 0x7f2d1...
54a42cde19925f464b3809d2    <generator object test_ngrams_maker at 0x7f2d1...
54a433036529d92b2c015de3    <generator object test_ngrams_maker at 0x7f2d1...
54a451926529d92b2c01eda8    <generator object test_ngrams_maker at 0x7f2d1...
54a430876529d92b2c013e2b    <generator object test_ngrams_maker at 0x7f2d1...
54a453df6529d92b2c020687    <generator object test_ngrams_maker at 0x7f2d1...
55b0e7116284773353bf4580    <generator object test_ngrams_maker at 0x7f2d1...
54a42bab6529d92b2c00ffa7    <generator object test_ngrams_maker at 0x7f2d1...
54a4748f19925f464b399ef2    <generator object test_ngrams_maker at 0x7f2d1...
54a4356a19925f464b3875bb    <generator object test_ngrams_maker at 0x7f2d1...
54a4697e6529d92b2c0279d3    <generator object test_ngrams_maker at 0x7f2d1...
54a45e426529d92b2c02488f    <generator object test_ngrams_maker at 0x7f2d1...
54a452c96529d92b2c01f889    <generator object test_ngrams_mak

In [None]:
lemmafied[0]

'cup white wine vinegar brk cup sugar brk cup water brk turkish bay leave brk teaspoon dry crush red pepper brk coarse kosher salt brk pound red onion slice brk cup white wine vinegar brk teaspoon dijon mustard brk cup olive oil brk cup crumble blue cheese brk heart romaine quarter lengthwise crumble blue cheese garnish'

In [None]:
print([ln for ln in lemmafied[0].split(" brk ")])

['cup white wine vinegar', 'cup sugar', 'cup water', 'turkish bay leave', 'teaspoon dry crush red pepper', 'coarse kosher salt', 'pound red onion slice', 'cup white wine vinegar', 'teaspoon dijon mustard', 'cup olive oil', 'cup crumble blue cheese', 'heart romaine quarter lengthwise crumble blue cheese garnish']


In [None]:
for ln in lemmafied[0].split(" brk "):
    # tokenize the input string (customize the regex as desired)
    # at_least_two_english_characters_whole_words = u"(?u)\b[a-zA-Z]{2,}\b"
    at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b"
    # at_least_two_english_characters_whole_words = u'(?u)\\b\\w+\\b'

    terms = re.findall(at_least_two_english_characters_whole_words, ln)
    print("terms")
    print(terms)
    print("\n")
    for ngramLength in range(1, 5):

        # find and return all ngrams
        # for ngram in zip(*[terms[i:] for i in range(3)]):
        # <-- solution without a generator (works the same but has higher memory usage)
        print('ngrams')
        for ngram in zip(
            *[
                islice(seq, i, len(terms))
                for i, seq in enumerate(tee(terms, ngramLength))
            ]
        ):  # <-- solution using a generator

            ngram = " ".join(ngram)
            print(ngram)
        print("\n")

terms
['cup', 'white', 'wine', 'vinegar']


ngrams
cup
white
wine
vinegar


ngrams
cup white
white wine
wine vinegar


ngrams
cup white wine
white wine vinegar


ngrams
cup white wine vinegar


terms
['cup', 'sugar']


ngrams
cup
sugar


ngrams
cup sugar


ngrams


ngrams


terms
['cup', 'water']


ngrams
cup
water


ngrams
cup water


ngrams


ngrams


terms
['turkish', 'bay', 'leave']


ngrams
turkish
bay
leave


ngrams
turkish bay
bay leave


ngrams
turkish bay leave


ngrams


terms
['teaspoon', 'dry', 'crush', 'red', 'pepper']


ngrams
teaspoon
dry
crush
red
pepper


ngrams
teaspoon dry
dry crush
crush red
red pepper


ngrams
teaspoon dry crush
dry crush red
crush red pepper


ngrams
teaspoon dry crush red
dry crush red pepper


terms
['coarse', 'kosher', 'salt']


ngrams
coarse
kosher
salt


ngrams
coarse kosher
kosher salt


ngrams
coarse kosher salt


ngrams


terms
['pound', 'red', 'onion', 'slice']


ngrams
pound
red
onion
slice


ngrams
pound red
red onion
onion slice


ngra

In [None]:
test_ngrams_maker(lemmafied[0], 1, 4)

<generator object test_ngrams_maker>

In [None]:
list(test_ngrams_maker(lemmafied[0], 1, 4))

['',
 ' ',
 ' a ',
 ' ',
 ' ',
 ' n-',
 ' ',
 '.',
 '  ',
 '   a ',
 ' a   ',
 '   ',
 '   n-',
 ' n-  ',
 '  .',
 '    a ',
 '   a   ',
 ' a     ',
 '     n-',
 '   n-  ',
 ' n-   .',
 '    a   ',
 '   a     ',
 ' a       n-',
 '     n-  ',
 '   n-   .']

In [None]:
" ".join(str(word.lemma) 
         for sent in nlp(test_transforms).sentences 
         for word in sent.words 
         if (word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                            and word is not None
                            ))

AssertionError: input should be either str, list or Document

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': test_ngrams_maker(
        min_ngram_length=1,
        max_ngram_length=4
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 23:24:12.764496


  0%|          | 0/50 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'split'

In [None]:
nlp2 = stanza.Pipeline('en', 
                    depparse_batch_size=50, 
                    depparse_min_length_to_batch_separately=50,
                    verbose=True,
                    use_gpu=True, # set to true when on cloud/not on streaming computer
                    batch_size=100
                    )


2024-03-14 21:49:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-03-14 21:49:35 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2024-03-14 21:49:35 INFO: Using device: cuda
2024-03-14 21:49:35 INFO: Loading: tokenize
2024-03-14 21:49:38 INFO: Loading: pos
2024-03-14 21:49:38 INFO: Loading: lemma
2024-03-14 21:49:38 INFO: Loading: constituency
2024-03-14 21:49:39 INFO: Loading: depparse
2024-03-14 21:49:39 INFO: Loading: sentiment
2024-03-14 21:49:39 INFO: Loading: ner
2024-03-14 21:49:40 INFO: Done loading processors!


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp2,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-14 21:49:43.334964


100%|██████████| 50/50 [00:24<00:00,  2.07it/s]


fit_transform end: 2024-03-14 21:50:07.457198
                          cup chop onion  cup olive oil  cup sour cream  \
id                                                                        
54a4270b19925f464b37c1dc        0.000000       0.568345        0.000000   
54a42cde19925f464b3809d2        0.000000       0.000000        0.000000   
54a433036529d92b2c015de3        0.000000       0.000000        0.000000   
54a451926529d92b2c01eda8        0.000000       0.000000        0.000000   
54a430876529d92b2c013e2b        0.764889       0.000000        0.000000   
54a453df6529d92b2c020687        0.000000       0.000000        0.743765   
55b0e7116284773353bf4580        0.000000       0.000000        0.000000   
54a42bab6529d92b2c00ffa7        0.000000       0.000000        0.000000   
54a4748f19925f464b399ef2        0.000000       0.626831        0.000000   
54a4356a19925f464b3875bb        0.000000       0.626831        0.000000   
54a4697e6529d92b2c0279d3        0.000000       0.00000

In [None]:
# load from MLflow
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')

# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    # 'strip_accents':"unicode",
    # 'lowercase':True,
    'analyzer': CustomSKLearnAnalyzer().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df':3,
    'binary':True
}

# bertopic_params are a superset of cv_params
# bertopic_params = {
#     'top_n_words':20,
#     'min_topic_size':5,
#     'nr_topics':'auto',
#     'verbose':True,
#     'low_memory':True,
#     'calculate_probabilities':True
# }

# update bertopic_params to include cv_params
# bertopic_params.update(cv_params)

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'OneHotEncoder'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(sklearn_transformer_params)
# pipeline_params.update(bertopic_params)

# signature = infer_signature(model_input=to_nlp_df['ingredients'],
#                             )

with mlflow.start_run(experiment_id=mlflow_exp_id):    
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts
    # Will be useful in STAGING/Evaluation
    
    # LOG MODEL
    # Instantiate sklearn OneHotEncoder
    sklearn_transformer = CountVectorizer(**sklearn_transformer_params)

    print('\n')
    print('-' * 80)
    print('sklearn fit transform on ingredients:', end='\n')

    model_input = to_nlp_df['ingredients']

    print('\n')
    print('-' * 80)
    print('Input Data: ', end='\n')
    print(model_input)

    print('\n')
    print('-' * 80)
    print('Input Data Shape: ', end='\n')
    print(model_input.shape)

    print('\n')
    print('-' * 80)
    print('Random 3 Records from Input Data: ', end='\n')
    print(model_input.sample(3, random_state=200))

    # Do fit transform on data
    response = sklearn_transformer.fit_transform(tqdm(model_input)) 
    
    transformed_recipe = pd.DataFrame(
            response.toarray(),
            columns=sklearn_transformer.get_feature_names_out(),
            index=model_input.index
    )

    signature = infer_signature(model_input=model_input,
                                model_output=transformed_recipe
                                )

    print('\n')
    print('-' * 80)
    print('Transformed Data:', end='\n')
    print(transformed_recipe)
    
    # mlflow.pyfunc.save_model(
    #     path=model_directory,
    #     code_path=["../src/"],
    #     python_model=CustomSKLearnWrapper(),
    #     input_example=to_nlp_df['ingredients'][0],    
    #     artifacts=artifacts
    # )

     # joblib.dump(sklearn_transformer, sklearn_transformer_path)
    with open(sklearn_transformer_path, "wb") as fo:
        pickle.dump(sklearn_transformer, fo)
        # mlflow.log_artifact(sklearn_transformer_path,
        #                     artifact_path='sklearn_transformer')

    # joblib.dump(transformed_recipe, transformed_recipes_path)
    with open(transformed_recipes_path, "wb") as fo:
        pickle.dump(transformed_recipe, fo)
        # mlflow.log_artifact(transformed_recipes_path,
        #                     artifact_path='transformed_recipes')


    model_info = mlflow.pyfunc.log_model( 
        code_path=["../src/"],
        python_model=CustomSKLearnWrapper(),
        input_example=to_nlp_df['ingredients'][0],
        signature=signature,        
        artifact_path="sklearn_model",
        artifacts=artifacts
        ) 

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    
    

In [None]:
response

In [None]:
test_predictor = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

In [None]:
# pre_proc_df is cleaned dataframe
pre_proc_test_df = dfpp.preprocess_dataframe(test_df)
print('\n')
print('--------------')
print('Preprocessed Dataframe: ', end='\n')
print(pre_proc_test_df.head())
print(pre_proc_test_df.shape)

# create subset for dev purposes
# to_nlp_test_df = pre_proc_test_df
# print('\n')
# print('-' * 80)
# print('Subset Dataframe:', end='\n')
# print(to_nlp_test_df.head())
# print(to_nlp_test_df.shape)

test_model_input = pre_proc_test_df['ingredients']

In [None]:
test_model_input

In [None]:
test_model_input.shape

In [None]:
test_model_input.values

In [None]:
model_info.signature.to_dict()

In [None]:
test_predictor.predict(test_model_input)

In [None]:
print('\n')
print('-' * 80)
print('Input Data: ', end='\n')
print(test_model_input)

print('\n')
print('-' * 80)
print('Input Data Shape: ', end='\n')
print(test_model_input.shape)

print('\n')
print('-' * 80)
print('Random 3 Records from Input Data: ', end='\n')
print(test_model_input.sample(3, random_state=200))

# test_response = sklearn_transformer.transform(tqdm(test_model_input)) 
test_response = sklearn_transformer.transform(test_model_input)
    
    
test_transformed_recipe = pd.DataFrame(
            test_response.toarray(),
            columns=sklearn_transformer.get_feature_names_out(),
            index=test_model_input.index
    )

In [None]:
type(test_predictor)

In [None]:
test_transformed_recipe