# | default_exp core

In [None]:
# | hide
# from bertopic import BERTopic
# from bertopic.vectorizers import OnlineCountVectorizer
import dagshub
from datetime import datetime
import dill as pickle
import dvc.api
# from hdbscan import HDBSCAN
from itertools import tee, islice, product
import joblib
# import mlflow
# from mlflow.models import infer_signature
import nbdev
from nbdev.showdoc import *
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import (
    CountVectorizer
    , TfidfTransformer
    , TfidfVectorizer
    , 
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer
# from src.custom_stanza_mlflow import CustomSKLearnWrapper
import src.dataframe_preprocessor as dfpp
import stanza
from tqdm import tqdm
# from umap import UMAP

In [None]:
!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128'

# | export

In [None]:
def stanza_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # print("Stanza processing start: " + str(datetime.now()))
            
            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # print("Stanza processing end/lemmatizing start: " + str(datetime.now()))
            
            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            
            # print("Stanza lemmatizing end: " + str(datetime.now()))

            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):

                # tokenize the input string (customize the regex as desired)
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                # loop ngram creation for every number between min and max ngram length
                for ngramLength in range(minNgramLength, maxNgramLength + 1):

                    # find and return all ngrams
                    # for ngram in zip(*[terms[i:] for i in range(3)]):
                    # <-- solution without a generator (works the same but has higher memory usage)
                    for ngram in zip(
                        *[
                            islice(seq, i, len(terms))
                            for i, seq in enumerate(tee(terms, ngramLength))
                        ]
                    ):  # <-- solution using a generator

                        ngram = " ".join(map(str, ngram))
                        yield ngram

        # print("n_gram creation end: " + str(datetime.now()))
        return ngrams_per_line

In [None]:
def groq_mixtral_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):

                # tokenize the input string (customize the regex as desired)
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                # loop ngram creation for every number between min and max ngram length
                for ngram_length in range(minNgramLength, maxNgramLength + 1):
                    ngrams = zip(*[seq[i:] for i in range(ngram_length)])
                    for ngram in map(" ".join, ngrams):
                        yield ngram

        # print("n_gram creation end: " + str(datetime.now()))
        return ngrams_per_line

In [None]:
def groq_llama2_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):
                terms = re.split(r"\b[a-zA-Z]{2,}\b", ln)
                for ngram_length in range(minNgramLength, maxNgramLength + 1):
                    for ngram in product(*[terms[i:] for i in range(ngram_length)]):
                        yield " ".join(map(str, ngram))

        # print("n_gram creation end: " + str(datetime.now()))
        return ngrams_per_line

In [None]:
def claude_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):
                at_least_two_english_characters_whole_words = r"(?u)\b\w{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                for ngram_length in range(minNgramLength, maxNgramLength + 1):
                    # Use itertools.tee to create multiple independent iterators
                    iterators = tee(terms, ngram_length)

                    # Use itertools.islice to slice each iterator
                    for i, it in enumerate(iterators):
                        iterators[i] = islice(it, i, len(terms))

                    # Yield n-grams of the desired length
                    yield from (" ".join(ngram) for ngram in zip(*iterators))
                    
        return ngrams_per_line

In [None]:
def phind70b_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # Suspicion that this is O(n^2)
            lemmad = " ".join(
                 map(
                      str, 
                        [
                           word.lemma
                           for sent in preproc.sentences
                           for word in sent.words
                           if (
                                word.upos
                                not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                                and word is not None
                                )
                        ],
                 )
            )
                        
            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lemmad.split(" brk "):
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, lemmad)

                # Loop n-gram creation for every number between min and max n-gram length
                for ngramLength in range(minNgramLength, maxNgramLength + 1):
                    # Use a sliding window approach to generate n-grams
                    for i in range(len(terms) - ngramLength + 1):
                        ngram = " ".join(terms[i:i+ngramLength])
                        yield ngram
                    
        return ngrams_per_line

In [None]:
def generate_ngrams(lemmad, minNgramLength, maxNgramLength):
    # Tokenize the input string
    at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
    terms = re.split(at_least_two_english_characters_whole_words, lemmad)
    print(terms)

    # Loop n-gram creation for every number between min and max n-gram length
    for ngramLength in range(minNgramLength, maxNgramLength + 1):
        # Use a sliding window approach to generate n-grams
        for i in range(len(terms) - ngramLength + 1):
            ngram = " ".join(terms[i:i+ngramLength])
            print(ngram)
            yield ngram


In [None]:
# Example usage
lemmad = "This is a test sentence for n-gram generation."
minNgramLength = 2
maxNgramLength = 3

for ngram in generate_ngrams(lemmad, minNgramLength, maxNgramLength):
    print(ngram)

# didn't print anything

In [None]:
# The print statements will be executed whenever the generator is iterated over. If you just want to see what the iterator prints, you can call it in a list comprehension without saving the result.

lemmad = "This is a test sentence for n-gram generation."
minNgramLength = 1
maxNgramLength = 3

[ngram for ngram in generate_ngrams(lemmad, minNgramLength, maxNgramLength)]
    

['This is a test sentence for n-gram generation.']
This is a test sentence for n-gram generation.


['This is a test sentence for n-gram generation.']

In [None]:
def stanza_analyzer_no_lemma(stanza_pipeline, minNgramLength, maxNgramLength):
        """
        Custom ngram analyzer function, matching only ngrams that belong to the same line

        The source for this was StackOverflow because I couldn't figure out how to let sklearn pipelines use arguments for custom analyzers

        Use this as the analyzer for an sklearn pipeline, and it should work

        Args:
            stanza_pipeline: Stanza pipeline
            minNgramLength: integer for the minimum ngram (usually 1)
            maxNgramLength: integer for maximum length ngram (usually should not exceed 4)

        Returns:
            A function that will be used in sklearn pipeline. Said function yields a generator

        """
        # print("text processing start: " + str(datetime.now()))

        def ngrams_per_line(word_list):
            # Time O(n)
            lowered = " brk ".join(
                 map(str,
                     [word
                      if word is not None
                      else ''
                      for word in word_list
                      ]
                    )
            ).lower()

            if lowered is None:
                lowered = "Missing ingredients"

            
            # Time O(n)
            preproc = stanza_pipeline(lowered)

            # print("Stanza processing end/lemmatizing start: " + str(datetime.now()))
            
            # Suspicion that this is O(n^2)
            # lemmad = " ".join(
            #      map(
            #           str, 
            #             [
            #                word.lemma
            #                for sent in preproc.sentences
            #                for word in sent.words
            #                if (
            #                     word.upos
            #                     not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
            #                     and word is not None
            #                     )
            #             ],
            #      )
            # )
                        
            
            # print("Stanza lemmatizing end: " + str(datetime.now()))

            # analyze each line of the input string seperately
            # Time O(word^ngram max)
            for ln in lowered.split(" brk "):

                # tokenize the input string (customize the regex as desired)
                at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
                terms = re.split(at_least_two_english_characters_whole_words, ln)

                # loop ngram creation for every number between min and max ngram length
                for ngramLength in range(minNgramLength, maxNgramLength + 1):

                    # find and return all ngrams
                    # for ngram in zip(*[terms[i:] for i in range(3)]):
                    # <-- solution without a generator (works the same but has higher memory usage)
                    for ngram in zip(
                        *[
                            islice(seq, i, len(terms))
                            for i, seq in enumerate(tee(terms, ngramLength))
                        ]
                    ):  # <-- solution using a generator

                        ngram = " ".join(map(str, ngram))
                        yield ngram

        return ngrams_per_line

In [None]:
# def custom_analyzer(word_list, stanza_pipeline, minNgramLength, maxNgramLength, lemmatize=True):
#     lowered = " brk ".join([word.lower() for word in word_list if word is not None])

#     preproc = stanza_pipeline(lowered)
    
#     if lemmatize:
#         lemmad = " ".join(map(str,
#                             [word.lemma
#                             for sent in preproc.sentences 
#                             for word in sent.words if (
#                                 word.upos not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ", "PUNCT"]
#                                 and word is not None
#                             )]
#                         )
#                     )
#     else:
#         lemmad = " ".join(map(str,
#                             [word.text
#                             for sent in preproc.sentences 
#                             for word in sent.words if (
#                                 word is not None
#                             )]
#                         )
#                     )
#     # analyze each line of the input string seperately
#     for ln in lemmad.split(' brk '):
#         # tokenize the input string (customize the regex as desired)
#         at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
#         terms = re.split(at_least_two_english_characters_whole_words, ln)

#         # loop ngram creation for every number between min and max ngram length
#         for ngramLength in range(minNgramLength, maxNgramLength+1):

#             # find and return all ngrams
#             # for ngram in zip(*[terms[i:] for i in range(3)]): 
#                 # <-- solution without a generator (works the same but has higher memory usage)
#             for ngram in zip(*[islice(seq, i, len(terms)) for i, seq in enumerate(tee(terms, ngramLength))]):   # <-- solution using a generator
                
#                 ngram = ' '.join(map(str, ngram))
#                 # yield ngram
#                 return str(ngram)


In [None]:
# new groq/mixtral suggestion by using entire function with external/internal
import more_itertools

def stanza_analyzer(stanza_pipeline, minNgramLength, maxNgramLength):

    def ngrams_per_line(ingredients_list):

        lowered = " brk ".join(
                map(str, [ingred for ingred in ingredients_list if ingred is not None])
            ).lower()

        if lowered is None:
            lowered = "Missing ingredients"

        preproc = stanza_pipeline(lowered)

        lemmad = " ".join(
            map(
                str,
                [
                    word.lemma
                    for sent in preproc.sentences
                    for word in sent.words
                    if (
                        word.upos
                        not in ["NUM", "DET", "ADV", "CCONJ", "ADP", "SCONJ"]
                        and word is not None
                    )
                ],
            )
        )

        for ln in lemmad.split(" brk "):

            at_least_two_english_characters_whole_words = "(?u)\b[a-zA-Z]{2,}\b"
            terms = [
                term for term in re.split(at_least_two_english_characters_whole_words, ln) if term
            ]

            for i in range(len(terms) - maxNgramLength + 1):
                ngram = terms[i : i + maxNgramLength]
                ngram = " ".join(ngram)
                yield ngram

    return ngrams_per_line

ModuleNotFoundError: No module named 'more_itertools'

In [None]:
# | hide
nbdev.nbdev_export()

### Data Preparation

In [None]:
# instantiate stanza pipeline
stanza.download('en')
nlp = stanza.Pipeline('en', 
                    depparse_batch_size=50, 
                    depparse_min_length_to_batch_separately=50,
                    verbose=True,
                    use_gpu=False, # set to true when on cloud/not on streaming computer
                    batch_size=100
                    )

# load raw data and preprocess/clean
data = dvc.api.read(
    path='../data/recipes-en-201706/epicurious-recipes_m2.json'
    , mode='r')
raw_df = pd.read_json(data)
print('\n')
print('--------------')
print('Raw Dataframe:', end='\n')
print(raw_df.head())
print(raw_df.shape)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-03-04 22:26:42 INFO: Downloading default packages for language: en (English) ...
2024-03-04 22:26:42 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2024-03-04 22:26:45 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2024-03-04 22:26:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-03-04 22:26:46 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2024-03-04 22:26:46 INFO: Using device: cpu
2024-03-04 22:26:46 INFO: Loading: tokenize
2024-03-04 22:26:46 INFO: Loading: pos
2024-03-04 22:26:46 INFO: Loading: lemma
2024-03-04 22:26:47 INFO: Loading: constituency
2024-03-04 22:26:47 INFO: Loading: depparse
2024-03-04 22:26:47 INFO: Loading: sentiment
2024-03-04 22:26:47 INFO: Loading: ner
2024-03-04 22:26:48 INFO: Done loading processors!




--------------
Raw Dataframe:
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000Z   

                       

In [None]:
# take sample and train/test split 
subset_df = raw_df.sample(n=100, random_state=45)
train_df, test_df = train_test_split(subset_df,test_size=0.5, random_state=45)

# pre_proc_df is cleaned dataframe
to_nlp_df = dfpp.preprocess_dataframe(train_df)
print('\n')
print('--------------')
print('Preprocessed Dataframe:', end='\n')
print(to_nlp_df.head())
print(to_nlp_df.shape)

# create subset for dev purposes
# to_nlp_df = pre_proc_df
# print('\n')
# print('-' * 80)
# print('Subset Dataframe:', end='\n')
# print(to_nlp_df.head())
# print(to_nlp_df.shape)



--------------
Preprocessed Dataframe:
                                                                        dek  \
id                                                                            
54a4270b19925f464b37c1dc                                                      
54a42cde19925f464b3809d2  Green chiles pickled in soy sauce and vinegar ...   
54a433036529d92b2c015de3  This soup features the flavors of India: aroma...   
54a451926529d92b2c01eda8                                                      
54a430876529d92b2c013e2b  Brown sugar and molasses are balanced by fresh...   

                                                                        hed  \
id                                                                            
54a4270b19925f464b37c1dc  Grilled Hearts of Romaine with Blue Cheese Vin...   
54a42cde19925f464b3809d2                              Soy-Pickled Jalapeños   
54a433036529d92b2c015de3  Curried Potato and Spinach Soup with Onion Sal...   
54a4519265

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=1
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)




--------------------------------------------------------------------------------
sklearn fit transform on ingredients:


--------------------------------------------------------------------------------
Input Data: 
id
54a4270b19925f464b37c1dc    [1 1/2 cups white wine vinegar, 1/2 cup sugar,...
54a42cde19925f464b3809d2    [3 large fresh jalapeños (4 inches), sliced 1/...
54a433036529d92b2c015de3    [4 cups chopped red onions (about 2 large), 1 ...
54a451926529d92b2c01eda8    [1 pound chicken parts, 2 stalks celery, inclu...
54a430876529d92b2c013e2b    [2 tablespoons olive oil, 1 cup chopped onion,...
54a453df6529d92b2c020687    [3/4 cup granulated sugar, 2 1/2 cups all-purp...
55b0e7116284773353bf4580    [1 1/2 cups packed dark brown sugar, 1 cup kos...
54a42bab6529d92b2c00ffa7    [Organic unsweetened cocoa powder, for dusting...
54a4748f19925f464b399ef2    [1/2 cup olive oil, 6 tablespoons fresh lime j...
54a4356a19925f464b3875bb    [3 tablespoons white wine vinegar, 2 tablespoo...


100%|██████████| 50/50 [01:32<00:00,  1.85s/it]


fit_transform end: 2024-03-04 22:28:22.851151


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer_no_lemma(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=1
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


text processing start: 2024-03-04 22:28:22.895239
n_gram creation end: 2024-03-04 22:28:22.897362
fit_transform start: 2024-03-04 22:28:22.898495


100%|██████████| 50/50 [01:31<00:00,  1.82s/it]


fit_transform end: 2024-03-04 22:29:53.934279


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': 'word',
    'ngram_range': (1,1),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients'].apply(" ".join).str.lower()

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

print("sklearn fit transform start: " + str(datetime.now()))
# Do fit transform on data
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("sklearn fit transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


sklearn fit transform start: 2024-03-04 22:29:53.963692


100%|██████████| 50/50 [00:00<00:00, 26924.53it/s]


sklearn fit transform end: 2024-03-04 22:29:53.972823


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=2
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


fit_transform start: 2024-03-04 22:29:54.029810


100%|██████████| 50/50 [01:30<00:00,  1.82s/it]


fit_transform end: 2024-03-04 22:31:24.930179


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer_no_lemma(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=2
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


text processing start: 2024-03-04 22:31:24.957531
n_gram creation end: 2024-03-04 22:31:24.959358
fit_transform start: 2024-03-04 22:31:24.960556


100%|██████████| 50/50 [01:32<00:00,  1.85s/it]


fit_transform end: 2024-03-04 22:32:57.716301


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': 'word',
    'ngram_range': (1,2),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients'].apply(" ".join).str.lower()

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

print("sklearn fit transform start: " + str(datetime.now()))

# Do fit transform on data
response = sklearn_transformer.fit_transform(tqdm(model_input)) 

print("sklearn fit transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


sklearn fit transform start: 2024-03-04 22:32:57.750064


100%|██████████| 50/50 [00:00<00:00, 12968.60it/s]


sklearn fit transform end: 2024-03-04 22:32:57.763559


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=3
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)


fit_transform start: 2024-03-04 22:32:57.797788


100%|██████████| 50/50 [01:34<00:00,  1.88s/it]


fit_transform end: 2024-03-04 22:34:31.891060


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': stanza_analyzer_no_lemma(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=3
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients']

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)

print(transformed_recipe)
print(transformed_recipe.columns)

text processing start: 2024-03-04 22:34:31.913107
n_gram creation end: 2024-03-04 22:34:31.914981
fit_transform start: 2024-03-04 22:34:31.916175


100%|██████████| 50/50 [01:32<00:00,  1.84s/it]


fit_transform end: 2024-03-04 22:36:04.031638


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': 'word',
    'ngram_range': (1,3),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

# print('\n')
# print('-' * 80)
# print('sklearn fit transform on ingredients:', end='\n')

model_input = to_nlp_df['ingredients'].apply(" ".join).str.lower()

# print('\n')
# print('-' * 80)
# print('Input Data: ', end='\n')
# print(model_input)

# print('\n')
# print('-' * 80)
# print('Input Data Shape: ', end='\n')
# print(model_input.shape)

# print('\n')
# print('-' * 80)
# print('Random 3 Records from Input Data: ', end='\n')
# print(model_input.sample(3, random_state=200))

print("sklearn fit transform start: " + str(datetime.now()))

# Do fit transform on data
response = sklearn_transformer.fit_transform(tqdm(model_input)) 

print("sklearn fit transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)

print(transformed_recipe)
print(transformed_recipe.columns)

sklearn fit transform start: 2024-03-04 22:36:04.063523


100%|██████████| 50/50 [00:00<00:00, 9913.27it/s]


sklearn fit transform end: 2024-03-04 22:36:04.077720


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': groq_llama2_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-07 17:46:57.329897


100%|██████████| 50/50 [04:07<00:00,  4.94s/it]


fit_transform end: 2024-03-07 17:51:04.448496


In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': claude_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':3,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-07 23:45:50.553590


  0%|          | 0/50 [00:07<?, ?it/s]


TypeError: 'tuple' object does not support item assignment

Claude code could maybe be fixable if I can find why it's trying to reassign a tuple

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': phind70b_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':1,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-07 23:54:10.206819


100%|██████████| 50/50 [08:32<00:00, 10.25s/it]


fit_transform end: 2024-03-08 00:02:42.814559
                          ( inch - long ) loaf italian bread ( inch diameter ) , cut slice , inch thick brk large garlic clove , halve crosswise brk salt brk ground black pepper brk cup extra-virgin olive oil brk ( - pound ) ball fresh , salt mozzarella brk tablespoon prepare basil pesto ( see cook 's note ) brk tablespoon mascarpone , soften brk large pit black olive ( mediterranean - style can california ) , end trim olive cut crosswise third brk ( - ounce ) jar roast red bell pepper , drain , pat dry , cut thin strip  \
id                                                                                                                                                                                                                                                                                                                                                                                                                                       

phind70b code seems to turn each recipe into a column, based on the whole ingredient list

In [None]:
# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {    
    'analyzer': stanza_analyzer(
        stanza_pipeline=nlp,
        minNgramLength=1,
        maxNgramLength=4
        ),
    'min_df':1,
    'binary':False
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)
print(transformed_recipe)
print(transformed_recipe.columns)

fit_transform start: 2024-03-08 00:05:43.142550


100%|██████████| 50/50 [08:10<00:00,  9.80s/it]


fit_transform end: 2024-03-08 00:13:53.372683
                          " - thick slice country - style bread  \
id                                                                
54a4270b19925f464b37c1dc                               0.000000   
54a42cde19925f464b3809d2                               0.000000   
54a433036529d92b2c015de3                               0.000000   
54a451926529d92b2c01eda8                               0.000000   
54a430876529d92b2c013e2b                               0.000000   
54a453df6529d92b2c020687                               0.000000   
55b0e7116284773353bf4580                               0.000000   
54a42bab6529d92b2c00ffa7                               0.000000   
54a4748f19925f464b399ef2                               0.000000   
54a4356a19925f464b3875bb                               0.000000   
54a4697e6529d92b2c0279d3                               0.000000   
54a45e426529d92b2c02488f                               0.000000   
54a452c96529d92b

In [None]:
# load from MLflow
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')

# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    # 'strip_accents':"unicode",
    # 'lowercase':True,
    'analyzer': CustomSKLearnAnalyzer().stanza_analyzer(stanza_pipeline=nlp, minNgramLength=1, maxNgramLength=4),
    'min_df':3,
    'binary':True
}

# bertopic_params are a superset of cv_params
# bertopic_params = {
#     'top_n_words':20,
#     'min_topic_size':5,
#     'nr_topics':'auto',
#     'verbose':True,
#     'low_memory':True,
#     'calculate_probabilities':True
# }

# update bertopic_params to include cv_params
# bertopic_params.update(cv_params)

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'OneHotEncoder'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(sklearn_transformer_params)
# pipeline_params.update(bertopic_params)

# signature = infer_signature(model_input=to_nlp_df['ingredients'],
#                             )

with mlflow.start_run(experiment_id=mlflow_exp_id):    
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts
    # Will be useful in STAGING/Evaluation
    
    # LOG MODEL
    # Instantiate sklearn OneHotEncoder
    sklearn_transformer = CountVectorizer(**sklearn_transformer_params)

    print('\n')
    print('-' * 80)
    print('sklearn fit transform on ingredients:', end='\n')

    model_input = to_nlp_df['ingredients']

    print('\n')
    print('-' * 80)
    print('Input Data: ', end='\n')
    print(model_input)

    print('\n')
    print('-' * 80)
    print('Input Data Shape: ', end='\n')
    print(model_input.shape)

    print('\n')
    print('-' * 80)
    print('Random 3 Records from Input Data: ', end='\n')
    print(model_input.sample(3, random_state=200))

    # Do fit transform on data
    response = sklearn_transformer.fit_transform(tqdm(model_input)) 
    
    transformed_recipe = pd.DataFrame(
            response.toarray(),
            columns=sklearn_transformer.get_feature_names_out(),
            index=model_input.index
    )

    signature = infer_signature(model_input=model_input,
                                model_output=transformed_recipe
                                )

    print('\n')
    print('-' * 80)
    print('Transformed Data:', end='\n')
    print(transformed_recipe)
    
    # mlflow.pyfunc.save_model(
    #     path=model_directory,
    #     code_path=["../src/"],
    #     python_model=CustomSKLearnWrapper(),
    #     input_example=to_nlp_df['ingredients'][0],    
    #     artifacts=artifacts
    # )

     # joblib.dump(sklearn_transformer, sklearn_transformer_path)
    with open(sklearn_transformer_path, "wb") as fo:
        pickle.dump(sklearn_transformer, fo)
        # mlflow.log_artifact(sklearn_transformer_path,
        #                     artifact_path='sklearn_transformer')

    # joblib.dump(transformed_recipe, transformed_recipes_path)
    with open(transformed_recipes_path, "wb") as fo:
        pickle.dump(transformed_recipe, fo)
        # mlflow.log_artifact(transformed_recipes_path,
        #                     artifact_path='transformed_recipes')


    model_info = mlflow.pyfunc.log_model( 
        code_path=["../src/"],
        python_model=CustomSKLearnWrapper(),
        input_example=to_nlp_df['ingredients'][0],
        signature=signature,        
        artifact_path="sklearn_model",
        artifacts=artifacts
        ) 

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    
    

In [None]:
response

In [None]:
test_predictor = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

In [None]:
# pre_proc_df is cleaned dataframe
pre_proc_test_df = dfpp.preprocess_dataframe(test_df)
print('\n')
print('--------------')
print('Preprocessed Dataframe: ', end='\n')
print(pre_proc_test_df.head())
print(pre_proc_test_df.shape)

# create subset for dev purposes
# to_nlp_test_df = pre_proc_test_df
# print('\n')
# print('-' * 80)
# print('Subset Dataframe:', end='\n')
# print(to_nlp_test_df.head())
# print(to_nlp_test_df.shape)

test_model_input = pre_proc_test_df['ingredients']

In [None]:
test_model_input

In [None]:
test_model_input.shape

In [None]:
test_model_input.values

In [None]:
model_info.signature.to_dict()

In [None]:
test_predictor.predict(test_model_input)

In [None]:
print('\n')
print('-' * 80)
print('Input Data: ', end='\n')
print(test_model_input)

print('\n')
print('-' * 80)
print('Input Data Shape: ', end='\n')
print(test_model_input.shape)

print('\n')
print('-' * 80)
print('Random 3 Records from Input Data: ', end='\n')
print(test_model_input.sample(3, random_state=200))

# test_response = sklearn_transformer.transform(tqdm(test_model_input)) 
test_response = sklearn_transformer.transform(test_model_input)
    
    
test_transformed_recipe = pd.DataFrame(
            test_response.toarray(),
            columns=sklearn_transformer.get_feature_names_out(),
            index=test_model_input.index
    )

In [None]:
type(test_predictor)

In [None]:
test_transformed_recipe