# | default_exp core

In [None]:
# | hide
# Standard library
from datetime import datetime
from io import StringIO
from itertools import tee, islice, product
import os
import re

# Third-party
# from bertopic import BERTopic
# from bertopic.vectorizers import OnlineCountVectorizer
import dagshub
import dill as pickle
import dvc.api
# from hdbscan import HDBSCAN
import joblib
import nbdev
from nbdev.showdoc import *
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import stanza
from tqdm import tqdm
# from umap import UMAP

# Project-local
from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer
import src.dataframe_preprocessor as dfpp

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-04-03 18:58:24 INFO: Downloading default packages for language: en (English) ...
2024-04-03 18:58:24 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2024-04-03 18:58:27 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2024-04-03 18:58:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-04-03 18:58:28 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2024-04-03 18:58:28 INFO: Using device: cpu
2024-04-03 18:58:28 INFO: Loading: tokenize
2024-04-03 18:58:28 INFO: Loading: pos
2024-04-03 18:58:28 INFO: Loading: lemma
2024-04-03 18:58:29 INFO: Loading: constituency
2024-04-03 18:58:29 INFO: Loading: depparse
2024-04-03 18:58:29 INFO: Loading: sentiment
2024-04-03 18:58:29 INFO: Loading: ner
2024-04-03 18:58:30 INFO: Done loading processors!


In [None]:
!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128'

# | export

In [None]:
# | hide
# nbdev.nbdev_export()

### Data Preparation

In [None]:
# Fetch the English Stanza resources, then build the pipeline that the rest of
# the notebook refers to as `nlp`.
stanza.download('en')
nlp = stanza.Pipeline(
    'en',
    use_gpu=True,  # set to true when on cloud/not on streaming computer
    verbose=True,
    batch_size=100,
    depparse_batch_size=50,
    depparse_min_length_to_batch_separately=50,
)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-04-03 18:58:30 INFO: Downloading default packages for language: en (English) ...
2024-04-03 18:58:31 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2024-04-03 18:58:34 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2024-04-03 18:58:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-04-03 18:58:35 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2024-04-03 18:58:35 INFO: Using device: cuda
2024-04-03 18:58:35 INFO: Loading: tokenize
2024-04-03 18:58:38 INFO: Loading: pos
2024-04-03 18:58:38 INFO: Loading: lemma
2024-04-03 18:58:38 INFO: Loading: constituency
2024-04-03 18:58:39 INFO: Loading: depparse
2024-04-03 18:58:39 INFO: Loading: sentiment
2024-04-03 18:58:39 INFO: Loading: ner
2024-04-03 18:58:40 INFO: Done loading processors!


In [None]:
# Read the DVC-tracked recipe dump as text and parse it into a DataFrame.
data = dvc.api.read(
    path='../data/recipes-en-201706/epicurious-recipes_m2.json'
    , mode='r')
# Passing a literal JSON string to pd.read_json is deprecated in recent pandas;
# wrapping the text in a file-like object keeps the parse result the same
# without the warning.
raw_df = pd.read_json(StringIO(data))
print('\n')
print('--------------')
print('Raw Dataframe:', end='\n')
print(raw_df.head())
print(raw_df.shape)



--------------
Raw Dataframe:
                         id  \
0  54a2b6b019925f464b373351   
1  54a408a019925f464b3733bc   
2  54a408a26529d92b2c003631   
3  54a408a66529d92b2c003638   
4  54a408a719925f464b3733cc   

                                                 dek  \
0  How does fried chicken achieve No. 1 status? B...   
1                                Spinaci all'Ebraica   
2  This majestic, moist, and richly spiced honey ...   
3  The idea for this sandwich came to me when my ...   
4  In 1930, Simon Agranat, the chief justice of t...   

                                     hed                   pubDate  \
0            Pickle-Brined Fried Chicken  2014-08-19T04:00:00.000Z   
1                   Spinach Jewish Style  2008-09-09T04:00:00.000Z   
2                  New Year’s Honey Cake  2008-09-10T04:00:00.000Z   
3  The B.L.A.Bagel with Lox and Avocado  2008-09-08T04:00:00.000Z   
4        Shakshuka a la Doktor Shakshuka  2008-09-09T04:00:00.000Z   

                       

In [None]:
# Draw a fixed-seed sample of 100 recipes and split it 50/50 into train/test.
subset_df = raw_df.sample(random_state=45, n=100)
train_df, test_df = train_test_split(
    subset_df,
    random_state=45,
    test_size=0.5,
)

# to_nlp_df is the cleaned version of the training half.
to_nlp_df = dfpp.preprocess_dataframe(train_df)
print('\n', '--------------', 'Preprocessed Dataframe:', sep='\n')
print(to_nlp_df.head())
print(to_nlp_df.shape)



--------------
Preprocessed Dataframe:
                                                                        dek  \
id                                                                            
54a4270b19925f464b37c1dc                                                      
54a42cde19925f464b3809d2  Green chiles pickled in soy sauce and vinegar ...   
54a433036529d92b2c015de3  This soup features the flavors of India: aroma...   
54a451926529d92b2c01eda8                                                      
54a430876529d92b2c013e2b  Brown sugar and molasses are balanced by fresh...   

                                                                        hed  \
id                                                                            
54a4270b19925f464b37c1dc  Grilled Hearts of Romaine with Blue Cheese Vin...   
54a42cde19925f464b3809d2                              Soy-Pickled Jalapeños   
54a433036529d92b2c015de3  Curried Potato and Spinach Soup with Onion Sal...   
54a4519265

In [None]:
# Keyword arguments handed to the sklearn text vectorizer below: a custom
# analyzer producing 1- to 4-grams, and a minimum document frequency of 3.
sklearn_transformer_params = {
    'analyzer': CustomSKLearnAnalyzer().ngram_maker(
        min_ngram_length=1,
        max_ngram_length=4,
    ),
    'min_df': 3,
}
sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = to_nlp_df['ingredients_lemmafied']

# Fit and transform the sample, timestamping both ends of the call.
print(f"fit_transform start: {datetime.now()}")
response = sklearn_transformer.fit_transform(tqdm(model_input))
print(f"fit_transform end: {datetime.now()}")

# Densify the result into a frame: one row per recipe id, one column per n-gram.
feature_names = sklearn_transformer.get_feature_names_out()
transformed_recipe = pd.DataFrame(
    response.toarray(),
    index=model_input.index,
    columns=feature_names,
)

print(transformed_recipe.columns)

fit_transform start: 2024-04-03 19:00:04.028764


100%|██████████| 50/50 [00:00<00:00, 8700.07it/s]


fit_transform end: 2024-04-03 19:00:04.049144
Index(['English', 'English hothouse', 'English hothouse cucumber', 'available',
       'baby', 'baking', 'baking powder', 'bay', 'bay leave', 'beef',
       ...
       'white', 'white vinegar', 'white wine', 'white wine vinegar', 'whole',
       'wine', 'wine vinegar', 'yukon', 'yukon gold', 'yukon gold potato'],
      dtype='object', length=283)


In [None]:
# Display the vectorized sample (rows are recipe ids, columns are n-gram features).
transformed_recipe

Unnamed: 0_level_0,English,English hothouse,English hothouse cucumber,available,baby,baking,baking powder,bay,bay leave,beef,...,white,white vinegar,white wine,white wine vinegar,whole,wine,wine vinegar,yukon,yukon gold,yukon gold potato
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54a4270b19925f464b37c1dc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149995,0.149995,0.0,...,0.188155,0.0,0.28111,0.28111,0.0,0.241343,0.252641,0.0,0.0,0.0
54a42cde19925f464b3809d2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a433036529d92b2c015de3,0.0,0.0,0.0,0.0,0.089666,0.0,0.0,0.089666,0.089666,0.0,...,0.056239,0.0,0.0,0.0,0.0,0.0,0.0,0.089666,0.089666,0.089666
54a451926529d92b2c01eda8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.478039,0.0,0.0,0.0,0.0,0.0
54a430876529d92b2c013e2b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.075879,0.12098,0.0,0.0,0.0,0.097329,0.0,0.0,0.0,0.0
54a453df6529d92b2c020687,0.0,0.0,0.0,0.0,0.0,0.134496,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.115469,0.0,0.0,0.0,0.0,0.0
55b0e7116284773353bf4580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a42bab6529d92b2c00ffa7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54a4748f19925f464b399ef2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.097141,0.0,0.0,0.0,0.0,0.0,0.0,0.154879,0.154879,0.154879
54a4356a19925f464b3875bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.09846,0.0,0.147102,0.147102,0.126292,0.126292,0.132204,0.0,0.0,0.0


In [None]:
# List every feature name learned by the vectorizer on the sample.
transformed_recipe.columns.tolist()

['English',
 'English hothouse',
 'English hothouse cucumber',
 'available',
 'baby',
 'baking',
 'baking powder',
 'bay',
 'bay leave',
 'beef',
 'bell',
 'bell pepper',
 'black',
 'black pepper',
 'bread',
 'brk',
 'broth',
 'brown',
 'brown sugar',
 'bunch',
 'butter',
 'can',
 'carrot',
 'cayenne',
 'cayenne pepper',
 'celery',
 'cheese',
 'cherry',
 'chicken',
 'chile',
 'chop',
 'chop fresh',
 'chop fresh cilantro',
 'chop onion',
 'chop red',
 'cilantro',
 'cinnamon',
 'clove',
 'clove mince',
 'coarse',
 'coarse kosher',
 'coarse kosher salt',
 'core',
 'core cut',
 'core cut inch',
 'coriander',
 'cream',
 'crosswise',
 'cube',
 'cucumber',
 'cumin',
 'cup',
 'cup chop',
 'cup chop onion',
 'cup dry',
 'cup fresh',
 'cup grate',
 'cup olive',
 'cup olive oil',
 'cup pack',
 'cup purpose',
 'cup purpose flour',
 'cup slice',
 'cup sour',
 'cup sour cream',
 'cup stick',
 'cup sugar',
 'cup tablespoon',
 'cup water',
 'curry',
 'curry powder',
 'cut',
 'cut inch',
 'cut inch cub

In [None]:
# Preview the cleaned training frame that fed the vectorizer.
to_nlp_df.head()

Unnamed: 0_level_0,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,ingredients_lemmafied,cuisine_name,photo_filename,photo_credit,author_name,date_published,recipe_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
54a4270b19925f464b37c1dc,,Grilled Hearts of Romaine with Blue Cheese Vin...,3.64,"[1 1/2 cups white wine vinegar, 1/2 cup sugar,...",[Combine first 5 ingredients and 1/4 teaspoon ...,9,100,cup white wine vinegar brk cup sugar brk cup w...,Missing Cuisine,EP_12162015_placeholders_casual.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Kate Higgins,2010-12-16 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a42cde19925f464b3809d2,Green chiles pickled in soy sauce and vinegar ...,Soy-Pickled Jalapeños,3.43,"[3 large fresh jalapeños (4 inches), sliced 1/...",[Combine all ingredients in a small heavy sauc...,6,100,large fresh jalapeño inch slice inch thick brk...,Missing Cuisine,EP_12162015_placeholders_rustic.jpg,"Photo by Chelsea Kyle, Prop Styling by Anna St...",Lillian Chou,2009-02-19 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a433036529d92b2c015de3,This soup features the flavors of India: aroma...,Curried Potato and Spinach Soup with Onion Sal...,3.0,"[4 cups chopped red onions (about 2 large), 1 ...",[Combine first 5 ingredients in heavy medium s...,6,67,cup chop red onion large brk tablespoon sunflo...,Indian,234125.jpg,Brian Leatart,Peter Gordon,2006-03-07 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a451926529d92b2c01eda8,,Chicken Soup,3.19,"[1 pound chicken parts, 2 stalks celery, inclu...",[1. Pour 12 cups of cold water into a large st...,32,87,pound chicken part brk stalk celery include le...,Kosher,EP_12162015_placeholders_formal.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Sharon Lebewohl,2004-08-20 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a430876529d92b2c013e2b,Brown sugar and molasses are balanced by fresh...,Sweet-Hot Barbecue Sauce,0.0,"[2 tablespoons olive oil, 1 cup chopped onion,...",[Heat oil in large saucepan over medium-high h...,0,0,tablespoon olive oil brk cup chop onion brk cu...,Missing Cuisine,EP_12162015_placeholders_rustic.jpg,"Photo by Chelsea Kyle, Prop Styling by Anna St...",Suzanne Tracht,2007-12-03 20:11:11+00:00,https://www.epicurious.com/recipes/food/views/...


In [None]:
# Prepare whole dataframe for new processing
import mlflow
from mlflow.models import infer_signature
from src.custom_stanza_mlflow import CustomSKLearnWrapper

In [None]:
def get_experiment_id(name):
    """Return the MLflow experiment id for `name`, creating the experiment if absent.

    Args:
        name: Experiment name to look up on the active MLflow tracking server.

    Returns:
        The `experiment_id` of the existing experiment, or the id returned by
        `mlflow.create_experiment` when no experiment with that name exists.
    """
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    return mlflow.create_experiment(name)

In [None]:
# DagsHub connection settings. The `#@markdown` / `#@param` comments are Colab
# form annotations and are kept verbatim.
#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

# NOTE(review): a personal email address is hard-coded here and is later used as
# the prefix of the MLflow experiment name — confirm this is intended to be public.
#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

#@markdown Enter the repo name 
DAGSHUB_REPO_NAME = "MeaLeon"

# BRANCH is not referenced by any other cell visible in this notebook.
#@markdown Enter the name of the branch you are working on 
BRANCH = "NGRAM-1/try-llm-code-speedup"
# Initialise the DagsHub client for the owner/repo configured above.
dagshub.init(repo_name=DAGSHUB_REPO_NAME
             , repo_owner=DAGSHUB_USER_NAME)

## Starting DEV stage for TFIDF Encoded model

In [None]:
# Point MLflow at the DagsHub-hosted tracking server. The repo segment comes from
# DAGSHUB_REPO_NAME (same value dagshub.init uses) instead of a second hard-coded
# copy, so changing the repo name only has to happen in one place.
mlflow.set_tracking_uri(
    f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow'
)

# starter idea for making an experiment name can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/OHE_up_to_quadgrams"
mlflow_exp_id = get_experiment_id(experiment_name)

# Local directory that holds the files logged as model artifacts.
# model_directory = "/tmp/sklearn_model"
model_directory = "../models/sklearn_model"

# Paths of the artifacts associated with the saved custom pyfunc.
sklearn_model_path = model_directory + "/python_model.pkl"
sklearn_transformer_path = model_directory + "/sklearn_transformer.pkl"
transformed_recipes_path = model_directory + "/transformed_recipes.pkl"
combined_df_path = model_directory + "/combined_df.pkl"

# Artifact name -> local path mapping handed to mlflow.pyfunc when logging.
artifacts = {'sklearn_model': sklearn_model_path,
             'sklearn_transformer': sklearn_transformer_path,
             'transformed_recipes': transformed_recipes_path,
            #  'combined_data': combined_df_path
             }


In [None]:
# Clean the complete raw frame into whole_nlp_df, timestamping both ends of the
# call so the cost of preprocessing is visible in the output.
print(f"Preprocess start: {datetime.now()}")
whole_nlp_df = dfpp.preprocess_dataframe(raw_df)
print(f"Preprocess end: {datetime.now()}")

print('\n', '--------------', 'Preprocessed Dataframe: ', sep='\n')
print(whole_nlp_df.head())
print(whole_nlp_df.shape)

In [None]:
# Cache the preprocessed frame (index included) as gzip-compressed parquet.
whole_nlp_df.to_parquet(
    '../joblib/2024.03.19/pre_proc_df.parquet.gzip',
    index=True,
    compression='gzip',
)

In [None]:
# Reload the cached preprocessed frame from parquet and preview its first rows.
whole_nlp_df = pd.read_parquet('../joblib/2024.03.19/pre_proc_df.parquet.gzip')
whole_nlp_df.head()

Unnamed: 0_level_0,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,ingredients_lemmafied,cuisine_name,photo_filename,photo_credit,author_name,date_published,recipe_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
54a2b6b019925f464b373351,How does fried chicken achieve No. 1 status? B...,Pickle-Brined Fried Chicken,3.11,"[1 tablespoons yellow mustard seeds, 1 tablesp...",[Toast mustard and coriander seeds in a dry me...,7,100,tablespoon yellow mustard seed brk tablespoon ...,Missing Cuisine,51247610_fried-chicken_1x1.jpg,Michael Graydon and Nikole Herriott,Missing Author Name,2014-08-19 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a019925f464b3733bc,Spinaci all'Ebraica,Spinach Jewish Style,3.22,"[3 pounds small-leaved bulk spinach, Salt, 1/2...",[Remove the stems and roots from the spinach. ...,5,80,pound small leave bulk spinach brk salt brk cu...,Italian,EP_12162015_placeholders_rustic.jpg,"Photo by Chelsea Kyle, Prop Styling by Anna St...",Edda Servi Machlin,2008-09-09 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a26529d92b2c003631,"This majestic, moist, and richly spiced honey ...",New Year’s Honey Cake,3.62,"[3 1/2 cups all-purpose flour, 1 tablespoon ba...",[I like this cake best baked in a 9-inch angel...,105,88,cup purpose flour brk tablespoon baking powder...,Kosher,EP_09022015_honeycake-2.jpg,"Photo by Chelsea Kyle, Food Styling by Anna St...",Marcy Goldman,2008-09-10 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a66529d92b2c003638,The idea for this sandwich came to me when my ...,The B.L.A.Bagel with Lox and Avocado,4.0,"[1 small ripe avocado, preferably Hass (see No...","[A short time before serving, mash avocado and...",7,100,small ripe avocado hass see note brk teaspoon ...,Kosher,EP_12162015_placeholders_casual.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Faye Levy,2008-09-08 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a719925f464b3733cc,"In 1930, Simon Agranat, the chief justice of t...",Shakshuka a la Doktor Shakshuka,2.71,"[2 pounds fresh tomatoes, unpeeled and cut in ...","[1. Place the tomatoes, garlic, salt, paprika,...",7,83,pound fresh tomato unpeeled cut quarter ounce ...,Kosher,EP_12162015_placeholders_formal.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Joan Nathan,2008-09-09 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...


In [None]:
# Client bound to the DagsHub-hosted MLflow tracking server.
# NOTE(review): `mlflow_client` is not used by any cell visible in this notebook.
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/MeaLeon.mlflow')

# Keyword arguments for the sklearn vectorizer: custom 1- to 4-gram analyzer,
# minimum document frequency of 3, and binary (presence/absence) output.
sklearn_transformer_params = {    
    'analyzer': CustomSKLearnAnalyzer().ngram_maker(
        min_ngram_length=1,
        max_ngram_length=4,
        ),
    'min_df':3,
    'binary':True
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'OHE'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
# NOTE(review): this also logs the analyzer callable itself as a parameter value;
# consider logging the n-gram bounds instead if a readable value is wanted.
pipeline_params.update(sklearn_transformer_params)

with mlflow.start_run(experiment_id=mlflow_exp_id):    
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts
    # Will be useful in STAGING/Evaluation
    
    # LOG MODEL
    # The "one-hot" encoding here is a CountVectorizer configured with binary=True,
    # not sklearn's OneHotEncoder.
    sklearn_transformer = CountVectorizer(**sklearn_transformer_params)

    print('\n')
    print('-' * 80)
    print('sklearn fit transform on ingredients:', end='\n')

    model_input = whole_nlp_df['ingredients_lemmafied']

    print('\n')
    print('-' * 80)
    print('Input Data: ', end='\n')
    print(model_input)

    print('\n')
    print('-' * 80)
    print('Input Data Shape: ', end='\n')
    print(model_input.shape)

    print('\n')
    print('-' * 80)
    print('Random 3 Records from Input Data: ', end='\n')
    print(model_input.sample(3, random_state=200))

    # Do fit transform on data
    response = sklearn_transformer.fit_transform(tqdm(model_input)) 
    
    # Densify into a frame: one row per recipe id, one column per n-gram feature.
    transformed_recipe = pd.DataFrame(
            response.toarray(),
            columns=sklearn_transformer.get_feature_names_out(),
            index=model_input.index
    )

    signature = infer_signature(model_input=model_input,
                                model_output=transformed_recipe
                                )

    print('\n')
    print('-' * 80)
    print('Transformed Data:', end='\n')
    print(transformed_recipe.head())

    # Make sure the artifact directory exists before writing into it; on a fresh
    # checkout the open() calls below would otherwise raise FileNotFoundError.
    os.makedirs(model_directory, exist_ok=True)

    # `pickle` is dill in this notebook (see the import cell), which is what lets
    # the vectorizer holding the custom analyzer be serialised.
    with open(sklearn_transformer_path, "wb") as fo:
        pickle.dump(sklearn_transformer, fo)
    
    with open(transformed_recipes_path, "wb") as fo:
        pickle.dump(transformed_recipe, fo)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # NOTE(review): `artifacts` also lists python_model.pkl, which this cell does
    # not write — confirm that file already exists in model_directory.
    model_info = mlflow.pyfunc.log_model( 
        code_path=["../src/"],
        python_model=CustomSKLearnWrapper(),
        # The Series is indexed by recipe id strings, so `[0]` only worked through
        # pandas' deprecated positional fallback; `.iloc[0]` asks for the first
        # row explicitly.
        input_example=whole_nlp_df['ingredients_lemmafied'].iloc[0],
        signature=signature,        
        artifact_path="sklearn_model",
        artifacts=artifacts
        ) 
    



--------------------------------------------------------------------------------
sklearn fit transform on ingredients:


--------------------------------------------------------------------------------
Input Data: 
id
54a2b6b019925f464b373351    tablespoon yellow mustard seed brk tablespoon ...
54a408a019925f464b3733bc    pound small leave bulk spinach brk salt brk cu...
54a408a26529d92b2c003631    cup purpose flour brk tablespoon baking powder...
54a408a66529d92b2c003638    small ripe avocado hass see note brk teaspoon ...
54a408a719925f464b3733cc    pound fresh tomato unpeeled cut quarter ounce ...
                                                  ...                        
59541a31bff3052847ae2107    tablespoon unsalt butter room temperature brk ...
5954233ad52ca90dc28200e7    tablespoon stick salt butter room temperature ...
595424c2109c972493636f83    tablespoon unsalted butter more greasing pan b...
5956638625dc3d1d829b7166    coarse salt brk lime wedge brk ounce tomato ju...


100%|██████████| 34756/34756 [00:02<00:00, 11734.80it/s]
  outputs = _infer_schema(model_output) if model_output is not None else None




--------------------------------------------------------------------------------
Transformed Data:
                          100g  125g  13x9x2  150g  1pound  1tablespoon  \
id                                                                        
54a2b6b019925f464b373351     0     0       0     0       0            0   
54a408a019925f464b3733bc     0     0       0     0       0            0   
54a408a26529d92b2c003631     0     0       0     0       0            0   
54a408a66529d92b2c003638     0     0       0     0       0            0   
54a408a719925f464b3733cc     0     0       0     0       0            0   

                          1teaspoon  200g  250g  2cup  ...  árbol divide  \
id                                                     ...                 
54a2b6b019925f464b373351          0     0     0     0  ...             0   
54a408a019925f464b3733bc          0     0     0     0  ...             0   
54a408a26529d92b2c003631          0     0     0     0  ...           

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



MlflowException: API request to https://dagshub.com/AaronWChen/MeaLeon.mlflow/api/2.0/mlflow-artifacts/artifacts/ad83ec0a104a44b5a16da48605603245/2e72322335494f84af9ff7e7e44c3ff9/artifacts/sklearn_model/artifacts/transformed_recipes.pkl failed with exception HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /AaronWChen/MeaLeon.mlflow/api/2.0/mlflow-artifacts/artifacts/ad83ec0a104a44b5a16da48605603245/2e72322335494f84af9ff7e7e44c3ff9/artifacts/sklearn_model/artifacts/transformed_recipes.pkl (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2396)')))

In [None]:
# whole_nlp_df is the cleaned version of the complete raw frame.
whole_nlp_df = dfpp.preprocess_dataframe(raw_df)
print('\n')
print('--------------')
print('Preprocessed Dataframe:', end='\n')
print(whole_nlp_df.head())
print(whole_nlp_df.shape)

# Keyword arguments for the sklearn vectorizer: custom 1- to 4-gram analyzer and
# a minimum document frequency of 3.
sklearn_transformer_params = {    
    'analyzer': CustomSKLearnAnalyzer().ngram_maker(
        min_ngram_length=1,
        max_ngram_length=4,
        ),
    'min_df':3,
}

sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

model_input = whole_nlp_df['ingredients_lemmafied']

# Do fit transform on data
print("fit_transform start: " + str(datetime.now()))
response = sklearn_transformer.fit_transform(tqdm(model_input)) 
print("fit_transform end: " + str(datetime.now()))

# Densify into a frame: one row per recipe id, one column per n-gram feature.
transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index
)

# Place the TF-IDF features side by side with the recipe metadata.
combined_df = pd.concat([transformed_recipe, whole_nlp_df], axis=1)

# joblib.dump() was previously called with no arguments, which raises TypeError
# and leaves an empty file behind; pass the object and the open file handle.
with open("../joblib/2024.03.19/combined_df.joblib", 'wb') as fo:
    joblib.dump(combined_df, fo)


In [None]:
# Show the matrix returned by the most recent fit_transform call.
response

In [None]:
# Load the logged model back as a pyfunc via its URI. `model_info` only exists
# if the MLflow logging cell ran to completion in this kernel.
test_predictor = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

In [None]:
# Clean the held-out half with the same preprocessing applied to the training half.
pre_proc_test_df = dfpp.preprocess_dataframe(test_df)

print('\n', '--------------', 'Preprocessed Dataframe: ', sep='\n')
print(pre_proc_test_df.head())
print(pre_proc_test_df.shape)

# NOTE(review): the vectorizer and the model signature were built from the
# 'ingredients_lemmafied' column, but the raw 'ingredients' column is selected
# here — confirm which one the predictor is expected to receive.
test_model_input = pre_proc_test_df['ingredients']

In [None]:
# Display the column selected as test input.
test_model_input

In [None]:
# Number of test records.
test_model_input.shape

In [None]:
# Underlying array of the test input, without the recipe-id index.
test_model_input.values

In [None]:
# Inspect the signature stored with the logged model, rendered as a dict.
model_info.signature.to_dict()

In [None]:
# Run the reloaded pyfunc model on the test input and display the result.
test_predictor.predict(test_model_input)

In [None]:
# Echo the test input, its shape, and a fixed-seed sample of three records.
print('\n', '-' * 80, 'Input Data: ', sep='\n')
print(test_model_input)

print('\n', '-' * 80, 'Input Data Shape: ', sep='\n')
print(test_model_input.shape)

print('\n', '-' * 80, 'Random 3 Records from Input Data: ', sep='\n')
print(test_model_input.sample(3, random_state=200))

# Apply the already-fitted vectorizer (transform only, no refit) to the test input.
test_response = sklearn_transformer.transform(test_model_input)

# Densify into a frame keyed by the test index, one column per learned feature.
test_feature_names = sklearn_transformer.get_feature_names_out()
test_transformed_recipe = pd.DataFrame(
    test_response.toarray(),
    index=test_model_input.index,
    columns=test_feature_names,
)

In [None]:
# Check which class mlflow.pyfunc.load_model returned.
type(test_predictor)

In [None]:
# Display the vectorized test records.
test_transformed_recipe