# | default_exp core

In [None]:
# | hide
# from bertopic import BERTopic
# from bertopic.vectorizers import OnlineCountVectorizer
import dagshub
from datetime import datetime
import dill as pickle
import dvc.api
# from hdbscan import HDBSCAN
from itertools import tee, islice, product
import joblib
import nbdev
from nbdev.showdoc import *
import os
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import (
    CountVectorizer
    , TfidfTransformer
    , TfidfVectorizer
    , 
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from src.custom_sklearn_text_transformer_mlflow import CustomSKLearnAnalyzer
import src.dataframe_preprocessor as dfpp
import stanza
from tqdm import tqdm
# from umap import UMAP

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-06-12 17:44:28 INFO: Downloading default packages for language: en (English) ...
2024-06-12 17:44:29 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2024-06-12 17:44:32 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2024-06-12 17:44:32 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-06-12 17:44:33 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2024-06-12 17:44:33 INFO: Using device: cpu
2024-06-12 17:44:33 INFO: Loading: tokenize
2024-06-12 17:44:33 INFO: Loading: pos
2024-06-12 17:44:33 INFO: Loading: lemma
2024-06-12 17:44:34 INFO: Loading: constituency
2024-06-12 17:44:34 INFO: Loading: depparse
2024-06-12 17:44:34 INFO: Loading: sentiment
2024-06-12 17:44:34 INFO: Loading: ner
2024-06-12 17:44:35 INFO: Done loading processors!


In [None]:
# Configure the PyTorch CUDA caching allocator for this kernel process.
# `!export ...` ran in a throwaway subshell, so the variable never reached the
# Python process; writing to os.environ sets it where PyTorch can read it.
# NOTE(review): this presumably has to happen before the first CUDA allocation
# in the session to take effect -- confirm nothing above touches the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# | export

In [None]:
# | hide
# nbdev.nbdev_export()

### Data Preparation

In [None]:
# Make sure the English stanza resources are available locally, then build the
# pipeline object (`nlp`) from them.
stanza.download('en')

nlp = stanza.Pipeline(
    'en',
    verbose=True,
    use_gpu=True,  # set to true when on cloud/not on streaming computer
    batch_size=100,
    depparse_batch_size=50,
    depparse_min_length_to_batch_separately=50,
)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-06-12 17:44:35 INFO: Downloading default packages for language: en (English) ...
2024-06-12 17:44:36 INFO: File exists: /home/awchen/stanza_resources/en/default.zip
2024-06-12 17:44:39 INFO: Finished downloading models and saved to /home/awchen/stanza_resources.
2024-06-12 17:44:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2024-06-12 17:44:40 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2024-06-12 17:44:40 INFO: Using device: cuda
2024-06-12 17:44:40 INFO: Loading: tokenize
2024-06-12 17:44:44 INFO: Loading: pos
2024-06-12 17:44:44 INFO: Loading: lemma
2024-06-12 17:44:44 INFO: Loading: constituency
2024-06-12 17:44:45 INFO: Loading: depparse
2024-06-12 17:44:45 INFO: Loading: sentiment
2024-06-12 17:44:45 INFO: Loading: ner
2024-06-12 17:44:46 INFO: Done loading processors!


In [None]:
# Prepare whole dataframe for new processing
import mlflow
from mlflow.models import infer_signature
from src.custom_stanza_mlflow import CustomSKLearnWrapper

In [None]:
def get_experiment_id(name):
    """Return the MLflow experiment ID registered under ``name``.

    Looks the experiment up by name first; when no such experiment exists,
    a new one is created with that name and its ID is returned instead.

    Args:
        name: Experiment name to look up (or to create).

    Returns:
        The ``experiment_id`` of the existing experiment, or the value
        returned by ``mlflow.create_experiment`` for a newly created one.
    """
    found = mlflow.get_experiment_by_name(name)
    if found is not None:
        return found.experiment_id
    # Nothing registered under this name yet: create it and hand back its ID.
    return mlflow.create_experiment(name)

In [None]:
# DagsHub connection settings. The `#@markdown` / `#@param` annotations are kept
# verbatim so the values remain editable as form fields where those are supported.

#@markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen" #@param {type:"string"}

#@markdown Enter the email for your DAGsHub account:
DAGSHUB_EMAIL = "awc33@cornell.edu" #@param {type:"string"}

#@markdown Enter the repo name 
DAGSHUB_REPO_NAME = "MeaLeon"

#@markdown Enter the name of the branch you are working on 
BRANCH = "NGRAM-2/trying-sklearn-object-upload"

# Initialise the DagsHub client for the owner/repo configured above.
dagshub.init(
    repo_owner=DAGSHUB_USER_NAME,
    repo_name=DAGSHUB_REPO_NAME,
)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=0bcb5e0b-d1d1-4e4c-a5af-e4c5220ed6ac&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=0d2c3668329e1c690d7f0041d56f955cb1ed6ca7ad0549863e844e052d0ed92b




Output()

## Starting DEV stage for TFIDF Encoded model

In [None]:
# Point MLflow at the DagsHub-hosted tracking server. The URI is built from the
# same owner/repo constants passed to dagshub.init(), so changing the repo name
# in one place cannot leave the tracking URI pointing at a different repository.
mlflow.set_tracking_uri(
    f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow'
)

# starter idea for making an experiment name can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/TFIDF_up_to_quadgrams_small_sample_upload_test"
# Resolves the name to an experiment ID, creating the experiment if it is missing.
mlflow_exp_id = get_experiment_id(experiment_name)

# define model location (an alternative used previously: "/tmp/sklearn_model")
model_directory = "../models/sklearn_model"

# Define the required artifacts associated with the saved custom pyfunc.
# Every path lives directly under `model_directory`.
sklearn_model_path = model_directory + "/python_model.pkl"
sklearn_transformer_path = model_directory + "/sklearn_transformer.pkl"
transformed_recipes_path = model_directory + "/transformed_recipes.pkl"
transformed_recipes_parquet_path = model_directory + "/transformed_recipes.parquet"
combined_df_path = model_directory + "/combined_df.pkl"

# Mapping of artifact name -> local file path for the pyfunc model.
artifacts = {'sklearn_model': sklearn_model_path,
             'sklearn_transformer': sklearn_transformer_path,
            #  'transformed_recipes': transformed_recipes_path,
             'combined_data': combined_df_path
             }


In [None]:
# Load the pre-processed recipe frame from its parquet snapshot and preview
# the first five rows.
whole_nlp_df = pd.read_parquet(
    path='../joblib/2024.03.19/pre_proc_df.parquet.gzip'
)
whole_nlp_df.head(5)

Unnamed: 0_level_0,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,ingredients_lemmafied,cuisine_name,photo_filename,photo_credit,author_name,date_published,recipe_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
54a2b6b019925f464b373351,How does fried chicken achieve No. 1 status? B...,Pickle-Brined Fried Chicken,3.11,"[1 tablespoons yellow mustard seeds, 1 tablesp...",[Toast mustard and coriander seeds in a dry me...,7,100,tablespoon yellow mustard seed brk tablespoon ...,Missing Cuisine,51247610_fried-chicken_1x1.jpg,Michael Graydon and Nikole Herriott,Missing Author Name,2014-08-19 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a019925f464b3733bc,Spinaci all'Ebraica,Spinach Jewish Style,3.22,"[3 pounds small-leaved bulk spinach, Salt, 1/2...",[Remove the stems and roots from the spinach. ...,5,80,pound small leave bulk spinach brk salt brk cu...,Italian,EP_12162015_placeholders_rustic.jpg,"Photo by Chelsea Kyle, Prop Styling by Anna St...",Edda Servi Machlin,2008-09-09 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a26529d92b2c003631,"This majestic, moist, and richly spiced honey ...",New Year’s Honey Cake,3.62,"[3 1/2 cups all-purpose flour, 1 tablespoon ba...",[I like this cake best baked in a 9-inch angel...,105,88,cup purpose flour brk tablespoon baking powder...,Kosher,EP_09022015_honeycake-2.jpg,"Photo by Chelsea Kyle, Food Styling by Anna St...",Marcy Goldman,2008-09-10 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a66529d92b2c003638,The idea for this sandwich came to me when my ...,The B.L.A.Bagel with Lox and Avocado,4.0,"[1 small ripe avocado, preferably Hass (see No...","[A short time before serving, mash avocado and...",7,100,small ripe avocado hass see note brk teaspoon ...,Kosher,EP_12162015_placeholders_casual.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Faye Levy,2008-09-08 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a719925f464b3733cc,"In 1930, Simon Agranat, the chief justice of t...",Shakshuka a la Doktor Shakshuka,2.71,"[2 pounds fresh tomatoes, unpeeled and cut in ...","[1. Place the tomatoes, garlic, salt, paprika,...",7,83,pound fresh tomato unpeeled cut quarter ounce ...,Kosher,EP_12162015_placeholders_formal.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Joan Nathan,2008-09-09 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...


In [None]:
def _print_section(title):
    """Print a blank gap, an 80-dash rule, and then ``title`` as a section header."""
    print('\n')
    print('-' * 80)
    print(title)


# load from MLflow
# The client URI is built from the same owner/repo constants given to dagshub.init().
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f'https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow')

# cv_params are parameters for the sklearn CountVectorizer or TFIDFVectorizer
sklearn_transformer_params = {
    'analyzer': CustomSKLearnAnalyzer().ngram_maker(
        min_ngram_length=1,
        max_ngram_length=4,
        ),
    'min_df':3,
    'binary':False
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {
    'stanza_model': 'en',
    'sklearn-transformer': 'TFIDF'
}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(sklearn_transformer_params)

# The pickle/parquet files written below all live under model_directory; create
# it up front so a fresh checkout does not fail on the first open(..., "wb").
os.makedirs(model_directory, exist_ok=True)

with mlflow.start_run(experiment_id=mlflow_exp_id):
    # LOG PARAMETERS
    # NOTE(review): 'analyzer' is a Python object rather than a plain value;
    # confirm how it is rendered in MLflow Tracking.
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts
    # Will be useful in STAGING/Evaluation

    # LOG MODEL
    # Instantiate sklearn TFIDFVectorizer
    sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

    _print_section('sklearn fit transform on ingredients:')

    model_input = whole_nlp_df['ingredients_lemmafied']

    _print_section('Input Data: ')
    print(model_input)

    _print_section('Input Data Shape: ')
    print(model_input.shape)

    # Fixed seed so the same three records are drawn on every run.
    random_sample = model_input.sample(3, random_state=200)

    _print_section('Random 3 Records from Input Data: ')
    print(random_sample)

    # Do fit transform on data (tqdm only wraps the input to show progress)
    response = sklearn_transformer.fit_transform(tqdm(model_input))

    # Dense frame: one row per input record, one column per learned feature.
    # NOTE(review): .toarray() densifies the whole matrix; watch memory on larger inputs.
    transformed_recipe = pd.DataFrame(
            response.toarray(),
            columns=sklearn_transformer.get_feature_names_out(),
            index=model_input.index
    )

    signature = infer_signature(model_input=model_input,
                                model_output=transformed_recipe
                                )

    _print_section('Transformed Data:')
    print(transformed_recipe.head())

    # Inner join keeps only the rows that belong to the 3-record random sample.
    combined_df = transformed_recipe.join(random_sample, how='inner')

    _print_section('Random Sample of Combined Data:')
    print(combined_df.head())

    # Persist the fitted transformer and the derived frames to the local paths.
    with open(sklearn_transformer_path, "wb") as fo:
        pickle.dump(sklearn_transformer, fo)

    with open(transformed_recipes_path, "wb") as fo:
        pickle.dump(transformed_recipe, fo)

    transformed_recipe.to_parquet(path=transformed_recipes_parquet_path)

    with open(combined_df_path, 'wb') as fo:
        pickle.dump(combined_df, fo)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # NOTE(review): `artifacts` also references sklearn_model_path, which this
    # cell does not write -- confirm that file exists before logging.
    model_info = mlflow.pyfunc.log_model(
        code_path=["../src/"],
        python_model=CustomSKLearnWrapper(),
        # .iloc[0] picks the first record by position. The Series is indexed by
        # recipe id strings, so `series[0]` relied on the deprecated fallback
        # that treats an integer key as a position.
        input_example=model_input.iloc[0],
        signature=signature,
        artifact_path="sklearn_model",
        artifacts=artifacts
        )
    



--------------------------------------------------------------------------------
sklearn fit transform on ingredients:


--------------------------------------------------------------------------------
Input Data: 
id
54a2b6b019925f464b373351    tablespoon yellow mustard seed brk tablespoon ...
54a408a019925f464b3733bc    pound small leave bulk spinach brk salt brk cu...
54a408a26529d92b2c003631    cup purpose flour brk tablespoon baking powder...
54a408a66529d92b2c003638    small ripe avocado hass see note brk teaspoon ...
54a408a719925f464b3733cc    pound fresh tomato unpeeled cut quarter ounce ...
                                                  ...                        
59541a31bff3052847ae2107    tablespoon unsalt butter room temperature brk ...
5954233ad52ca90dc28200e7    tablespoon stick salt butter room temperature ...
595424c2109c972493636f83    tablespoon unsalted butter more greasing pan b...
5956638625dc3d1d829b7166    coarse salt brk lime wedge brk ounce tomato ju...


100%|██████████| 34756/34756 [00:03<00:00, 10809.07it/s]




--------------------------------------------------------------------------------
Transformed Data:
                          100g  125g  13x9x2  150g  1pound  1tablespoon  \
id                                                                        
54a2b6b019925f464b373351   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a019925f464b3733bc   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a26529d92b2c003631   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a66529d92b2c003638   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a719925f464b3733cc   0.0   0.0     0.0   0.0     0.0          0.0   

                          1teaspoon  200g  250g  2cup  ...  árbol divide  \
id                                                     ...                 
54a2b6b019925f464b373351        0.0   0.0   0.0   0.0  ...           0.0   
54a408a019925f464b3733bc        0.0   0.0   0.0   0.0  ...           0.0   
54a408a26529d92b2c003631        0.0   0.0   0.0   0.0  ...           

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

