# | default_exp testing

In [None]:
# | hide
import dagshub
import dill as pickle
import joblib
import mlflow
from mlflow.models import infer_signature
import nbdev  # ; nbdev.nbdev_export()
from nbdev.showdoc import *
import pandas as pd
import re
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from src.backend.embedding_creation.apply_stanza import CustomSKLearnAnalyzer
from src.backend.embedding_creation.sklearn_transformer_as_mlflow_model import (
    CustomSKLearnWrapper,
)
import src.backend.raw_data_cleaning.raw_data_preprocessor as rdpp
import stanza
from tqdm import tqdm

## Need to call DAGsHub to keep track of what we're doing

In [None]:
# DagsHub connection settings for this notebook. The `# @markdown` / `# @param`
# comments are Colab-style form annotations and have no effect on execution.

# @markdown Enter the username of your DAGsHub account:
DAGSHUB_USER_NAME = "AaronWChen"  # @param {type:"string"}

# @markdown Enter the email for your DAGsHub account:
# Also used further down as the prefix of the MLflow experiment name.
DAGSHUB_EMAIL = "awc33@cornell.edu"  # @param {type:"string"}

# @markdown Enter the repo name
DAGSHUB_REPO_NAME = "MeaLeon"

# @markdown Enter the name of the branch you are working on
# NOTE(review): BRANCH is not referenced anywhere else in the visible notebook.
BRANCH = "init_mealeon_to_notebook_refactor"
# Initialise the dagshub client for <repo_owner>/<repo_name>.
# NOTE(review): presumably this also configures MLflow access to the DagsHub
# tracking server - confirm against the dagshub client documentation.
dagshub.init(repo_name=DAGSHUB_REPO_NAME, repo_owner=DAGSHUB_USER_NAME)

## Things I need to do

1. app.py calls find_similar_dishes, returns a render template
2. find_similar_dishes needs to call the recipe database, the sklearn model, the model-transformed database (ie, TFIDF word matrix), and the query (which needs to be transformed)
   1. Little confused by order; why would i need the original database if i can just call the model/vector-transformed version?
      1. Original database has things like url and ID, which could be needed later
      2. ~~Future vector data can use the same recipe_id unique key, but only have the ingredient vectors. Use unique key to join original...~~
      3. Wait, need cuisine filter to improve search results...so vector database should have cuisine and recipe_id
      4. From that, can call back to original database to get URLs and other metadata
         1. SQLModel query to join
   2. Sklearn model (really any model that transforms the query) needs to be loaded from MLflow
      1. Model will be used to transform query for similarity analysis
      2. MLflow load
   3. Vector database is currently loaded from a JSON file, but should switch to Vespa
      1. Wouldn't this need to be linked to the MLflow Model? DVC + Vespa?
      2. Mlflow or DVC load?
   4. Original recipe database might also be DVC?
   5. 
3. original query should be formatted and stored into recipe database (CRUD)
4. the formatted query is sent to the Edamam API
5. edamam return is currently model-transformed then cuisine filtered
   1. Swap this order so we don't have to process as much text
6. Vector comparison against filtered data

### Data Preparation

This part can be the DVC import for our data

Currently, raw/processed data can be imported from JSON; we need to consider how to access the data through something like SQL and log a snapshot of this data (and its metadata?) with DVC

- Can i reuse some parts of GitHub Actions?

- DVC can handle data files fine, but SQL pulls are currently experimentally supported
- using dvc import-db https://dvc.org/doc/command-reference/import-db

- DVC with generative AI (might be relevant to vector databases): https://youtu.be/aqMXEvWTuVY?si=2lMKrofl9s10BXVx

#### Let's start with local data files

Via automated ETL, DVC could log the raw data, perform the text processing if not an embedding, add the pre processed data back to DVC, then start MLflow with embedding conversion 

In [None]:
# raw data
# Register the raw Epicurious JSON dump with DVC (IPython shell escape), then
# load that same file into a DataFrame for the cleaning step.

!dvc add "../data/raw/201706-epicurious-recipes-en.json"
raw_df = pd.read_json("../data/raw/201706-epicurious-recipes-en.json")

[?25l[32m⠋[0m Checking graph                                                 
Adding...                                                                       
![A
  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0[A
                                                                                [A
![A
  0%|          |Checking out ../data/raw/201706-epicur0/? [00:00<?,    ?files/s][A
  0%|          |Checking out ../data/raw/201706-epicur0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  4.23file/s][A

To track the changes with git, run:

	git add ../data.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [None]:
# ETL step (currently data cleaning/prep only): the raw dataframe is handed to
# the project's raw_data_preprocessor module, and the cleaned result is
# persisted as a gzip-compressed parquet file.
cleaned_df = rdpp.preprocess_dataframe(raw_df)
cleaned_df.to_parquet(
    path="../data/processed/cleaned_df.parquet.gzip",
    compression="gzip",
)

In [None]:
# Add the cleaned dataframe (written to parquet by the ETL cell) to DVC so the
# processed snapshot is versioned alongside the raw data.
!dvc add "../data/processed/cleaned_df.parquet.gzip"

[?25l[32m⠋[0m Checking graph                                                 
Adding...                                                                       
![A
  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0[A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out ../data/processed/cleaned0/? [00:00<?,    ?files/s][A
  0%|          |Checking out ../data/processed/cleaned0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 17.84file/s][A

To track the changes with git, run:

	git add ../data.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m

Need to commit DVC/data changes to git, does that need to be done in this cell?
- based off of the nbdev tools currently (where it essentially runs the whole notebook), this may not be a good idea
- when working out of a notebook for testing, dvc maybe can pull the data, but we should not be doing the actual processing here

In the future, can/should the data cleaning be done in dbt?

- no, dbt is more about analytics than data cleaning, it seems

- if text processing needed regularly, might have to put in Airflow

---

Now that we have converted the raw dataframe to a cleaner form with lemmatization (if needed/preferred), we can move on to the embedding transformation. Currently, this is another ETL step done with `nlp_processor`, but performed with an MLflow model, and this embedding-transformed/vectorized data should then be added back to DVC.

---

In the future, we can take the embeddings and convert them to PyTorch tensors/datasets, which is not something we can do with the original raw text

In [None]:
# | export
# this is a custom function to be used with MLflow to get or create experiments (is from the MLflow team)
def get_mlflow_experiment_id(name):
    """Return the MLflow experiment ID for `name`, creating the experiment if needed.

    Args:
        name: Name of the experiment to look up through the module-level
            `mlflow` API.

    Returns:
        The ID of the existing experiment called `name`, or the ID returned by
        `mlflow.create_experiment` when no such experiment exists yet.
    """
    # Imported locally because this is the only cell marked `# | export`; the
    # notebook's import cell is marked `# | hide`, so a module generated from
    # this notebook would otherwise reference `mlflow` without importing it.
    import mlflow

    experiment = mlflow.get_experiment_by_name(name)
    if experiment is not None:
        return experiment.experiment_id
    return mlflow.create_experiment(name)

## Starting DEV stage for TFIDF Encoded model

In [None]:
# Point MLflow at the DagsHub-hosted tracking server. The repo name comes from
# DAGSHUB_REPO_NAME (the same constant passed to dagshub.init) instead of a
# second hard-coded copy, so the two cannot drift apart.
mlflow.set_tracking_uri(
    f"https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow"
)

# starter idea for making an experiment name, can be the git branch, but need more specificity
experiment_name = f"{DAGSHUB_EMAIL}/DVC-MLflow-integration-test"
# Look the experiment up by name, creating it on first use.
mlflow_exp_id = get_mlflow_experiment_id(experiment_name)

# Processed-data outputs: these parquet files are written by the training run
# and afterwards added to DVC.
processed_data_base = "../data/processed"
transformed_recipes_parquet_path = (
    processed_data_base + "/transformed_recipes.parquet.gzip"
)
combined_df_path = processed_data_base + "/combined_df.parquet.gzip"


# Local directory holding the files that back the custom pyfunc model
model_directory = "../models/sklearn_model"

# Define the required artifacts associated with the saved custom pyfunc
sklearn_model_path = model_directory + "/python_model.pkl"
sklearn_transformer_path = model_directory + "/sklearn_transformer.pkl"
# transformed_recipes_path = model_directory + "/transformed_recipes.pkl"
combined_df_sample_path = model_directory + "/combined_df_sample.parquet"

# Mapping of artifact name -> local path handed to mlflow.pyfunc.log_model.
# The full transformed/combined datasets are deliberately left out (commented)
# and only a small sample is attached to the model.
artifacts = {
    "sklearn_model": sklearn_model_path,
    "sklearn_transformer": sklearn_transformer_path,
    #  'transformed_recipes': transformed_recipes_path,
    #  'combined_data': combined_df_path,
    "combined_data_sample": combined_df_sample_path,
}

In [None]:
# Prepare whole dataframe for new processing
# Sync the DVC-tracked data into the workspace before reading it.
# NOTE(review): the recorded output of this cell reports the raw JSON and
# cleaned_df.parquet.gzip as deleted ("2 files deleted and 1 file modified") -
# confirm the pull leaves the files needed by the next cell in place.
!dvc pull

  0% Checkout|                                      |0/27 [00:00<?,     ?file/s]
![A
Building data objects from ../joblib/2022.08.23       |0.00 [00:00,      ?obj/s][A
                                                                                [A
![A
Building data objects from ../data                    |0.00 [00:00,      ?obj/s][A
[33mM[0m       ..[35m/data/[0m                                              [A
[31mD[0m       data/raw/[1;36m201706[0m-epicurious-recipes-en.json
[31mD[0m       data/processed/cleaned_df.parquet.gzip
2 files deleted and 1 file modified
[0m

In [None]:
# this part can be done after a dvc pull
# Load the cleaned recipes written by the ETL step. Per the recorded preview,
# rows are indexed by recipe `id`, and `ingredients_lemmafied` holds the text
# that is later fed to the vectorizer.
whole_nlp_df = pd.read_parquet("../data/processed/cleaned_df.parquet.gzip")
# Trailing expression: renders the first rows as the cell output.
whole_nlp_df.head()

Unnamed: 0_level_0,dek,hed,aggregateRating,ingredients,prepSteps,reviewsCount,willMakeAgainPct,ingredients_lemmafied,cuisine_name,photo_filename,photo_credit,author_name,date_published,recipe_url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
54a2b6b019925f464b373351,How does fried chicken achieve No. 1 status? B...,Pickle-Brined Fried Chicken,3.11,"[1 tablespoons yellow mustard seeds, 1 tablesp...",[Toast mustard and coriander seeds in a dry me...,7,100,tablespoon yellow mustard seed brk tablespoon ...,Missing Cuisine,51247610_fried-chicken_1x1.jpg,Michael Graydon and Nikole Herriott,Missing Author Name,2014-08-19 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a019925f464b3733bc,Spinaci all'Ebraica,Spinach Jewish Style,3.22,"[3 pounds small-leaved bulk spinach, Salt, 1/2...",[Remove the stems and roots from the spinach. ...,5,80,pound small leave bulk spinach brk salt brk cu...,Italian,EP_12162015_placeholders_rustic.jpg,"Photo by Chelsea Kyle, Prop Styling by Anna St...",Edda Servi Machlin,2008-09-09 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a26529d92b2c003631,"This majestic, moist, and richly spiced honey ...",New Year’s Honey Cake,3.62,"[3 1/2 cups all-purpose flour, 1 tablespoon ba...",[I like this cake best baked in a 9-inch angel...,105,88,cup purpose flour brk tablespoon baking powder...,Kosher,EP_09022015_honeycake-2.jpg,"Photo by Chelsea Kyle, Food Styling by Anna St...",Marcy Goldman,2008-09-10 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a66529d92b2c003638,The idea for this sandwich came to me when my ...,The B.L.A.Bagel with Lox and Avocado,4.0,"[1 small ripe avocado, preferably Hass (see No...","[A short time before serving, mash avocado and...",7,100,small ripe avocado hass see note brk teaspoon ...,Kosher,EP_12162015_placeholders_casual.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Faye Levy,2008-09-08 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...
54a408a719925f464b3733cc,"In 1930, Simon Agranat, the chief justice of t...",Shakshuka a la Doktor Shakshuka,2.71,"[2 pounds fresh tomatoes, unpeeled and cut in ...","[1. Place the tomatoes, garlic, salt, paprika,...",7,83,pound fresh tomato unpeeled cut quarter ounce ...,Kosher,EP_12162015_placeholders_formal.jpg,"Photo by Chelsea Kyle, Prop Styling by Rhoda B...",Joan Nathan,2008-09-09 04:00:00+00:00,https://www.epicurious.com/recipes/food/views/...


In [None]:
# Fit a TF-IDF vectorizer on the lemmatized ingredients and log the run to
# MLflow: parameters, the fitted transformer, a data sample, and a custom
# pyfunc model wrapping the transformer.

# Client bound to the same DagsHub tracking server configured earlier; the repo
# name comes from DAGSHUB_REPO_NAME rather than a second hard-coded copy.
# NOTE(review): mlflow_client is not referenced again in the visible cells.
mlflow_client = mlflow.tracking.MlflowClient(
    tracking_uri=f"https://dagshub.com/{DAGSHUB_USER_NAME}/{DAGSHUB_REPO_NAME}.mlflow"
)


def _print_section(title):
    """Print a blank-line spacer, an 80-dash rule, then `title`."""
    print("\n")
    print("-" * 80)
    print(title)


# sklearn_transformer_params are passed verbatim to the sklearn TfidfVectorizer
sklearn_transformer_params = {
    "analyzer": CustomSKLearnAnalyzer().ngram_maker(
        min_ngram_length=1,
        max_ngram_length=4,
    ),
    "min_df": 3,
    "binary": False,
}

# pipeline_params are parameters that will be logged in MLFlow and are a superset of library parameters
pipeline_params = {"stanza_model": "en", "sklearn-transformer": "TFIDF"}

# update the pipeline parameters with the library-specific ones so that they show up in MLflow Tracking
pipeline_params.update(sklearn_transformer_params)

with mlflow.start_run(experiment_id=mlflow_exp_id):
    # LOG PARAMETERS
    mlflow.log_params(pipeline_params)

    # LOG INPUTS (QUERIES) AND OUTPUTS
    # MLflow example uses a list of strings or a list of str->str dicts
    # Will be useful in STAGING/Evaluation

    # LOG MODEL
    # Instantiate sklearn TFIDFVectorizer
    sklearn_transformer = TfidfVectorizer(**sklearn_transformer_params)

    _print_section("sklearn fit transform on ingredients:")

    # One lemmatized ingredient string per recipe, indexed by recipe id
    model_input = whole_nlp_df["ingredients_lemmafied"]

    _print_section("Input Data: ")
    print(model_input)

    _print_section("Input Data Shape: ")
    print(model_input.shape)

    # Fixed seed so the same three recipes are sampled on every run
    random_sample = model_input.sample(3, random_state=200)

    _print_section("Random 3 Records from Input Data: ")
    print(random_sample)

    # Do fit transform on data (tqdm only adds a progress bar over the rows)
    response = sklearn_transformer.fit_transform(tqdm(model_input))

    # Densify the sparse matrix into a DataFrame: one column per n-gram
    # feature, rows keyed by the same recipe ids as the input.
    transformed_recipe = pd.DataFrame(
        response.toarray(),
        columns=sklearn_transformer.get_feature_names_out(),
        index=model_input.index,
    )

    signature = infer_signature(
        model_input=model_input, model_output=transformed_recipe
    )

    _print_section("Transformed Data:")
    print(transformed_recipe.head())

    # Index-aligned joins: attach the source text to its vector. The sample
    # join keeps only the three randomly selected recipes.
    combined_df = transformed_recipe.join(model_input, how="inner")
    combined_df_sample = transformed_recipe.join(random_sample, how="inner")

    _print_section("Random Sample of Combined Data:")
    print(combined_df_sample.head())

    # `pickle` is dill here (see the notebook imports), used to serialise the
    # fitted transformer to the path registered in `artifacts`.
    with open(sklearn_transformer_path, "wb") as fo:
        pickle.dump(sklearn_transformer, fo)

    transformed_recipe.to_parquet(
        path=transformed_recipes_parquet_path, compression="gzip"
    )

    combined_df.to_parquet(path=combined_df_path, compression="gzip")

    combined_df_sample.to_parquet(path=combined_df_sample_path)

    # since this uses a custom Stanza analyzer, we have to use a custom mlflow.Pyfunc.PythonModel
    # NOTE(review): artifacts["sklearn_model"] points at python_model.pkl, which
    # this cell does not write - confirm the file already exists on disk.
    model_info = mlflow.pyfunc.log_model(
        code_path=["../src/backend/"],
        python_model=CustomSKLearnWrapper(),
        # Positional access must go through .iloc: the Series is indexed by
        # recipe-id strings, and integer keys via [] are deprecated as
        # positional lookups in recent pandas.
        input_example=model_input.iloc[0],
        signature=signature,
        artifact_path="sklearn_model",
        artifacts=artifacts,
    )



--------------------------------------------------------------------------------
sklearn fit transform on ingredients:


--------------------------------------------------------------------------------
Input Data: 
id
54a2b6b019925f464b373351    tablespoon yellow mustard seed brk tablespoon ...
54a408a019925f464b3733bc    pound small leave bulk spinach brk salt brk cu...
54a408a26529d92b2c003631    cup purpose flour brk tablespoon baking powder...
54a408a66529d92b2c003638    small ripe avocado hass see note brk teaspoon ...
54a408a719925f464b3733cc    pound fresh tomato unpeeled cut quarter ounce ...
                                                  ...                        
59541a31bff3052847ae2107    tablespoon unsalt butter room temperature brk ...
5954233ad52ca90dc28200e7    tablespoon stick salt butter room temperature ...
595424c2109c972493636f83    tablespoon unsalted butter more greasing pan b...
5956638625dc3d1d829b7166    coarse salt brk lime wedge brk ounce tomato ju...


100%|██████████| 34756/34756 [00:03<00:00, 10450.53it/s]




--------------------------------------------------------------------------------
Transformed Data:
                          100g  125g  13x9x2  150g  1pound  1tablespoon  \
id                                                                        
54a2b6b019925f464b373351   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a019925f464b3733bc   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a26529d92b2c003631   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a66529d92b2c003638   0.0   0.0     0.0   0.0     0.0          0.0   
54a408a719925f464b3733cc   0.0   0.0     0.0   0.0     0.0          0.0   

                          1teaspoon  200g  250g  2cup  ...  árbol divide  \
id                                                     ...                 
54a2b6b019925f464b373351        0.0   0.0   0.0   0.0  ...           0.0   
54a408a019925f464b3733bc        0.0   0.0   0.0   0.0  ...           0.0   
54a408a26529d92b2c003631        0.0   0.0   0.0   0.0  ...           

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



In [None]:
# Version the TF-IDF matrix parquet written during the MLflow run with DVC.
!dvc add "../data/processed/transformed_recipes.parquet.gzip"

[?25l[32m⠋[0m Checking graph                                                 
Adding...                                                                       
![A
  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0[A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out ../data/processed/transfo0/? [00:00<?,    ?files/s][A
  0%|          |Checking out ../data/processed/transfo0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  5.53file/s][A

To track the changes with git, run:

	git add ../data.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [None]:
# Version the combined (vectors + source text) parquet written during the MLflow run with DVC.
!dvc add "../data/processed/combined_df.parquet.gzip"

[?25l[32m⠋[0m Checking graph                                                 
Adding...                                                                       
![A
  0% Checking cache in '/home/awchen/Repos/Projects/MeaLeon/.dvc/cache'| |0/? [0[A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out ../data/processed/combine0/? [00:00<?,    ?files/s][A
  0%|          |Checking out ../data/processed/combine0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00,  5.37file/s][A

To track the changes with git, run:

	git add ../data.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m

In [None]:
# | hide
# Export the cells marked `# | export` from this notebook into the package.
# NOTE(review): the recorded output of this cell warns that the notebook uses
# `#|export` without a `#|default_exp` cell; the `# | default_exp testing` line
# at the very top does not appear to sit inside a code cell - confirm and move
# it into one.
nbdev.nbdev_export()

Note nbdev2 no longer supports nbdev1 syntax. Run `nbdev_migrate` to upgrade.
See https://nbdev.fast.ai/getting_started.html for more information.
  warn(f"Notebook '{nbname}' uses `#|export` without `#|default_exp` cell.\n"
