# | default_exp Testing Polars for EDA, migrating cleaned data to Vespa

In [None]:
# | hide
import adbc_driver_postgresql.dbapi
from datetime import datetime
from enum import Enum
import json
import polars as pl
import requests
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import vespa

1. Migrate data from Postgres to Vespa
   a. The cuisine labels need some cleaning first, since they contain typos
   The Postgres database can remain the rawer form of the scrapes

In [None]:
# Prerequisite: the Postgres server must already be running.
# Read the login details from the local secrets file.
postgres_key_path = "../secrets/postgres_login.json"
with open(postgres_key_path, "r") as fo:
    postgres_key = json.load(fo)

# Unpack the three fields needed to build the connection URI.
user, password, host = (
    postgres_key["user"],
    postgres_key["password"],
    postgres_key["host"],
)


In [None]:
# Try connecting to PostgreSQL with ADBC.
# Percent-encode the credentials so reserved characters in them
# (e.g. "@", ":", "/", "#") cannot corrupt the connection URI.
from urllib.parse import quote

uri = f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}@{host}/mealeon"


In [None]:
# Smoke-test the connection by printing one row of the raw scrape table.
# The connection is opened as a context manager (like the cursor) so it is
# closed once the check is done, even if the query raises.
with adbc_driver_postgresql.dbapi.connect(uri) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT * FROM recipe_scrapes LIMIT 2")
        print(cur.fetchone())

('AfricanBites-3f1a4fc7e099375adc6f06fbf4389396c3ad5bdfe4661a1980f0ee1143fe0317', 'English', '629883', 'Smoked Spatchcock Turkey', 'AfricanBites', 'https://www.africanbites.com/smoked-spatchcock-turkey/', ['salt and pepper for seasoning', '1 15-pound turkey', '4 tablespoons your choice of turkey rub (Creole, Italian, poultry, or your choice of seasoning)'], 'https://www.africanbites.com/wp-content/uploads/2021/08/IMG_9551-Copy-650x650.jpg', "Enjoy fall-off-the-bone goodness with flavorful juiciness in every bite. Simple smoking delivers flavor penetrating deep into your holiday bird without drying it out. This is your go-to guide for a deliciously tender and juicy turkey that's perfect any time of year!", ['Remove the giblet package.', 'Use your handy kitchen shears to cut the left side of the backbone from the tail to the neck. Do the same to the right side of the backbone. (A sharp knife will work as well.)', 'Flip the turkey over and press on it to break the breast bone and flatten 

In [None]:
# First test: count the distinct recipes scraped from each origin site.

query = """
    SELECT
        origin
        , COUNT(DISTINCT mealeon_id) AS num_recipes
    FROM recipe_scrapes
    GROUP BY origin
"""

df = pl.read_database_uri(query=query, uri=uri, engine="adbc")

# Preview the first rows of the per-site counts.
df.head(5)

origin,num_recipes
str,i64
"""AfricanBites""",1491
"""AllRecipes""",4781
"""Epicurious""",4905
"""Panlasang_Pinoy""",1938


In [None]:
# Second test: explode the cuisines array in SQL and count the distinct
# recipes carrying each individual cuisine label.

query = """
    SELECT
        UNNEST(cuisines) AS cuisine
        , COUNT(DISTINCT mealeon_id) AS num_recipes
    FROM recipe_scrapes
    GROUP BY cuisine
"""

df = pl.read_database_uri(query=query, uri=uri, engine="adbc")

df

cuisine,num_recipes
str,i64
"""30 Minutes or Less""",220
"""5 Ingredients or Fewer""",328
"""AFRICAN CARIBBEAN""",6
"""Absinthe""",6
"""Acorn Squash""",8
…,…
"""Ziti""",2
"""Zucchini""",36
"""dinner""",1
"""soul""",9


In [None]:
# tbl_rows=-1 lifts the row limit so every cuisine label is printed.
with pl.Config(tbl_rows=-1):
    print(df)

shape: (885, 2)
┌─────────────────────────────────┬─────────────┐
│ cuisine                         ┆ num_recipes │
│ ---                             ┆ ---         │
│ str                             ┆ i64         │
╞═════════════════════════════════╪═════════════╡
│ 30 Minutes or Less              ┆ 220         │
│ 5 Ingredients or Fewer          ┆ 328         │
│ AFRICAN CARIBBEAN               ┆ 6           │
│ Absinthe                        ┆ 6           │
│ Acorn Squash                    ┆ 8           │
│ African                         ┆ 503         │
│ African-American                ┆ 1           │
│ African/Carribean               ┆ 3           │
│ Agua Fresca                     ┆ 6           │
│ Air Fryer                       ┆ 15          │
│ Alcohol                         ┆ 392         │
│ Aleppo-Style Pepper             ┆ 1           │
│ Almond                          ┆ 79          │
│ Almond Flour                    ┆ 6           │
│ Amaranth                        

In [None]:
# Raw scraped label -> cleaned cuisine label.
rename_cuisines = {
    # spelling / formatting variants
    "AFRICAN CARIBBEAN": "African-Caribbean",
    "African/Carribean": "African-Caribbean",
    "Carribean": "Caribbean",
    "soul": "Soul",
    # regional relabel
    "Southern": "US Southern",
    # meal / occasion tags folded into a cuisine
    "thanksgiving": "American",
    "dinner": "American",
    # NOTE(review): several recipes tagged "New" have New Zealand in the
    # title -- confirm "Fusion" is the intended target.
    "New": "Fusion",
}

# Placeholder: the set of accepted cuisines has not been filled in yet.
allowed_cuisines = {
    "",
}

# Open questions:
# - should World and International fall under Fusion?
# - create supersets? e.g. British covers English, Irish, Scottish, Welsh;
#   US Southern covers Soul

In [None]:
# How many recipes carry the stray "dinner" label?
df.filter(
    pl.col("cuisine") == "dinner"
)

cuisine,num_recipes
str,i64
"""dinner""",1


In [None]:
# How many recipes carry the "World" label?
df.filter(
    pl.col("cuisine") == "World"
)

cuisine,num_recipes
str,i64
"""World""",2


In [None]:
# One row per (recipe, cuisine) pair, so individual labels can be traced
# back to the recipes that carry them.
query = """
    SELECT
        mealeon_id
        , title
        , UNNEST(cuisines) AS cuisine
    FROM recipe_scrapes
"""

whole_df = pl.read_database_uri(query=query, uri=uri, engine="adbc")

# Which recipe is labeled "dinner"?
whole_df.filter(
    pl.col("cuisine") == "dinner"
)

mealeon_id,title,cuisine
str,str,str
"""AfricanBites-d7711d944dfac2608…","""Smoked Beer Can Chicken""","""dinner"""


In [None]:
# title "Smoked Beer Can Chicken" should become American

In [None]:
# Recipes labeled "World".
whole_df.filter(
    pl.col("cuisine") == "World"
)

mealeon_id,title,cuisine
str,str,str
"""AfricanBites-f6d10c8a7661af70d…","""How to Cook Jasmine Rice""","""World"""
"""AfricanBites-37edc9ea46a786a3b…","""How to Debone Chicken Thighs""","""World"""


In [None]:
# Recipes labeled "thanksgiving".
whole_df.filter(
    pl.col("cuisine") == "thanksgiving"
)

mealeon_id,title,cuisine
str,str,str
"""AfricanBites-95b6d113f5e590a53…","""Smoked Turkey Legs""","""thanksgiving"""
"""AfricanBites-e94c35b5d5d62c1da…","""Smoked Turkey Breast""","""thanksgiving"""


In [None]:
# Recipes labeled with lowercase "soul".
whole_df.filter(
    pl.col("cuisine") == "soul"
)

mealeon_id,title,cuisine
str,str,str
"""AfricanBites-f3e0779d8b9fa3c85…","""Chitterlings (Chitlins)""","""soul"""
"""AfricanBites-00073465322187887…","""Hog Maw""","""soul"""
"""AfricanBites-1bc8aa91a96c82547…","""Crispy Pork Jowl""","""soul"""
"""AfricanBites-c75aa5fb1e5a28aed…","""Ham Casserole""","""soul"""
"""AfricanBites-0076e7f4c79efd1e1…","""Pickled Pigs’ Feet""","""soul"""
"""AfricanBites-37a94e9f4db8d1dc3…","""Sweet Potato Biscuits""","""soul"""
"""AfricanBites-6f81180e7903637b2…","""Chicken Corn Chowder""","""soul"""
"""AfricanBites-4e1d69c66f031e599…","""Hoppin John""","""soul"""
"""AfricanBites-8761b8303d6685612…","""Air-Fryer Pork Chops""","""soul"""


In [None]:
# Recipes labeled "New".
whole_df.filter(
    pl.col("cuisine") == "New"
)

mealeon_id,title,cuisine
str,str,str
"""AllRecipes-af169286bdd939a9e5e…","""Kiwi Fruit Salsa""","""New"""
"""AllRecipes-3440eb9d648fa5bea9a…","""Yummy Pikelets""","""New"""
"""AllRecipes-11d30e76f14355834af…","""Shearers' Mince and Potato Hot…","""New"""
"""AllRecipes-e7642daa54b6e84c683…","""Dad's New Zealand Mince Stew""","""New"""
"""AllRecipes-7c91bff5430c2a2c412…","""Barbequed Thai Style Chicken""","""New"""
…,…,…
"""AllRecipes-1a4a1cafe5f2f5e9ee8…","""Keligun Chicken""","""New"""
"""AllRecipes-9f5d8a8347394ba3782…","""Skite Cake""","""New"""
"""AllRecipes-467c30863817f7dd199…","""Turkey Stir Fry with Lychees""","""New"""
"""AllRecipes-d72d9be6ebb2a8f01c3…","""Hamburger Sarah Style""","""New"""


In [None]:
# Print every recipe whose cuisine is the "Missing Cuisine" placeholder,
# with the row limit lifted.
with pl.Config(tbl_rows=-1):
    print(
        whole_df.filter(pl.col("cuisine") == "Missing Cuisine")
    )

shape: (2_060, 3)
┌─────────────────────────────────┬─────────────────────────────────┬─────────────────┐
│ mealeon_id                      ┆ title                           ┆ cuisine         │
│ ---                             ┆ ---                             ┆ ---             │
│ str                             ┆ str                             ┆ str             │
╞═════════════════════════════════╪═════════════════════════════════╪═════════════════╡
│ Epicurious-bafe135c3bb2cd9ea06… ┆ 43 Impressive Vegan Desserts f… ┆ Missing Cuisine │
│ Epicurious-1421c95a14fab4c1fcc… ┆ 83 Easy Summer Desserts         ┆ Missing Cuisine │
│ Epicurious-c29b348db0bde236ec7… ┆ 73 Finger Foods for Toddlers    ┆ Missing Cuisine │
│ Epicurious-382b5ae308c3559ce3f… ┆ The 8 Best Kitchen Tools For N… ┆ Missing Cuisine │
│ Epicurious-7b3c3b38d467e433b19… ┆ A Breastfeeding Parent's Snack… ┆ Missing Cuisine │
│ Epicurious-feafbd9ea5d23d83671… ┆ For Punchier Pantry Pasta, Add… ┆ Missing Cuisine │
│ Epicurious-1

In [None]:
# leave Missing Cuisine as is, could be used as test data?

# now that we unpacked the cuisines, we actually want to fix some of these inconsistencies

In [None]:
# Raw scraped label -> cleaned cuisine label.
rename_cuisines = {
    "AFRICAN CARIBBEAN": "African-Caribbean",
    "African/Carribean": "African-Caribbean",
    "Carribean": "Caribbean",
    "Southern": "US Southern",
    "soul": "Soul",
    "thanksgiving": "American",
    "dinner": "American",
    "New": "Fusion",
}

# Keep the raw label and add the cleaned one next to it as "replaced".
whole_df = whole_df.with_columns(
    pl.col("cuisine").replace(rename_cuisines).alias("replaced")
)

In [None]:
whole_df

mealeon_id,title,cuisine,replaced
str,str,str,str
"""AfricanBites-3f1a4fc7e099375ad…","""Smoked Spatchcock Turkey""","""Southern""","""US Southern"""
"""AfricanBites-ad9870f6689604624…","""How to Brine a Turkey""","""American""","""American"""
"""AfricanBites-9b119ba4403faa097…","""Refried Beans""","""Mexican""","""Mexican"""
"""AfricanBites-378e4cd9d9469919c…","""Lemon Blueberry Scones""","""American""","""American"""
"""AfricanBites-378e4cd9d9469919c…","""Lemon Blueberry Scones""","""British""","""British"""
…,…,…,…
"""Epicurious-c2d14b4a9c8ce565e0c…","""Recipes From Bon Appétit’s 202…","""Missing Cuisine""","""Missing Cuisine"""
"""Epicurious-6e1a2c4e1c055fdd6c8…","""Pesto Pasta with V8® Healthy G…","""Missing Cuisine""","""Missing Cuisine"""
"""Epicurious-a3b0de4275f85201f5c…","""Paprika Shrimp with Buttery V8…","""Missing Cuisine""","""Missing Cuisine"""
"""Epicurious-02b70304b781f776b55…","""Sesame Oat Apple Crisp""","""Missing Cuisine""","""Missing Cuisine"""


In [None]:
# For each cleaned label, count its rows and collect the raw labels that
# were folded into it; biggest groups first.
q = (
    whole_df.lazy()
    .group_by("replaced")
    .agg(
        pl.len().alias("recipe_count"),
        pl.col("cuisine"),
        pl.col("title"),
    )
    .with_columns(unique_cuisines=pl.col("cuisine").list.unique())
    .sort("recipe_count", descending=True)
)
temp = q.collect()

# Only the printing needs the row limit lifted.
with pl.Config(tbl_rows=-1):
    print(temp.select("replaced", "unique_cuisines"))

shape: (880, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ replaced                        ┆ unique_cuisines                 │
│ ---                             ┆ ---                             │
│ str                             ┆ list[str]                       │
╞═════════════════════════════════╪═════════════════════════════════╡
│ Mexican                         ┆ ["Mexican"]                     │
│ Missing Cuisine                 ┆ ["Missing Cuisine"]             │
│ Filipino                        ┆ ["Filipino"]                    │
│ Cookbooks                       ┆ ["Cookbooks"]                   │
│ Nut Free                        ┆ ["Nut Free"]                    │
│ Vegetable                       ┆ ["Vegetable"]                   │
│ Dinner                          ┆ ["Dinner"]                      │
│ Easy                            ┆ ["Easy"]                        │
│ Vegetarian                      ┆ ["Vegetarian"]                  │
│ Gl

In [None]:
# want to see how many recipes have more than one cuisine
# look in replaced, count the number in replaced, order by count descending

# with pl.Config(tbl_rows = -1):
#     q = (
#         whole_df.lazy()
#         # .group_by("replaced")
#         # .agg(
#         #     pl.len().alias("recipe_count"),
#         #     pl.col("cuisine", "title"),
#         # )
#         .with_columns(pl.col("replaced").list.len().alias("num_cuisines"), 
#             )
#         .sort("num_cuisines", descending=True)
#     )

#     temp = q.collect()
#     print(temp)

In [None]:
# with pl.Config(tbl_rows = -1):
#     q = (
#         whole_df.lazy()
#         # .group_by("replaced")
#         # .agg(
#         #     pl.len().alias("recipe_count"),
#         #     pl.col("cuisine", "title"),
#         # )
#         .with_columns(pl.col("cuisine").list.len().alias("num_cuisines"), 
#             )
#         .sort("num_cuisines", descending=True)
#     )

#     temp = q.collect()
#     print(temp)

Unclear if there are actually multiple cuisines in some recipes or not. Scraper should've allowed that behavior since it's a list of strings, but let's look in postgres

In [None]:
# Histogram of label counts: take the length of each recipe's cuisines
# array, then count how many recipes have each length.

query = """
    -- group recipe counts by cuisines counts
    SELECT
        num_cuisines
        , COUNT(DISTINCT mealeon_id) AS num_recipes
    FROM (
        -- unnest the cuisine array and count number of cuisines per recipe
        SELECT
            mealeon_id
            , CARDINALITY(cuisines) AS num_cuisines
        FROM recipe_scrapes
        GROUP BY mealeon_id
    ) cuisine_count
    GROUP BY num_cuisines
    ORDER BY num_cuisines DESC
"""

cuisine_counter_df = pl.read_database_uri(query=query, uri=uri, engine="adbc")

cuisine_counter_df

num_cuisines,num_recipes
i32,i64
,1
35,2
34,2
33,3
31,3
…,…
5,18
4,11
3,110
2,206


In [None]:
# Fetch the full rows of recipes that have more than one cuisine label,
# or whose cuisines array is NULL.

query = """
    -- group recipe counts by cuisines counts
    SELECT
        *
    FROM (
        -- unnest the cuisine array and count number of cuisines per recipe
        SELECT
            *
            , CARDINALITY(cuisines) AS num_cuisines
        FROM recipe_scrapes
        GROUP BY mealeon_id
    ) cuisine_count
    WHERE num_cuisines > 1 OR num_cuisines IS NULL
"""

cuisine_counter_df = pl.read_database_uri(query=query, uri=uri, engine="adbc")

cuisine_counter_df

mealeon_id,language,source_id,title,origin,url,ingredients,photo_url,description,steps,cuisines,num_cuisines
str,str,str,str,str,str,list[str],str,str,list[str],list[str],i32
"""AfricanBites-00073465322187887…","""English""","""662693""","""Hog Maw""","""AfricanBites""","""https://www.africanbites.com/h…","[""3-4 pounds (1.3-1.8k) small pig stomachs (hog maw)"", ""1 tablespoon salt"", … ""Salt and black pepper to taste""]","""https://www.africanbites.com/w…","""A robust, savory dish bursting…","[""Thoroughly wash the hog maw by soaking it in vinegar, baking soda, salt, and water for about an hour. After that, rinse twice under running tap water until clean. "", ""Next, remove any excess fat and cut it into bite-sized pieces. Place the hog maw in a medium cast iron pot, cover entirely with water, and cook over medium-high heat for 60-90 minutes."", … ""Serve and enjoy!""]","[""soul"", ""Southern""]",2
"""AfricanBites-0076e7f4c79efd1e1…","""English""","""652850""","""Pickled Pigs’ Feet""","""AfricanBites""","""https://www.africanbites.com/p…","[""6 pigs feet (1½–2 pounds)"", ""1 cup water"", … ""4 chili peppers""]","""https://www.africanbites.com/w…","""Tender, tangy, and nutritious,…","[""Wash pigs' feet thoroughly, then place them in a large pot and pour water over them until covered. Bring to a boil over medium-high heat, reduce the heat, and simmer for 1½ to 2 hours until the pigs' feet are tender. While they are cooking, skim off any foam that rises to the surface."", ""Remove the feet from the cooking liquid, rinse with water, remove as many bones as possible, and set them aside. Discard the cooking liquid."", … ""Pour the pickling liquid (vinegar mixture) to cover the pigs' feet. Cover the jar tightly with a lid and place in the refrigerator for 3-7 days before serving.""]","[""soul"", ""Southern""]",2
"""AfricanBites-00c41128c7051c143…","""English""","""624662""","""Summer Pasta Recipes""","""AfricanBites""","""https://www.africanbites.com/s…","[""Creamy Chicken Pasta Salad"", ""Easy Pasta Salad"", … ""Stovetop Mac and Cheese""]","""https://www.africanbites.com/w…","""Budget-friendly, kid-friendly,…",[],"[""American"", ""Italian"", ""Southern""]",3
"""AfricanBites-016c4420eec473b58…","""English""","""661486""","""Mississippi Chicken Recipe""","""AfricanBites""","""https://www.africanbites.com/m…","[""3-4 boneless, skinless chicken breasts (about 2 pounds) "", ""1 sweet onion, sliced"", … ""½ teaspoon black pepper, freshly ground""]","""https://www.africanbites.com/w…","""Rich, zesty, and savory, Missi…","[""Lightly grease the bottom of the slow cooker pot with cooking spray, add the sliced onion and smashed garlic to the bottom of the slow cooker, then top with the chicken breasts."", ""Sprinkle all dry ingredients, ranch seasoning, bouillon powder, salt, and pepper, over the chicken. Rub the spice mix on the chicken using a brush. Then add chicken broth and Worcestershire sauce. "", … ""When it's ready, shred the chicken using two forks. Serve with mashed potatoes, rice, or bread. Enjoy!""]","[""American"", ""Southern""]",2
"""AfricanBites-029904f135e9295b3…","""English""","""680968""","""Cornbread With Corn""","""AfricanBites""","""https://www.africanbites.com/c…","[""1 cup all-purpose flour "", ""1¼ cup white cornmeal or yellow cornmeal"", … ""Melted butter (to brush the top of the cornbread after it comes out of the oven)""]","""https://www.africanbites.com/w…","""Crispy edges, a tender, moist …","[""Preheat your oven to 375°F (190°C). Grease a 9-inch square baking dish or cast-iron skillet."", ""In a medium bowl, whisk the flour, cornmeal, baking powder, sugar, and salt."", … ""Bake for 20-25 minutes or until golden brown and a skewer inserted in the center comes out clean.""]","[""American"", ""Southern""]",2
…,…,…,…,…,…,…,…,…,…,…,…
"""Epicurious-ff6bfa7dcd8ae90d5d3…","""English""","""626ec9849ea8cd1feb84929a""","""Mutabbal (Eggplant-Tahini Dip)""","""Epicurious""","""https://www.epicurious.com/rec…","[""2 large globe eggplants (about 2 pounds)"", ""Olive oil, for rubbing eggplants, plus more for drizzling"", … ""1 Tbsp. coarsely chopped parsley""]","""https://assets.epicurious.com/…","""The tahini-laced eggplant vers…","[""Cook the eggplants directly on the burner of a gas stove or in the oven. On a gas stove, sear the eggplants directly over a medium-high flame. Rotate every 5 minutes, until the globes are completely blackened, about 15 minutes total. Alternately, preheat the oven to 450°F. Place the eggplants on a sheet tray, poke them all over with the tines of a fork, rub them with the oil, and place the tray in the oven and cook for 25 to 30 minutes, until the eggplants collapse in on themselves and the skins are blackened."", ""Place the cooked eggplants in a bowl and seal the bowl with plastic wrap to let the eggplants steam in their own juices."", … ""When ready to serve, scoop the eggplant mixture onto a serving plate. Use a spoon to create a well in the center and drizzle in the oil, then garnish with the molasses, pomegranate seeds, and parsley.""]","[""Dip"", ""Middle Eastern"", … ""Cookbooks""]",16
"""Epicurious-ff89f0edeb5f6f5ff15…","""English""","""60464e7515c0f4f277811f25""","""Bamboo""","""Epicurious""","""https://www.epicurious.com/rec…","[""1 part dry vermouth"", ""1 part amontillado sherry"", ""Orange bitters""]","""https://assets.epicurious.com/…","""Mix up this sherry and vermout…","[""For individual drinks, stir vermouth, sherry, and a dash or two of orange bitters together with ice, then strain into a coupe glass. Alternately, build in a rocks glass with ice and stir until chilled."", ""Alternatively, to mix a batch of Bamboo cocktails, combine half a bottle (375 ml) each of vermouth and sherry and a teaspoon or thereabouts of orange bitters in a bottle or resealable jar. Add 1 teaspoon orange bitters. Keep in the fridge to be served when and however you fancy it.""]","[""Cocktail"", ""Alcohol"", … ""Cookbooks""]",7
"""Epicurious-ffa1023bc62d032683e…","""English""","""54a439436529d92b2c01989b""","""Wassail""","""Epicurious""","""https://www.epicurious.com/rec…","[""2 (3”) cinnamon sticks, plus more for garnish"", ""10 whole allspice berries"", … ""1 cup apple brandy, such as Calvados""]","""https://assets.epicurious.com/…","""Wassail means different things…","[""Combine 2 (3”) cinnamon sticks, 10 whole allspice berries, 8 whole cloves, 4 cups cranberry juice cocktail, and 4 cups fresh apple cider in a medium pot; bring to a simmer over medium heat. Cook, uncovered, adjusting heat as needed to maintain a gentle simmer, 10 minutes until spices are infused. Using a slotted spoon or skimmer, fish out and discard allspice berries and cloves, leaving cinnamon sticks in drink."", ""Add 1 Granny Smith apple, quartered, cored, thinly sliced, and 1 cup apple brandy, such as Calvados, to cider mixture; simmer about 2 minutes (apple slices will remain crisp)."", … ""Editor’s note: This wassail recipe was first printed in the December 2002 issue of ‘Gourmet.’ Head this way for more of our favorite hot drink recipes →""]","[""Cocktail"", ""Alcohol"", … ""Berry""]",24
"""Epicurious-ffa7fd0896ed8be6f93…","""English""","""64594f664311959de61fc1b2""","""Green Mango, Cabbage, and Jica…","""Epicurious""","""https://www.epicurious.com/rec…","[""2 cups packed thinly sliced green cabbage leaves"", ""½ small jicama, peeled and cut into medium-thick matchsticks"", … ""⅓ cup finely chopped unsalted roasted peanuts or cashews""]","""https://assets.epicurious.com/…","""Many people assume that an unr…","[""In a large bowl, combine the cabbage, jicama, and mango and set aside. (The vegetables and fruit can be stored, covered, in the refrigerator for up to 24 hours.)"", ""Using a fine rasp grater, such as a Microplane, zest the lime directly into a small bowl. Squeeze the lime to get 2 tablespoons of juice; if you’re short, add vinegar to make up the difference. Add the lime juice to the zest and then add the sugar (the amount depends on your palate and the mango—use more if the mango is sour). Stir to dissolve the sugar, then taste and add more sugar, if needed, for a strong tart-sweet finish. Add enough of the fish sauce to arrive at a bold, salty-tangy finish. Add the garlic and chile, stir, and then set the dressing aside."", ""Toss the vegetables and fruit well with the dressing, mint, and peanuts, until the cabbage and jicama soften slightly. Transfer to a shallow serving bowl, leaving excess dressing behind. Serve immediately.""]","[""Cookbooks"", ""Salad"", … ""Dinner""]",18


So we know that there are recipes with more than one cuisine. Is the `replace` text overriding them? Need to make sure that the replacement text goes back into an array of strings

In [None]:
# rename_cuisines = {
#     "AFRICAN CARIBBEAN": "African-Caribbean",
#     "African/Carribean": "African-Caribbean",
#     "Carribean": "Caribbean",
#     "Southern": "US Southern", 
#     "soul": "Soul",
#     "thanksgiving": "American",
#     "dinner": "American",
#     "New": "Fusion"
# }

# whole_df = whole_df.with_columns(replaced_eval=pl.col("cuisine")
#                                  .list.eval(pl.element().replace(rename_cuisines))
#                                  )

In [None]:
# Gather the cleaned labels back into one list per recipe.
q = (
    whole_df.lazy()
    .group_by("mealeon_id")
    .agg(cleaned_cuisines=pl.col("replaced"))
)
temp = q.collect()

# Only the printing needs the row limit lifted.
with pl.Config(tbl_rows=-1):
    print(temp)

shape: (13_114, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ mealeon_id                      ┆ cleaned_cuisines                │
│ ---                             ┆ ---                             │
│ str                             ┆ list[str]                       │
╞═════════════════════════════════╪═════════════════════════════════╡
│ AfricanBites-1b0d897e3d2406616… ┆ ["Italian"]                     │
│ AfricanBites-d4d16f262aebd33e4… ┆ ["American"]                    │
│ Panlasang_Pinoy-2f1ec2d68ed912… ┆ ["Filipino"]                    │
│ Epicurious-1cf53dd9a830f97a0a3… ┆ ["Missing Cuisine"]             │
│ Epicurious-56cf16b0d2b2d344525… ┆ ["Missing Cuisine"]             │
│ AfricanBites-35286a9c17ee8d4b1… ┆ ["US Southern"]                 │
│ AllRecipes-144b0c7becf50285fb8… ┆ ["Indian"]                      │
│ AllRecipes-72fe01d92a72df786a4… ┆ ["African"]                     │
│ Epicurious-a4ae3d3d0ec5f662d9a… ┆ ["Missing Cuisine"]             │
│

Multiple approaches
1. Pull the original data from the server again and join it with the relabeled-cuisine dataframe
2. Go back and do a list operation on the original data <-

In [None]:
# Pull every column of every scraped recipe.

query = """
    SELECT
        *
    FROM recipe_scrapes
"""

all_data_df = pl.read_database_uri(query=query, uri=uri, engine="adbc")

# Raw scraped label -> cleaned cuisine label.
rename_cuisines = {
    "AFRICAN CARIBBEAN": "African-Caribbean",
    "African/Carribean": "African-Caribbean",
    "Carribean": "Caribbean",
    "Southern": "US Southern",
    "soul": "Soul",
    "thanksgiving": "American",
    "dinner": "American",
    "New": "Fusion",
}

# Apply the mapping to each element inside the cuisines list, so a recipe
# keeps its list of labels instead of being exploded into rows.
q = all_data_df.lazy().with_columns(
    pl.col("cuisines")
    .list.eval(pl.element().replace(rename_cuisines))
    .alias("relabeled_cuisines")
)
temp = q.collect()

# Only the printing needs the row limit lifted.
with pl.Config(tbl_rows=-1):
    print(temp)

shape: (13_115, 12)
┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ mealeon_i ┆ language ┆ source_id ┆ title     ┆ … ┆ descripti ┆ steps     ┆ cuisines  ┆ relabeled │
│ d         ┆ ---      ┆ ---       ┆ ---       ┆   ┆ on        ┆ ---       ┆ ---       ┆ _cuisines │
│ ---       ┆ str      ┆ str       ┆ str       ┆   ┆ ---       ┆ list[str] ┆ list[str] ┆ ---       │
│ str       ┆          ┆           ┆           ┆   ┆ str       ┆           ┆           ┆ list[str] │
╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ AfricanBi ┆ English  ┆ 629883    ┆ Smoked    ┆ … ┆ Enjoy fal ┆ ["Remove  ┆ ["Souther ┆ ["US Sout │
│ tes-3f1a4 ┆          ┆           ┆ Spatchcoc ┆   ┆ l-off-the ┆ the       ┆ n"]       ┆ hern"]    │
│ fc7e09937 ┆          ┆           ┆ k Turkey  ┆   ┆ -bone     ┆ giblet    ┆           ┆           │
│ 5ad…      ┆          ┆           ┆           ┆   ┆ goodne…   ┆ packag

In [None]:
# temp.filter(
#     pl.col("relabeled_cuisines").list.len() > 1
# )


In [None]:
# temp.filter(
#     pl.col("relabeled_cuisines").list.len() == 0
# )

In [None]:
# temp.filter(
#     pl.col("cuisines").list.len() == 0
# )

In [None]:
# temp.filter(
#     pl.col("relabeled_cuisines").is_null()
# )

We have successfully cleaned up the cuisine labels; we should now migrate these further-cleaned recipes into Postgres and then Vespa to create embeddings

In [None]:
# send dataframe back into Postgres
# cleaned_data_uri = f"postgresql://{user}:{password}@{host}/mealeon"

# temp.write_database(table_name="cleaned_recipes", connection=cleaned_data_uri, engine="adbc")

In [None]:
# conn = adbc_driver_postgresql.dbapi.connect(cleaned_data_uri)
# with conn.cursor() as cur:
#     cur.execute("SELECT * FROM cleaned_recipes LIMIT 2")
#     print(cur.fetchone())

Whoops, didn't remove the original column

May need to check scraper: words with embedded links in the steps or ingredients may have been dropped 

In [None]:
# Remove the raw "cuisines" column from temp in place; the removed column
# is the returned value that the notebook displays below.
temp.drop_in_place("cuisines")


cuisines
list[str]
"[""Southern""]"
"[""American""]"
"[""Mexican""]"
"[""American"", ""British""]"
"[""Southern""]"
…
"[""Missing Cuisine""]"
"[""Missing Cuisine""]"
"[""Missing Cuisine""]"
"[""Missing Cuisine""]"


In [None]:
# Give the cleaned column the "cuisines" name.
temp = temp.rename({"relabeled_cuisines": "cuisines"})

In [None]:
print(temp)

shape: (13_115, 11)
┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ mealeon_i ┆ language ┆ source_id ┆ title     ┆ … ┆ photo_url ┆ descripti ┆ steps     ┆ cuisines  │
│ d         ┆ ---      ┆ ---       ┆ ---       ┆   ┆ ---       ┆ on        ┆ ---       ┆ ---       │
│ ---       ┆ str      ┆ str       ┆ str       ┆   ┆ str       ┆ ---       ┆ list[str] ┆ list[str] │
│ str       ┆          ┆           ┆           ┆   ┆           ┆ str       ┆           ┆           │
╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ AfricanBi ┆ English  ┆ 629883    ┆ Smoked    ┆ … ┆ https://w ┆ Enjoy fal ┆ ["Remove  ┆ ["US Sout │
│ tes-3f1a4 ┆          ┆           ┆ Spatchcoc ┆   ┆ ww.africa ┆ l-off-the ┆ the       ┆ hern"]    │
│ fc7e09937 ┆          ┆           ┆ k Turkey  ┆   ┆ nbites.co ┆ -bone     ┆ giblet    ┆           │
│ 5ad…      ┆          ┆           ┆           ┆   ┆ m/w…      ┆ goodne

In [None]:
# Send the cleaned dataframe back into Postgres, replacing any existing
# "cleaned_recipes" table.
# NOTE(review): the null-cuisine fill happens in a later cell, so this write
# presumably still contains the row with a NULL cuisines value -- confirm
# whether the table should be rewritten after that step.
cleaned_data_uri = f"postgresql://{user}:{password}@{host}/mealeon"

temp.write_database(
    table_name="cleaned_recipes",
    connection=cleaned_data_uri,
    engine="adbc",
    if_table_exists="replace",
)

13115

In [None]:
print(temp)

shape: (13_115, 11)
┌───────────┬──────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ mealeon_i ┆ language ┆ source_id ┆ title     ┆ … ┆ photo_url ┆ descripti ┆ steps     ┆ cuisines  │
│ d         ┆ ---      ┆ ---       ┆ ---       ┆   ┆ ---       ┆ on        ┆ ---       ┆ ---       │
│ ---       ┆ str      ┆ str       ┆ str       ┆   ┆ str       ┆ ---       ┆ list[str] ┆ list[str] │
│ str       ┆          ┆           ┆           ┆   ┆           ┆ str       ┆           ┆           │
╞═══════════╪══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ AfricanBi ┆ English  ┆ 629883    ┆ Smoked    ┆ … ┆ https://w ┆ Enjoy fal ┆ ["Remove  ┆ ["US Sout │
│ tes-3f1a4 ┆          ┆           ┆ Spatchcoc ┆   ┆ ww.africa ┆ l-off-the ┆ the       ┆ hern"]    │
│ fc7e09937 ┆          ┆           ┆ k Turkey  ┆   ┆ nbites.co ┆ -bone     ┆ giblet    ┆           │
│ 5ad…      ┆          ┆           ┆           ┆   ┆ m/w…      ┆ goodne

Might have to dump to json, run conversion script, dump that to json

In [None]:
# polars_export_path = "../data/raw/postgres_table_dump.json"
# temp.write_json(polars_export_path)

In [None]:
# Find recipes whose ingredients, steps, or cuisines list is empty.
# `.list.len()` measures each row's list; a bare `.len()` is the number of
# rows in the column, so the `< 1` comparison could never match and empty
# lists would go unnoticed.
temp.filter(
    (pl.col("ingredients").list.len() < 1)
    | (pl.col("steps").list.len() < 1)
    | (pl.col("cuisines").list.len() < 1)
)

mealeon_id,language,source_id,title,origin,url,ingredients,photo_url,description,steps,cuisines
str,str,str,str,str,str,list[str],str,str,list[str],list[str]


In [None]:
# Find recipes where any of the three list columns is null.
temp.filter(
    pl.col("ingredients").is_null()
    | pl.col("steps").is_null()
    | pl.col("cuisines").is_null()
)

mealeon_id,language,source_id,title,origin,url,ingredients,photo_url,description,steps,cuisines
str,str,str,str,str,str,list[str],str,str,list[str],list[str]
"""AllRecipes-e556569d6282f10623b…","""English""","""222014""","""""Pantry Raid"" Chicken Enchilad…","""AllRecipes""","""https://www.allrecipes.com/rec…","[""1 (15 ounce) can tomato sauce¼ cup water1 envelope taco seasoning mix1 ½ tablespoons chili powder1 tablespoon vegetable oil1 pound chicken breast tenderloins1 (15 ounce) can black beans, drained¼ cup cream cheese1 cup shredded Mexican-style cheese blend, or more to taste1 (7.5 ounce) package corn bread mix1 egg⅓ cup milk""]","""https://imagesvc.meredithcorp.…","""I made this recipe up one nigh…","[""Preheat the oven to 375 degrees F (190 degrees C). Grease a 9x9-inch baking dish.Mix tomato sauce, water, taco seasoning mix, and chili powder together in a saucepan; bring to a simmer over medium heat.Heat vegetable oil in a skillet over medium heat and brown chicken tenderloins on both sides, about 5 minutes per side. Pour tomato sauce mixture over the chicken, bring to a simmer, and cook over medium-low heat until chicken tenderloins are no longer pink inside, about 8 minutes. Transfer chicken to a bowl and shred; return shredded chicken to the sauce. Mix in black beans and cream cheese until thoroughly combined.Pour chicken mixture into prepared baking dish. Top with shredded Mexican cheese. Whisk corn bread mix, egg, and milk in a bowl, and spoon the batter over the chicken mix.Bake in the preheated oven until the casserole is bubbling and the corn bread topping is browned and set, about 30 minutes.""]",


In [None]:
# Give recipes with a null cuisines value the same "Missing Cuisine"
# placeholder list that the other unlabeled recipes carry.
temp = temp.with_columns(
    pl.col("cuisines").fill_null(value=["Missing Cuisine"])
)

In [None]:
# Re-run the null check to confirm no list column is null after the fill.
temp.filter(
    pl.col("ingredients").is_null()
    | pl.col("steps").is_null()
    | pl.col("cuisines").is_null()
)

mealeon_id,language,source_id,title,origin,url,ingredients,photo_url,description,steps,cuisines
str,str,str,str,str,str,list[str],str,str,list[str],list[str]


In [None]:
# polars_export_path = "../data/raw/postgres_table_dump.json"
# temp.write_json(polars_export_path)

In [None]:
# convert postgres records into Vespa format
# example structure of a record in vespa format
# vespa_record = {
#                 "put": f"id:{doc_type}:{doc_type}::{['recipe']['source']}-{record['id']}",
#                 "fields": {
#                     "origin": "epicurious", # replace with ['recipe']['source']
#                     "id": "",
#                     "title": "",
#                     "ingredients": "",
#                     "steps": "",
#                     "cuisine": "",
#                 },
#             }

# {"put": f"id:{doc_type}:{doc_type}::{record[mealeon_id]}",
#  "fields": {
#     "origin": record[origin], # not sure if needed, can just join with postgres
#     "id": record[source_id], # not sure if needed, can just join with postgres or use mealeon_id for join
#     "title": record[title], 
#     "ingredients": record[ingredients],
#     "steps": record[steps],
#     "description": record[description],
#     "cuisines": record[relabeled_cuisine]
#     }
# }

### Try mixing in PyVespa
Following documentation [here](https://pyvespa.readthedocs.io/en/latest/getting-started-pyvespa.html)

In [None]:
from vespa.package import (
    ApplicationPackage,
    Field,
    Schema,
    Document,
    HNSW,
    RankProfile,
    Component,
    Parameter,
    FieldSet,
    GlobalPhaseRanking,
    Function,
    DocumentSummary,
    Summary
)

# Vespa application package for the recipe corpus.
#
# One schema ("mealeon2") with text fields for title / description /
# ingredients / steps / cuisines, a "default" fieldset over title + ingredients,
# and three rank profiles (nativeRank, bm25, and the sum of both).
# The commented-out sections are kept as scaffolding for an embedding field and
# semantic / hybrid ranking.
package = ApplicationPackage(
    name="mealeon2",
    schema=[
        Schema(
            name="mealeon2",
            document=Document(
                fields=[
                    # Only used through the `set_language` indexing statement;
                    # it is neither indexed nor returned in summaries.
                    Field(
                        name="language",
                        type="string",
                        indexing=["set_language"],
                        # match=["word"]
                    ),
                    # Identifier kept as an attribute and returned in summaries.
                    # No `bolding` here: per the Vespa schema reference,
                    # highlighting applies to index fields, and this field is
                    # attribute-only, so the flag had no useful effect.
                    Field(
                        name="id",
                        type="string",
                        indexing=["attribute", "summary"],
                        match=["word"],
                    ),
                    # NOTE(review): `match word` presumably indexes the whole
                    # title as a single token, so `title contains '...'` would
                    # only hit exact titles — confirm this is intended.
                    Field(
                        name="title",
                        type="string",
                        indexing=["index", "summary"],
                        index="enable-bm25",
                        match=["word"],
                    ),
                    # Indexed for search but not part of the summary.
                    Field(
                        name="description",
                        type="string",
                        indexing=["index"],
                        index="enable-bm25",
                        match=["word"],
                    ),
                    Field(
                        name="ingredients",
                        type="array<string>",
                        indexing=["index", "attribute"],
                        # attribute="fast-search",
                        index="enable-bm25",
                        # match=["word"],
                    ),
                    Field(
                        name="steps",
                        type="array<string>",
                        indexing=["index", "attribute"],
                        index="enable-bm25",
                    ),
                    Field(
                        name="cuisines",
                        type="array<string>",
                        indexing=["index", "attribute", "summary"],
                        index="enable-bm25",
                        match=["text"],
                    ),
                    # Field(
                    #     name="embedding",
                    #     type="tensor<float>(x[384])",
                    #     indexing=[
                    #         'input title . " " . input body',
                    #         "embed",
                    #         "index",
                    #         "attribute",
                    #     ],
                    #     ann=HNSW(distance_metric="angular"),
                    #     is_document_field=False,
                    # ),
                ]
            ),
            # NOTE(review): the two fields in this fieldset use different match
            # settings (title: word, ingredients: default). Vespa recommends
            # that fields in one fieldset share match settings — confirm.
            fieldsets=[
                FieldSet(
                    name="default",
                    fields=["title", "ingredients"],
                )
            ],
            # Minimal summary class that only returns the document's `id` field.
            document_summaries=[
                DocumentSummary(
                    name="document-summary",
                    summary_fields=[
                        Summary("id")
                    ],
                ),
            ],
            rank_profiles=[
                RankProfile(
                    name="default",
                    first_phase="nativeRank(title, ingredients)",
                ),
                # The first-phase expression is repeated as the `bm25sum`
                # function so it can be referenced by name (the commented-out
                # "fusion" profile below does so).
                RankProfile(
                    name="bm25",
                    inherits="default",
                    first_phase="bm25(title) + bm25(ingredients)",
                    # inputs=[("query(q)", "tensor<float>(x[384])")],
                    functions=[
                        Function(name="bm25sum", expression="bm25(title) + bm25(ingredients)")
                    ],
                ),
                # Sum of the bm25 and nativeRank scores of title and ingredients.
                RankProfile(
                    name="combined",
                    inherits="default",
                    first_phase="bm25(title) + bm25(ingredients) + nativeRank(title) + nativeRank(ingredients)",
                    functions=[
                        Function(name="bm25nativeRank",
                                 expression="bm25(title) + bm25(ingredients) + nativeRank(title) + nativeRank(ingredients)")
                    ],
                ),
                # RankProfile(
                #     name="semantic",
                #     inputs=[("query(q)", "tensor<float>(x[384])")],
                #     first_phase="closeness(field, embedding)",
                # ),
                # RankProfile(
                #     name="fusion",
                #     inherits="bm25",
                #     inputs=[("query(q)", "tensor<float>(x[384])")],
                #     first_phase="closeness(field, embedding)",
                #     global_phase=GlobalPhaseRanking(
                #         expression="reciprocal_rank_fusion(bm25sum, closeness(field, embedding))",
                #         rerank_count=1000,
                #     ),
                # ),
            ],
        )
    ],
    # components=[
    #     Component(
    #         id="e5",
    #         type="hugging-face-embedder",
    #         parameters=[
    #             Parameter(
    #                 "transformer-model",
    #                 {
    #                     "url": "https://github.com/vespa-engine/sample-apps/raw/master/simple-semantic-search/model/e5-small-v2-int8.onnx"
    #                 },
    #             ),
    #             Parameter(
    #                 "tokenizer-model",
    #                 {
    #                     "url": "https://raw.githubusercontent.com/vespa-engine/sample-apps/master/simple-semantic-search/model/tokenizer.json"
    #                 },
    #             ),
    #         ],
    #     )
    # ],
)

In [None]:
# try mixing in PyVespa

from vespa.deployment import VespaDocker

# Deploy the application package through VespaDocker, using port 8182 for the
# application and 19091 for the config server. `app` is the handle used by the
# query cells below.
vespa_docker = VespaDocker(
    port=8182,
    cfgsrv_port=19091,
)
app = vespa_docker.deploy(application_package=package)


Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8182/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8182/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8182/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8182/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8182/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8182/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8182/ApplicationStatus
Application is up!
Finished deployment.


In [None]:
# Feed the exported document file to the Vespa endpoint on localhost:8182 with
# the Vespa CLI (shell escape; requires the `vespa` binary on PATH).
!vespa feed ../data/raw/postgres_mealeon_vespa.json --target http://localhost:8182

{
  "feeder.operation.count": 13115,
  "feeder.seconds": 15.250,
  "feeder.ok.count": 13115,
  "feeder.ok.rate": 859.992,
  "feeder.error.count": 0,
  "feeder.inflight.count": 0,
  "http.request.count": 13115,
  "http.request.bytes": 11253214,
  "http.request.MBps": 0.738,
  "http.exception.count": 0,
  "http.response.count": 13115,
  "http.response.bytes": 3041794,
  "http.response.MBps": 0.199,
  "http.response.error.count": 0,
  "http.response.latency.millis.min": 10,
  "http.response.latency.millis.avg": 48,
  "http.response.latency.millis.max": 446,
  "http.response.code.counts": {
    "200": 13115
  }
}


In [None]:
from vespa.io import VespaResponse, VespaQueryResponse

# query should be recipe name?
    # WHERE title !contains {query}
# cuisine name should be in the WHERE filter clause of YQL
    # AND WHERE cuisine NOT IN {cuisines}
# how to penalize similar title?

# start with plain keyword search

# Plain keyword search: match the recipe name against `title` and rank the
# hits with the "bm25" rank profile. The YQL string is built up front so the
# session block only contains the request itself.
query = "Buffalo Wings"
title_match_yql = (
    f"select * from sources mealeon2 where (title contains '{query}') limit 10"
)

with app.syncio(connections=1) as session:
    response: VespaQueryResponse = session.query(
        yql=title_match_yql,
        query=query,
        ranking="bm25",
        # body={"input.query(q)": f"embed({query})"},
    )
    # Stop here if the request did not succeed.
    assert response.is_successful()

In [None]:
# Show the raw hit dictionaries returned by the keyword query above.
print(response.hits)

[{'id': 'id:mealeon2:mealeon2::Epicurious-1c0fe6496b26a8b7bdf549e9ed70bc9802d7df404b89f1d3de6d01a1a44d31f8', 'relevance': 9.076123029856374, 'source': 'mealeon2_content', 'fields': {'sddocname': 'mealeon2', 'id': 'Epicurious-1c0fe6496b26a8b7bdf549e9ed70bc9802d7df404b89f1d3de6d01a1a44d31f8', 'documentid': 'id:mealeon2:mealeon2::Epicurious-1c0fe6496b26a8b7bdf549e9ed70bc9802d7df404b89f1d3de6d01a1a44d31f8', 'title': 'Buffalo Wings', 'cuisines': ['Gourmet', 'Super Bowl', 'Hot Sauce', 'Condiment', 'American', 'Chicken Wing', 'Chicken', 'Poultry', 'Yogurt', 'Dairy', 'Main', 'Starter', 'Dinner', 'Lunch', 'Gluten Free', 'Nut Free', 'Keto', 'Deep Fry']}}]


In [None]:
# NOTE(review): `next_resp` is not assigned anywhere in the visible part of
# this notebook, and the recorded output of this cell is a NameError.
# Presumably it is left over from an earlier HTTP-based experiment — confirm,
# then either define it before this cell or remove the cell.
next_resp_json = next_resp.json()
next_resp_json

NameError: name 'next_resp' is not defined

In [None]:
# actual results
# NOTE(review): `resp_json` is not assigned in the visible part of this
# notebook; presumably it is a decoded query response from an earlier cell —
# confirm it exists before this cell runs.
results = resp_json['hits']
results

In [None]:
# | hide
# Import nbdev in this cell so the export step runs on its own; the import
# cell at the top of the notebook does not bring it in.
import nbdev

nbdev.nbdev_export()