In [11]:
import ast
import json

import pandas as pd

In [12]:
# Convert a stringified ingredient list to a set, e.g. "['a','b']" -> {'a', 'b'}.
def to_set(x):
    """Parse a serialized list of ingredients into a set.

    Three encodings are accepted:

    * a JSON string that wraps a Python list literal, e.g. ``"\\"['a', 'b']\\""``
      (the double-encoded form);
    * a bare Python list literal, e.g. ``"['a', 'b']"``;
    * a JSON array, e.g. ``'["a", "b"]'``.

    Parameters
    ----------
    x : str
        The serialized list.

    Returns
    -------
    set
        The unique elements of the decoded list.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is neither valid
        JSON nor a valid Python literal.
    """
    try:
        decoded = json.loads(x)
    except json.JSONDecodeError:
        # Single-quoted Python literals are not valid JSON; parse them as-is.
        decoded = x

    # A JSON array is already a list; only strings need a second parsing pass.
    if isinstance(decoded, str):
        decoded = ast.literal_eval(decoded)
    return set(decoded)

In [13]:
# Alternative input path (v1), currently disabled:
# pred_df = pd.read_csv("../data/SpacyProcessing/v1/foodkg_spacy_processed.csv")
#
# Load the predictions and rename "processed_ingredients" to "pred_ingredients"
# so the prediction column is clearly labelled and does not clash on merge.
pred_df = (
    pd.read_csv("../data/SpacyProcessing/foodkg_spacy_processed.csv")
    .rename(columns={"processed_ingredients": "pred_ingredients"})
)
pred_df.head(5)

Unnamed: 0,recipe_id,original_ingredients,pred_ingredients
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","['brown sugar', 'milk', 'vanilla', 'nut', 'but..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","['beef', 'chicken breast', 'cream mushroom sou..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","['corn', 'cream cheese', 'butter', 'garlic', '..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","['chicken', 'chicken gravy', 'cream mushroom s..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","['peanut butter', 'graham cracker crumb', 'but..."


In [14]:
# Load the test file and rename its columns for consistency with pred_df,
# and so the names do not clash on merge.
test_df = (
    pd.read_csv("../data/test/TEST_FoodKG_ingredients_normalized.csv")
    .rename(
        columns={
            "id": "recipe_id",
            "ingredients": "original_ingredients",
            "ingredients_normalized": "ingredients_test",
        }
    )
)
test_df.head(5)

Unnamed: 0,recipe_id,original_ingredients,ingredients_test
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","""['brown sugar', 'evaporated milk', 'vanilla',..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","""['chipped beef', 'chicken breast', 'cream of ..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","""['corn', 'cream cheese', 'butter', 'garlic po..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","""['whole chicken', 'chicken gravy', 'cream of ..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","""['peanut butter', 'graham cracker', 'butter',..."


In [15]:
# Join the test labels with the predictions on recipe_id. test_df is the left
# frame so the result keeps its order; because this is an inner join, recipes
# that are missing from either frame are dropped.
merged_df = pd.merge(
    test_df[["recipe_id", "ingredients_test"]],
    pred_df,
    how="inner",
    on="recipe_id",
)
merged_df.head()

Unnamed: 0,recipe_id,ingredients_test,original_ingredients,pred_ingredients
0,0,"""['brown sugar', 'evaporated milk', 'vanilla',...","['1 c. firmly packed brown sugar', '1/2 c. eva...","['brown sugar', 'milk', 'vanilla', 'nut', 'but..."
1,1,"""['chipped beef', 'chicken breast', 'cream of ...","['1 small jar chipped beef, cut up', '4 boned ...","['beef', 'chicken breast', 'cream mushroom sou..."
2,2,"""['corn', 'cream cheese', 'butter', 'garlic po...","['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","['corn', 'cream cheese', 'butter', 'garlic', '..."
3,3,"""['whole chicken', 'chicken gravy', 'cream of ...","['1 large whole chicken', '2 (10 1/2 oz.) cans...","['chicken', 'chicken gravy', 'cream mushroom s..."
4,4,"""['peanut butter', 'graham cracker', 'butter',...","['1 c. peanut butter', '3/4 c. graham cracker ...","['peanut butter', 'graham cracker crumb', 'but..."


In [16]:
# Parse the stringified lists into sets: predictions are plain Python list
# literals, while the test column goes through to_set.
pred_sets = [
    set(items) for items in map(ast.literal_eval, merged_df["pred_ingredients"])
]
test_sets = list(map(to_set, merged_df["ingredients_test"]))

# Assemble the evaluation frame: identifiers, raw ingredients, and both sets.
new_df = pd.DataFrame(
    {
        "recipe_id": merged_df["recipe_id"],
        "original_ingredients": merged_df["original_ingredients"],
        "pred_set": pred_sets,
        "test_set": test_sets,
    }
)
new_df.head()

Unnamed: 0,recipe_id,original_ingredients,pred_set,test_set
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{margarine, rice biscuit, nut, vanilla, milk, ...","{margarine, rice biscuit, pecan, vanilla, evap..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{cream mushroom soup, sour cream, beef, chicke...","{cream of mushroom soup, sour cream, chipped b..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{corn, garlic, cream cheese, pepper, salt, but...","{corn, cream cheese, salt, butter, garlic powd..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{chicken, cheese, chicken gravy, cream mushroo...","{cheese, chicken gravy, cream of mushroom soup..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{powdered sugar, chocolate chip, graham cracke...","{powdered sugar, chocolate chip, graham cracke..."


In [17]:
def precision_recall_f1_per_row(row):
    """Score one row's predicted set against its test set.

    Parameters
    ----------
    row : mapping-like
        Must provide ``"pred_set"`` and ``"test_set"`` entries holding sets.

    Returns
    -------
    pd.Series
        Entries ``precision``, ``recall`` and ``f1``. Each metric is 0.0 when
        its denominator would be zero.
    """
    predicted, expected = row["pred_set"], row["test_set"]
    true_positives = len(predicted & expected)

    # len(predicted) equals tp + fp, and len(expected) equals tp + fn.
    precision = 0.0
    if len(predicted) > 0:
        precision = true_positives / len(predicted)

    recall = 0.0
    if len(expected) > 0:
        recall = true_positives / len(expected)

    f1 = 0.0
    pr_sum = precision + recall
    if pr_sum > 0:
        f1 = 2 * precision * recall / pr_sum

    return pd.Series({"precision": precision, "recall": recall, "f1": f1})


def calculate_overlap(row):
    """Return the intersection size divided by the size of the larger set.

    Reads ``row["pred_set"]`` and ``row["test_set"]``. Returns 0 when both
    sets are empty.
    """
    predicted, expected = row["pred_set"], row["test_set"]
    larger_size = max(len(predicted), len(expected))
    if larger_size == 0:
        return 0
    return len(predicted & expected) / larger_size


def calculate_jaccard(row):
    """Return the Jaccard similarity from precomputed sizes.

    Reads ``row['intersection_size']`` and ``row['union_size']``, so both must
    already be present on the row. Returns 0 when the union is empty.
    """
    shared, total = row['intersection_size'], row['union_size']
    return shared / total if total != 0 else 0

In [18]:
# Ingredients found on only one side (element-wise set difference).
new_df["pred_only"] = new_df["pred_set"] - new_df["test_set"]
new_df["test_only"] = new_df["test_set"] - new_df["pred_set"]

# Sizes of the intersection and the union; calculate_jaccard reads these
# columns, so they must be filled in before it runs.
new_df["intersection_size"] = [
    len(pred & test) for pred, test in zip(new_df["pred_set"], new_df["test_set"])
]
new_df["union_size"] = [
    len(pred | test) for pred, test in zip(new_df["pred_set"], new_df["test_set"])
]

# Row-wise similarity scores.
new_df["overlap"] = new_df.apply(calculate_overlap, axis=1)  # intersection / larger set
new_df["jaccard"] = new_df.apply(calculate_jaccard, axis=1)  # intersection / union
new_df[["precision", "recall", "f1"]] = new_df.apply(
    precision_recall_f1_per_row, axis=1
)

In [19]:
# Display up to the first 100 rows, including every derived metric column,
# for a manual spot check of the per-recipe differences.
new_df.head(100)

Unnamed: 0,recipe_id,original_ingredients,pred_set,test_set,pred_only,test_only,intersection_size,union_size,overlap,jaccard,precision,recall,f1
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{margarine, rice biscuit, nut, vanilla, milk, ...","{margarine, rice biscuit, pecan, vanilla, evap...","{milk, nut}","{pecan, evaporated milk}",5,9,0.714286,0.555556,0.714286,0.714286,0.714286
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{cream mushroom soup, sour cream, beef, chicke...","{cream of mushroom soup, sour cream, chipped b...","{cream mushroom soup, beef}","{cream of mushroom soup, chipped beef}",2,6,0.500000,0.333333,0.500000,0.500000,0.500000
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{corn, garlic, cream cheese, pepper, salt, but...","{corn, cream cheese, salt, butter, garlic powd...","{pepper, garlic}","{garlic powder, black pepper}",4,8,0.666667,0.500000,0.666667,0.666667,0.666667
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{chicken, cheese, chicken gravy, cream mushroo...","{cheese, chicken gravy, cream of mushroom soup...","{chicken, cream mushroom soup}","{cream of mushroom soup, whole chicken}",3,7,0.600000,0.428571,0.600000,0.600000,0.600000
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{powdered sugar, chocolate chip, graham cracke...","{powdered sugar, chocolate chip, graham cracke...",{graham cracker crumb},{graham cracker},4,6,0.800000,0.666667,0.800000,0.800000,0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,"['3 (1 lb.) cans pork and beans', '1/2 c. bell...","{catsup, onion, oil, bean, bell pepper, beef, ...","{catsup, onion, oil, bell pepper, black pepper...","{pork, bean, beef}","{pork and beans, ground beef}",7,12,0.700000,0.583333,0.700000,0.777778,0.736842
96,96,"['2 c. diced cooked chicken', '2 Tbsp. shorten...","{chicken, ginger, catsup, vinegar, chicken bou...","{chicken, ginger, catsup, vinegar, soy sauce, ...","{soy, chicken bouillon cube}","{chicken bouillon, soy sauce}",12,16,0.857143,0.750000,0.857143,0.857143,0.857143
97,97,"['8 to 10 juicy oranges, peeled and diced', '1...","{orange juice, pecan, juicy orange, cherry, mo...","{coconut, orange juice, pecan, orange, cherry,...","{moist coconut, juicy orange}","{coconut, orange}",4,8,0.666667,0.500000,0.666667,0.666667,0.666667
98,98,"['1 c. creamy peanut butter', '1 c. sugar', '1...","{egg, creamy peanut butter, sugar}","{egg, peanut butter, sugar}",{creamy peanut butter},{peanut butter},2,4,0.666667,0.500000,0.666667,0.666667,0.666667


In [20]:
# Mean of each per-recipe metric: (label, column, description).
summary_rows = (
    ("Overlap:", "overlap", "Intersection divided by the size of the larger set"),
    ("Jaccard:", "jaccard", "Intersection divided by the size of the union"),
    ("Precision:", "precision", "Fraction of predicted ingredients that are correct"),
    ("Recall:", "recall", "Fraction of actual ingredients that were predicted"),
    ("F1:", "f1", "Harmonic mean of precision and recall"),
)
for label, column, description in summary_rows:
    # Labels are left-padded to 11 characters so the values line up.
    print(f"{label:<11}{new_df[column].mean():.4f} ({description})")

Overlap:   0.7075 (Intersection divided by the size of the larger set)
Jaccard:   0.6054 (Intersection divided by the size of the union)
Precision: 0.7321 (Fraction of predicted ingredients that are correct)
Recall:    0.7202 (Fraction of actual ingredients that were predicted)
F1:        0.7235 (Harmonic mean of precision and recall)


Previous stats (for comparison with the results above)

Overlap:   0.6933 (Intersection divided by the size of the larger set)

Jaccard:   0.5908 (Intersection divided by the size of the union)

Precision: 0.7214 (Fraction of predicted ingredients that are correct)

Recall:    0.7027 (Fraction of actual ingredients that were predicted)

F1:        0.7096 (Harmonic mean of precision and recall)
