In [9]:
import ast
import json

import pandas as pd

In [10]:
def to_set(x):
    """Convert a serialized list cell into a set of its elements.

    Accepted encodings:

    * a JSON string wrapping a Python list literal, i.e. the cell text is
      ``"['a', 'b']"`` including the double quotes (two decoding steps);
    * a bare Python list literal, e.g. ``['a', 'b']``;
    * a JSON array, e.g. ``["a", "b"]``;
    * an already-decoded list / tuple / set, which is simply copied to a set.

    Parameters
    ----------
    x : str, bytes or iterable
        The cell value to decode.

    Returns
    -------
    set
        The distinct elements of the decoded collection.

    Raises
    ------
    ValueError
        If the text is neither valid JSON nor a valid Python literal, or if
        it decodes to a single string instead of a collection.
    TypeError
        If the decoded value is not iterable (e.g. a missing value / NaN).
    """
    value = x

    # First layer: JSON. A failure here only means the text is not JSON
    # (e.g. a single-quoted Python literal), so fall through to literal_eval.
    if isinstance(value, (str, bytes, bytearray)):
        try:
            value = json.loads(value)
        except json.JSONDecodeError:
            pass

    # Second layer: the JSON payload (or the raw text) is a Python literal.
    if isinstance(value, str):
        value = ast.literal_eval(value)

    # set("abc") would silently yield {"a", "b", "c"}; refuse scalar strings
    # instead of producing a set of characters.
    if isinstance(value, (str, bytes, bytearray)):
        raise ValueError(
            f"expected a collection of items, got {type(value).__name__}: {value!r}"
        )

    return set(value)

In [11]:
# Load both CSV files: the "sample" output and the "TEST" file it is compared against.
output_df = pd.read_csv("sample/FoodKG_ingredients_normalized.csv")
test_df = pd.read_csv("TEST/TEST_FoodKG_ingredients_normalized.csv")

# Both files have an "ingredients_normalized" column; give the test one its
# own name so the two versions sit side by side after the merge.
test_df_renamed = test_df.rename(
    {"ingredients_normalized": "ingredients_test"}, axis="columns"
)

# Inner join on "id" with the test frame on the left, so rows follow its order.
merged_df = pd.merge(
    test_df_renamed[["id", "ingredients_test"]],
    output_df,
    how="inner",
    on="id",
)
merged_df.head()

Unnamed: 0,id,ingredients_test,title,ingredients,ingredients_normalized
0,0,"""['brown sugar', 'evaporated milk', 'vanilla',...",No-Bake Nut Cookies,"['1 c. firmly packed brown sugar', '1/2 c. eva...","""['brown sugar', 'evaporated milk', 'vanilla',..."
1,1,"""['chipped beef', 'chicken breast', 'cream of ...",Jewell Ball'S Chicken,"['1 small jar chipped beef, cut up', '4 boned ...","""['chipped beef', 'chicken breast', 'cream of ..."
2,2,"""['corn', 'cream cheese', 'butter', 'garlic po...",Creamy Corn,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","""['corn', 'cream cheese', 'butter', 'garlic po..."
3,3,"""['whole chicken', 'chicken gravy', 'cream of ...",Chicken Funny,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","""['chicken', 'chicken gravy', 'mushroom soup',..."
4,4,"""['peanut butter', 'graham cracker', 'butter',...",Reeses Cups(Candy),"['1 c. peanut butter', '3/4 c. graham cracker ...","""['peanut butter', 'graham cracker crumbs', 'b..."


In [12]:
# Keep "id" and "ingredients" as they are, and decode the two serialized
# normalized-ingredient columns into Python sets for comparison.
new_df = merged_df[["id", "ingredients"]].assign(
    output_set=merged_df["ingredients_normalized"].apply(to_set),
    test_set=merged_df["ingredients_test"].apply(to_set),
)
new_df.head()

Unnamed: 0,id,ingredients,output_set,test_set
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{butter, shredded rice biscuit, brown sugar, p...","{butter, brown sugar, pecan, vanilla, evaporat..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{cream of mushroom soup, chipped beef, sour cr...","{cream of mushroom soup, chipped beef, sour cr..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{corn, butter, garlic powder, salt, pepper, cr...","{corn, butter, black pepper, garlic powder, sa..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{mushroom soup, chicken gravy, stuffing, chick...","{whole chicken, chicken gravy, stuffing, cream..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{butter, powdered sugar, peanut butter, chocol...","{butter, graham cracker, powdered sugar, choco..."


In [13]:
def precision_recall_f1_per_row(row):
    """Score one row's predicted set against its reference set.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row["output_set"]`` (predicted items) and
        ``row["test_set"]`` (reference items), both sets.

    Returns
    -------
    pandas.Series
        Entries ``precision``, ``recall`` and ``f1``. Any score whose
        denominator would be zero is reported as ``0.0``.
    """
    predicted, expected = row["output_set"], row["test_set"]

    hits = len(predicted & expected)        # true positives
    spurious = len(predicted - expected)    # false positives
    missed = len(expected - predicted)      # false negatives

    # Start from the zero-denominator fallback and overwrite where defined.
    precision = recall = f1 = 0.0
    if hits + spurious > 0:
        precision = hits / (hits + spurious)
    if hits + missed > 0:
        recall = hits / (hits + missed)
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)

    return pd.Series([precision, recall, f1], index=["precision", "recall", "f1"])

In [14]:
# Pair each row's predicted set with its reference set once. Every column
# below is derived from these pairs in a plain Python pass, instead of a
# separate DataFrame.apply(axis=1) per column that recomputes the same
# intersection and union.
set_pairs = list(zip(new_df["output_set"], new_df["test_set"]))

# Elements that differ
new_df["pred_only"] = [pred - true for pred, true in set_pairs]
new_df["true_only"] = [true - pred for pred, true in set_pairs]

# intersection count
new_df["intersection_size"] = [len(pred & true) for pred, true in set_pairs]

# union count
new_df["union_size"] = [len(pred | true) for pred, true in set_pairs]

# overlap = intersection / larger_set (0.0 when both sets are empty)
new_df["overlap"] = [
    shared / max(len(pred), len(true)) if (pred or true) else 0.0
    for shared, (pred, true) in zip(new_df["intersection_size"], set_pairs)
]

# Jaccard similarity = intersection / union (0.0 when both sets are empty)
new_df["jaccard"] = [
    shared / total if total > 0 else 0.0
    for shared, total in zip(new_df["intersection_size"], new_df["union_size"])
]

# Per-row precision / recall / F1; the returned Series expands into three columns.
new_df[["precision", "recall", "f1"]] = new_df.apply(precision_recall_f1_per_row, axis=1)

In [15]:
# Display the first 100 rows, now including the per-row comparison columns.
new_df.iloc[:100]

Unnamed: 0,id,ingredients,output_set,test_set,pred_only,true_only,intersection_size,union_size,overlap,jaccard,precision,recall,f1
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{butter, shredded rice biscuit, brown sugar, p...","{butter, brown sugar, pecan, vanilla, evaporat...",{shredded rice biscuit},"{margarine, rice biscuit}",5,8,0.714286,0.625000,0.833333,0.714286,0.769231
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{cream of mushroom soup, chipped beef, sour cr...","{cream of mushroom soup, chipped beef, sour cr...",{},{},4,4,1.000000,1.000000,1.000000,1.000000,1.000000
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{corn, butter, garlic powder, salt, pepper, cr...","{corn, butter, black pepper, garlic powder, sa...",{pepper},{black pepper},5,7,0.833333,0.714286,0.833333,0.833333,0.833333
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{mushroom soup, chicken gravy, stuffing, chick...","{whole chicken, chicken gravy, stuffing, cream...","{chicken, mushroom soup}","{whole chicken, cream of mushroom soup}",3,7,0.600000,0.428571,0.600000,0.600000,0.600000
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{butter, powdered sugar, peanut butter, chocol...","{butter, graham cracker, powdered sugar, choco...","{chocolate chips, graham cracker crumbs}","{chocolate chip, graham cracker}",3,7,0.600000,0.428571,0.600000,0.600000,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,"['3 (1 lb.) cans pork and beans', '1/2 c. bell...","{onion, oil, brown sugar, black pepper, catsup...","{onion, oil, brown sugar, black pepper, catsup...",{},{},9,9,1.000000,1.000000,1.000000,1.000000,1.000000
96,96,"['2 c. diced cooked chicken', '2 Tbsp. shorten...","{onion, vinegar, soy sauce, shortening, cornst...","{onion, vinegar, soy sauce, shortening, cornst...",{},{},14,14,1.000000,1.000000,1.000000,1.000000,1.000000
97,97,"['8 to 10 juicy oranges, peeled and diced', '1...","{orange, orange juice, sugar, coconut, pecan, ...","{orange, orange juice, sugar, coconut, pecan, ...",{},{},6,6,1.000000,1.000000,1.000000,1.000000,1.000000
98,98,"['1 c. creamy peanut butter', '1 c. sugar', '1...","{egg, peanut butter, sugar}","{egg, peanut butter, sugar}",{},{},3,3,1.000000,1.000000,1.000000,1.000000,1.000000


In [16]:
# Report the mean of each per-row metric over all rows of new_df,
# one "label value" line per metric, formatted to four decimals.
print(
    "\n".join(
        f"{label}{new_df[column].mean():.4f}"
        for label, column in [
            ("Overlap:  ", "overlap"),
            ("Jaccard:   ", "jaccard"),
            ("Precision: ", "precision"),
            ("Recall:    ", "recall"),
            ("F1:        ", "f1"),
        ]
    )
)

Overlap:  0.8625
Jaccard:   0.7944
Precision: 0.8729
Recall:    0.8637
F1:        0.8679
