In [1]:
import ast
import json

import pandas as pd

In [2]:
def to_set(x):
    """Parse a serialized list of ingredients into a set.

    Three encodings are accepted:

    * a JSON string whose content is a Python list literal, e.g.
      ``"\"['egg', 'sugar']\""`` (the double-encoded form this function
      originally required);
    * a bare Python list literal, e.g. ``"['egg', 'sugar']"``;
    * a JSON array, e.g. ``'["egg", "sugar"]'``.

    Args:
        x: The serialized cell value (a string).

    Returns:
        A set holding the distinct elements of the decoded list.

    Raises:
        TypeError: If ``x`` is not something ``json.loads`` accepts
            (for example a missing value read as a float NaN).
        ValueError: If ``x`` decodes to something that is neither a string
            nor a list, or if the inner literal is malformed.
    """
    try:
        decoded = json.loads(x)
    except json.JSONDecodeError:
        # Not valid JSON (single-quoted list literal): let literal_eval
        # interpret the raw text directly.
        decoded = x

    if isinstance(decoded, str):
        # Either the JSON-unwrapped payload or the raw fallback text.
        return set(ast.literal_eval(decoded))
    if isinstance(decoded, list):
        # The value was a genuine JSON array; nothing left to evaluate.
        return set(decoded)
    raise ValueError(f"cannot interpret {x!r} as a serialized list")

In [3]:
# Load the output CSV and the TEST CSV.
output_df = pd.read_csv("sample/FoodKG_ingredients_normalized.csv")
test_df = pd.read_csv("TEST/TEST_FoodKG_ingredients_normalized.csv")

# Both files carry an "ingredients_normalized" column; give the TEST one a
# distinct name so the join does not produce suffixed duplicates.
test_df_renamed = test_df.rename(
    columns={"ingredients_normalized": "ingredients_test"}
)

# Inner join on "id" with the TEST frame on the left, so the result follows
# the order of the TEST ids.
merged_df = pd.merge(
    test_df_renamed[["id", "ingredients_test"]],
    output_df,
    how="inner",
    on="id",
)
merged_df.head()

Unnamed: 0,id,ingredients_test,ingredients,ingredients_normalized
0,0,"""['brown sugar', 'evaporated milk', 'vanilla',...","['1 c. firmly packed brown sugar', '1/2 c. eva...","""['brown sugar', 'evaporated milk', 'vanilla',..."
1,1,"""['chipped beef', 'chicken breast', 'cream of ...","['1 small jar chipped beef, cut up', '4 boned ...","""['chipped beef', 'chicken breast', 'cream of ..."
2,2,"""['corn', 'cream cheese', 'butter', 'garlic po...","['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","""['corn', 'cream cheese', 'butter', 'garlic po..."
3,3,"""['whole chicken', 'chicken gravy', 'cream of ...","['1 large whole chicken', '2 (10 1/2 oz.) cans...","""['whole chicken', 'chicken gravy', 'cream of ..."
4,4,"""['peanut butter', 'graham cracker', 'butter',...","['1 c. peanut butter', '3/4 c. graham cracker ...","""['peanut butter', 'graham cracker crumb', 'bu..."


In [4]:
# Keep the id and raw ingredient text, and parse both serialized ingredient
# columns into Python sets so they can be compared with set operations.
new_df = merged_df[["id", "ingredients"]].assign(
    output_set=merged_df["ingredients_normalized"].apply(to_set),
    test_set=merged_df["ingredients_test"].apply(to_set),
)
new_df.head()

Unnamed: 0,id,ingredients,output_set,test_set
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{vanilla, pecan, margarine, rice biscuit, brow...","{vanilla, pecan, margarine, rice biscuit, brow..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{chicken breast, cream of mushroom soup, sour ...","{chicken breast, cream of mushroom soup, sour ..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{corn, salt, black pepper, garlic powder, butt...","{corn, salt, black pepper, garlic powder, butt..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{cheese, cream of mushroom soup, whole chicken...","{cheese, cream of mushroom soup, whole chicken..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{graham cracker crumb, powdered sugar, chocola...","{graham cracker, powdered sugar, chocolate chi..."


In [5]:
def precision_recall_f1_per_row(row):
    """Score one row's ``output_set`` against its ``test_set``.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) exposing the sets
            under the keys ``"output_set"`` and ``"test_set"``.

    Returns:
        A ``pd.Series`` with the entries ``precision``, ``recall`` and ``f1``.
        Any ratio whose denominator would be zero is reported as ``0.0``.
    """
    predicted, expected = row["output_set"], row["test_set"]

    hits = len(predicted & expected)
    predicted_total = hits + len(predicted - expected)  # tp + fp
    expected_total = hits + len(expected - predicted)   # tp + fn

    precision = 0.0
    if predicted_total > 0:
        precision = hits / predicted_total

    recall = 0.0
    if expected_total > 0:
        recall = hits / expected_total

    f1 = 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)

    return pd.Series({"precision": precision, "recall": recall, "f1": f1})

In [6]:
# Elements present in output_set but missing from test_set (one-sided difference).
new_df["difference"] = new_df["output_set"] - new_df["test_set"]

# Number of elements shared by both sets.
new_df["intersection_size"] = new_df.apply(
    lambda row: len(row["output_set"].intersection(row["test_set"])), axis=1
)

# Number of distinct elements across both sets.
new_df["union_size"] = new_df.apply(
    lambda row: len(row["output_set"].union(row["test_set"])), axis=1
)

# accuracy = shared elements / size of the larger set (0 when both are empty).
new_df["accuracy"] = new_df.apply(
    lambda row: (
        row["intersection_size"]
        / max(len(row["output_set"]), len(row["test_set"]))
        if row["output_set"] or row["test_set"]
        else 0
    ),
    axis=1,
)

# Jaccard similarity = shared elements / distinct elements (0 for an empty union).
new_df["jaccard"] = new_df.apply(
    lambda row: (
        row["intersection_size"] / row["union_size"]
        if row["union_size"] > 0
        else 0
    ),
    axis=1,
)

# Per-row precision, recall and F1, expanded into three columns.
new_df[["precision", "recall", "f1"]] = new_df.apply(
    precision_recall_f1_per_row, axis=1
)

In [7]:
# Display the first 100 rows of new_df, including the metric columns.
new_df.head(n=100)

Unnamed: 0,id,ingredients,output_set,test_set,difference,intersection_size,union_size,accuracy,jaccard,precision,recall,f1
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{vanilla, pecan, margarine, rice biscuit, brow...","{vanilla, pecan, margarine, rice biscuit, brow...",{},7,7,1.000000,1.000000,1.000000,1.000000,1.000000
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{chicken breast, cream of mushroom soup, sour ...","{chicken breast, cream of mushroom soup, sour ...",{},4,4,1.000000,1.000000,1.000000,1.000000,1.000000
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{corn, salt, black pepper, garlic powder, butt...","{corn, salt, black pepper, garlic powder, butt...",{},6,6,1.000000,1.000000,1.000000,1.000000,1.000000
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{cheese, cream of mushroom soup, whole chicken...","{cheese, cream of mushroom soup, whole chicken...",{},5,5,1.000000,1.000000,1.000000,1.000000,1.000000
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{graham cracker crumb, powdered sugar, chocola...","{graham cracker, powdered sugar, chocolate chi...",{graham cracker crumb},4,6,0.800000,0.666667,0.800000,0.800000,0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,"['3 (1 lb.) cans pork and beans', '1/2 c. bell...","{oil, bell pepper, pork and beans, catsup, bro...","{oil, bell pepper, pork and beans, catsup, bro...",{},9,9,1.000000,1.000000,1.000000,1.000000,1.000000
96,96,"['2 c. diced cooked chicken', '2 Tbsp. shorten...","{shortening, water, soy sauce, carrot, brown s...","{shortening, water, chicken bouillon, soy sauc...",{chicken bouillon cube},13,15,0.928571,0.866667,0.928571,0.928571,0.928571
97,97,"['8 to 10 juicy oranges, peeled and diced', '1...","{pecan, orange, coconut, orange juice, sugar, ...","{pecan, orange, coconut, orange juice, sugar, ...",{},6,6,1.000000,1.000000,1.000000,1.000000,1.000000
98,98,"['1 c. creamy peanut butter', '1 c. sugar', '1...","{sugar, peanut butter, egg}","{sugar, peanut butter, egg}",{},3,3,1.000000,1.000000,1.000000,1.000000,1.000000


In [8]:
# Report the mean of each per-row metric, one line per metric.
print(
    "\n".join(
        f"{prefix}{new_df[column].mean():.4f}"
        for prefix, column in (
            ("Accuracy:  ", "accuracy"),
            ("Jaccard:   ", "jaccard"),
            ("Precision: ", "precision"),
            ("Recall:    ", "recall"),
            ("F1:        ", "f1"),
        )
    )
)

Accuracy:  0.8669
Jaccard:   0.8056
Precision: 0.8683
Recall:    0.8733
F1:        0.8705
