In [1]:
import ast
import json

import pandas as pd

In [2]:
# convert "['a','b']" to a set {'a', 'b'}
def to_set(x):
    """Parse a serialized list of ingredients into a ``set``.

    Three encodings are accepted:

    * a JSON string that wraps a Python list literal (double-encoded),
      e.g. ``"\"['a', 'b']\""``;
    * a bare Python list literal, e.g. ``"['a', 'b']"``;
    * a plain JSON array, e.g. ``'["a", "b"]'``.

    Parameters
    ----------
    x : str
        Serialized list in one of the encodings above.

    Returns
    -------
    set
        The distinct elements of the decoded list.

    Raises
    ------
    TypeError
        If ``x`` is not something ``json.loads`` can accept (e.g. a float NaN
        from a missing CSV cell).
    ValueError, SyntaxError
        If the text cannot be evaluated as a Python literal.
    """
    try:
        decoded = json.loads(x)
    except json.JSONDecodeError:
        # Not valid JSON (e.g. single-quoted items): treat the raw text as a
        # Python literal instead of failing outright.
        decoded = x
    if isinstance(decoded, str):
        # Double-encoded case: the JSON layer only wrapped a Python literal.
        decoded = ast.literal_eval(decoded)
    return set(decoded)

In [3]:
# Load the predictions and the reference (test) ingredient lists
pred_df = pd.read_csv("data/output/processed_ingredients.csv")
test_df = pd.read_csv("data/test/TEST_FoodKG_ingredients_normalized.csv")
# Both files have an "ingredients_normalized" column; give the reference one
# a distinct name so the merge does not produce suffixed duplicates
test_df_renamed = test_df.rename(columns={"ingredients_normalized": "ingredients_test"})
# Inner join on "id" with the test frame on the left, so only ids present in
# both files are kept and rows follow the test frame's key order
merged_df = pd.merge(
    test_df_renamed[["id", "ingredients_test"]],
    pred_df,
    on="id",
    how="inner",
)
merged_df.head()

Unnamed: 0,id,ingredients_test,title,ingredients,ingredients_normalized
0,0,"""['brown sugar', 'evaporated milk', 'vanilla',...",No-Bake Nut Cookies,"['1 c. firmly packed brown sugar', '1/2 c. eva...","""['brown sugar', 'evaporated milk', 'vanilla',..."
1,1,"""['chipped beef', 'chicken breast', 'cream of ...",Jewell Ball'S Chicken,"['1 small jar chipped beef, cut up', '4 boned ...","""['chipped beef', 'chicken breast', 'cream of ..."
2,2,"""['corn', 'cream cheese', 'butter', 'garlic po...",Creamy Corn,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","""['corn', 'cream cheese', 'butter', 'garlic po..."
3,3,"""['whole chicken', 'chicken gravy', 'cream of ...",Chicken Funny,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","""['chicken', 'chicken gravy', 'mushroom soup',..."
4,4,"""['peanut butter', 'graham cracker', 'butter',...",Reeses Cups(Candy),"['1 c. peanut butter', '3/4 c. graham cracker ...","""['peanut butter', 'graham cracker crumb', 'bu..."


In [4]:
# Decode the serialized ingredient lists of every merged row into sets
pred_sets = list(map(to_set, merged_df["ingredients_normalized"]))
test_sets = list(map(to_set, merged_df["ingredients_test"]))

# Keep the id and raw ingredient text, and attach the two parsed sets per row
new_df = merged_df[["id", "ingredients"]].assign(
    pred_set=pred_sets,
    test_set=test_sets,
)
new_df.head()

Unnamed: 0,id,ingredients,pred_set,test_set
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{pecan, butter, shredded rice biscuit, brown s...","{pecan, margarine, butter, rice biscuit, brown..."
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{chipped beef, cream of mushroom soup, sour cr...","{chipped beef, cream of mushroom soup, sour cr..."
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{salt, pepper, corn, butter, garlic powder, cr...","{black pepper, salt, corn, butter, garlic powd..."
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{cheese, stuffing, mushroom soup, chicken, chi...","{cheese, stuffing, cream of mushroom soup, who..."
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{peanut butter, butter, chocolate chip, graham...","{graham cracker, peanut butter, butter, chocol..."


In [5]:
def precision_recall_f1_per_row(row):
    """Compute set-based precision, recall and F1 for a single row.

    Parameters
    ----------
    row : mapping
        Must provide ``"pred_set"`` (predicted items) and ``"test_set"``
        (reference items), both sets.

    Returns
    -------
    pandas.Series
        Entries ``precision``, ``recall`` and ``f1``. Any ratio whose
        denominator is zero is reported as ``0.0``.
    """
    predicted = row["pred_set"]
    expected = row["test_set"]

    hits = len(predicted & expected)                 # true positives
    n_predicted = hits + len(predicted - expected)   # true + false positives
    n_expected = hits + len(expected - predicted)    # true positives + false negatives

    precision = hits / n_predicted if n_predicted > 0 else 0.0
    recall = hits / n_expected if n_expected > 0 else 0.0

    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

    return pd.Series([precision, recall, f1], index=["precision", "recall", "f1"])


def calculate_overlap(row):
    """Return the shared-item count divided by the size of the larger set.

    Reads ``row["pred_set"]`` and ``row["test_set"]``. When both sets are
    empty the result is ``0``.
    """
    predicted, expected = row["pred_set"], row["test_set"]
    shared = len(predicted & expected)
    larger = max(len(predicted), len(expected))
    if larger:
        return shared / larger
    return 0


def calculate_jaccard(row):
    """Return ``intersection_size / union_size`` for a row.

    Reads the precomputed ``row['intersection_size']`` and
    ``row['union_size']`` values. A union size of zero yields ``0``.
    """
    shared, combined = row['intersection_size'], row['union_size']
    return shared / combined if combined != 0 else 0

In [6]:
# Items found on only one side: predicted-but-not-expected and the reverse
new_df["pred_only"] = new_df["pred_set"] - new_df["test_set"]
new_df["test_only"] = new_df["test_set"] - new_df["pred_set"]
# Count of items shared by the predicted and reference sets
new_df["intersection_size"] = new_df.apply(
    lambda row: len(row["pred_set"].intersection(row["test_set"])),
    axis=1,
)
# Count of distinct items across the predicted and reference sets
new_df["union_size"] = new_df.apply(
    lambda row: len(row["pred_set"].union(row["test_set"])),
    axis=1,
)
# Overlap: shared items relative to the larger of the two sets
new_df["overlap"] = new_df.apply(calculate_overlap, axis=1)
# Jaccard: shared items relative to the union
# (relies on the intersection_size / union_size columns set just above)
new_df["jaccard"] = new_df.apply(calculate_jaccard, axis=1)
# Per-row precision, recall and F1, expanded into three columns
new_df[["precision", "recall", "f1"]] = new_df.apply(
    precision_recall_f1_per_row, axis=1
)

In [7]:
# Show up to the first 100 rows with all derived metric columns for inspection
new_df.head(100)

Unnamed: 0,id,ingredients,pred_set,test_set,pred_only,test_only,intersection_size,union_size,overlap,jaccard,precision,recall,f1
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","{pecan, butter, shredded rice biscuit, brown s...","{pecan, margarine, butter, rice biscuit, brown...",{shredded rice biscuit},"{margarine, rice biscuit}",5,8,0.714286,0.625000,0.833333,0.714286,0.769231
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","{chipped beef, cream of mushroom soup, sour cr...","{chipped beef, cream of mushroom soup, sour cr...",{},{},4,4,1.000000,1.000000,1.000000,1.000000,1.000000
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","{salt, pepper, corn, butter, garlic powder, cr...","{black pepper, salt, corn, butter, garlic powd...",{pepper},{black pepper},5,7,0.833333,0.714286,0.833333,0.833333,0.833333
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","{cheese, stuffing, mushroom soup, chicken, chi...","{cheese, stuffing, cream of mushroom soup, who...","{mushroom soup, chicken}","{cream of mushroom soup, whole chicken}",3,7,0.600000,0.428571,0.600000,0.600000,0.600000
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","{peanut butter, butter, chocolate chip, graham...","{graham cracker, peanut butter, butter, chocol...",{graham cracker crumb},{graham cracker},4,6,0.800000,0.666667,0.800000,0.800000,0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,"['3 (1 lb.) cans pork and beans', '1/2 c. bell...","{onion, salt, catsup, black pepper, bell peppe...","{onion, salt, catsup, black pepper, bell peppe...",{},{},9,9,1.000000,1.000000,1.000000,1.000000,1.000000
96,96,"['2 c. diced cooked chicken', '2 Tbsp. shorten...","{onion, cornstarch, ginger, catsup, soy sauce,...","{onion, cornstarch, ginger, catsup, soy sauce,...",{},{},14,14,1.000000,1.000000,1.000000,1.000000,1.000000
97,97,"['8 to 10 juicy oranges, peeled and diced', '1...","{pecan, orange juice, orange, sugar, cherry, c...","{pecan, orange juice, orange, sugar, cherry, c...",{},{},6,6,1.000000,1.000000,1.000000,1.000000,1.000000
98,98,"['1 c. creamy peanut butter', '1 c. sugar', '1...","{sugar, peanut butter, egg}","{sugar, peanut butter, egg}",{},{},3,3,1.000000,1.000000,1.000000,1.000000,1.000000


In [9]:
# Report the mean of each per-row metric column, one line per metric
print(
    f"Overlap:   {new_df['overlap'].mean():.4f} (Intersection divided by the size of the larger set)",
    f"Jaccard:   {new_df['jaccard'].mean():.4f} (Intersection divided by the size of the union)",
    f"Precision: {new_df['precision'].mean():.4f} (Fraction of predicted ingredients that are correct)",
    f"Recall:    {new_df['recall'].mean():.4f} (Fraction of actual ingredients that were predicted)",
    f"F1:        {new_df['f1'].mean():.4f} (Harmonic mean of precision and recall)",
    sep="\n",
)

Overlap:   0.8744 (Intersection divided by the size of the larger set)
Jaccard:   0.8104 (Intersection divided by the size of the union)
Precision: 0.8834 (Fraction of predicted ingredients that are correct)
Recall:    0.8756 (Fraction of actual ingredients that were predicted)
F1:        0.8791 (Harmonic mean of precision and recall)
