In [1]:
import pandas as pd

In [2]:
def normalize(x):
    x = x.strip()
    # remove only the FIRST { and LAST }
    if x.startswith("{") and x.endswith("}"):
        x = x[1:-1]
    x = x.strip().strip('"').strip("'")
    return {x}

In [3]:
# Read the CSV
pred_df = pd.read_csv("data/output/processed_products.csv")
test_df = pd.read_csv("data/test/TEST_food_branded.csv")
# Rename the test column so we don't clash on merge
test_df_renamed = test_df.rename(columns={"mapped_ingredient": "mapped_ingredient_test"})
# Merge using test_df as the left df to keep its order
merged_df = test_df_renamed[["fdc_id", "mapped_ingredient_test"]].merge(
    pred_df, on="fdc_id", how="inner"
)
merged_df.head()

Unnamed: 0,fdc_id,mapped_ingredient_test,description,mapped_ingredient
0,167512,buttermilk biscuit,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...","""biscuit mix"""
1,167513,cinnamon roll,"Pillsbury, Cinnamon Rolls with Icing, refriger...","""cinnamon roll"""
2,167514,bread crumb,"Kraft Foods, Shake N Bake Original Recipe, Coa...","""breadcrumb"""
3,167515,english muffin,"George Weston Bakeries, Thomas English Muffins","""english muffin"""
4,167516,buttermilk waffle,"Waffles, buttermilk, frozen, ready-to-heat","""buttermilk"""


In [4]:
pred_sets = [normalize(x) for x in merged_df["mapped_ingredient"]]
test_sets = [normalize(x) for x in merged_df["mapped_ingredient_test"]]

new_df = pd.DataFrame({
    "fdc_id": merged_df["fdc_id"],
    "description": merged_df["mapped_ingredient"],
    "pred_set": pred_sets,
    "test_set": test_sets,
})
new_df.head(300)

Unnamed: 0,fdc_id,description,pred_set,test_set
0,167512,"""biscuit mix""",{biscuit mix},{buttermilk biscuit}
1,167513,"""cinnamon roll""",{cinnamon roll},{cinnamon roll}
2,167514,"""breadcrumb""",{breadcrumb},{bread crumb}
3,167515,"""english muffin""",{english muffin},{english muffin}
4,167516,"""buttermilk""",{buttermilk},{buttermilk waffle}
...,...,...,...,...
295,167807,"""cherry juice""",{cherry juice},{cherry juice}
296,167808,"""raspberry""",{raspberry},{raspberry puree}
297,167809,"""raspberry""",{raspberry},{raspberry puree}
298,167810,"""pork""",{pork},{pork}


In [5]:
def precision_recall_f1_per_row(row):
    pred = row["pred_set"]
    true = row["test_set"]

    tp = len(pred & true)
    fp = len(pred - true)
    fn = len(true - pred)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    return pd.Series({"precision": precision, "recall": recall, "f1": f1})


def calculate_overlap(row):
    intersection_size = len(row["pred_set"] & row["test_set"])
    n = max(len(row["pred_set"]), len(row["test_set"]))
    if n == 0: return 0
    return intersection_size / n


def calculate_jaccard(row):
    intersection_size = row['intersection_size']
    union_size = row['union_size']
    if union_size == 0: return 0
    return intersection_size / union_size

In [6]:
# intersection size
new_df["intersection_size"] = new_df.apply(
    lambda row: len(row["pred_set"] & row["test_set"]), axis=1
)
# union size
new_df["union_size"] = new_df.apply(
    lambda row: len(row["pred_set"] | row["test_set"]), axis=1
)
# overlap = intersection / larger_set
new_df["overlap"] = new_df.apply(calculate_overlap, axis=1)
# Jaccard similarity (intersection / union)
new_df["jaccard"] = new_df.apply(calculate_jaccard, axis=1)
# Precision, recall, F1
new_df[["precision", "recall", "f1"]] = new_df.apply(precision_recall_f1_per_row, axis=1)

In [7]:
new_df.head(100)

Unnamed: 0,fdc_id,description,pred_set,test_set,intersection_size,union_size,overlap,jaccard,precision,recall,f1
0,167512,"""biscuit mix""",{biscuit mix},{buttermilk biscuit},0,2,0.0,0.0,0.0,0.0,0.0
1,167513,"""cinnamon roll""",{cinnamon roll},{cinnamon roll},1,1,1.0,1.0,1.0,1.0,1.0
2,167514,"""breadcrumb""",{breadcrumb},{bread crumb},0,2,0.0,0.0,0.0,0.0,0.0
3,167515,"""english muffin""",{english muffin},{english muffin},1,1,1.0,1.0,1.0,1.0,1.0
4,167516,"""buttermilk""",{buttermilk},{buttermilk waffle},0,2,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
95,167607,"""raw meat""",{raw meat},{moose},0,2,0.0,0.0,0.0,0.0,0.0
96,167608,"""mashu root""",{mashu root},{mashu root},1,1,1.0,1.0,1.0,1.0,1.0
97,167609,"""liver""",{liver},{moose liver},0,2,0.0,0.0,0.0,0.0,0.0
98,167610,"""bearded seal meat""",{bearded seal meat},{seal},0,2,0.0,0.0,0.0,0.0,0.0


In [8]:
print(f"Overlap:   {new_df['overlap'].mean():.4f} (Intersection divided by the size of the larger set)")
print(f"Jaccard:   {new_df['jaccard'].mean():.4f} (Intersection divided by the size of the union)")
print(f"Precision: {new_df['precision'].mean():.4f} (Fraction of predicted ingredients that are correct)")
print(f"Recall:    {new_df['recall'].mean():.4f} (Fraction of actual ingredients that were predicted)")
print(f"F1:        {new_df['f1'].mean():.4f} (Harmonic mean of precision and recall)")

Overlap:   0.4633 (Intersection divided by the size of the larger set)
Jaccard:   0.4633 (Intersection divided by the size of the union)
Precision: 0.4633 (Fraction of predicted ingredients that are correct)
Recall:    0.4633 (Fraction of actual ingredients that were predicted)
F1:        0.4633 (Harmonic mean of precision and recall)


In [9]:
len(new_df)

300