#### `test_products`
Evaluates the model’s predictions against the test dataset using exact-match accuracy and a thresholded Levenshtein similarity score.

In [3]:
import pandas as pd
from rapidfuzz import fuzz

In [4]:
def normalize(x):
    """Return a cleaned label string suitable for comparison.

    Cleaning steps, in order: trim surrounding whitespace, drop one pair of
    enclosing curly braces if both are present, then trim whitespace and any
    surrounding double or single quotes.

    Args:
        x: The raw label. Usually a string; ``None`` and float NaN are treated
            as missing, and any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # pandas represents empty CSV cells as float NaN, which has no .strip();
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    x = str(x).strip()
    # remove only the FIRST { and LAST }
    if x.startswith("{") and x.endswith("}"):
        x = x[1:-1]
    return x.strip().strip('"').strip("'")


def accuracy(row):
    """Score one row as an exact match.

    Args:
        row: A mapping-like row exposing the keys ``"pred"`` and ``"test"``.

    Returns:
        ``1`` when ``row["pred"]`` equals ``row["test"]``, otherwise ``0``.
    """
    is_exact_match = row["pred"] == row["test"]
    return int(bool(is_exact_match))


def levenshtein_similarity(row, threshold=0.5):
    """Score the string similarity of one row, zeroing out weak matches.

    Both values are converted with ``str()`` and compared with
    ``fuzz.ratio``; the result is rescaled from 0-100 to 0-1.

    NOTE(review): rapidfuzz documents ``fuzz.ratio`` as a normalized Indel
    similarity (a substitution costs a delete plus an insert) rather than
    classic Levenshtein - confirm the metric name matches the intent.

    Args:
        row: A mapping-like row exposing the keys ``"pred"`` and ``"test"``.
        threshold: Minimum 0-1 similarity for the score to be kept.

    Returns:
        The 0-1 similarity when it is at least ``threshold``; otherwise
        ``0.0``. Rows below the threshold are not dropped, they score zero.
    """
    predicted, expected = str(row["pred"]), str(row["test"])

    # fuzz.ratio reports a percentage; divide to get a 0-1 similarity.
    similarity = fuzz.ratio(predicted, expected) / 100

    if similarity >= threshold:
        return similarity
    return 0.0

In [74]:
# Load the model output; the `response` column is what gets scored as the prediction.
pred_df = pd.read_csv(filepath_or_buffer="../data/output/mapped_responses.csv")
# Preview up to the first 100 rows.
pred_df.head(n=100)

Unnamed: 0,fdc_id,description,response
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",biscuits
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",breadcrumb
3,167515,"George Weston Bakeries, Thomas English Muffins",muffin
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",waffle
...,...,...,...
95,167607,"Moose, meat, raw (Alaska Native)",meat
96,167608,"Mashu roots, raw (Alaska Native)",root
97,167609,"Moose, liver, braised (Alaska Native)",liver
98,167610,"Seal, bearded (Oogruk), meat, raw (Alaska Native)",meat


In [8]:
# Load the labelled test set; `mapped_ingredient` is the expected value per fdc_id.
test_df = pd.read_csv(filepath_or_buffer="../data/test/TEST_branded_food_product.csv")
# Preview up to the first 100 rows.
test_df.head(n=100)

Unnamed: 0,fdc_id,description,mapped_ingredient
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",buttermilk biscuit
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon roll
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",bread crumb
3,167515,"George Weston Bakeries, Thomas English Muffins",english muffin
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",buttermilk waffle
...,...,...,...
95,167607,"Moose, meat, raw (Alaska Native)",moose
96,167608,"Mashu roots, raw (Alaska Native)",mashu root
97,167609,"Moose, liver, braised (Alaska Native)",moose liver
98,167610,"Seal, bearded (Oogruk), meat, raw (Alaska Native)",seal


In [76]:
# Inner-join the expected labels (left) with the predictions (right) on fdc_id.
# test_df is the left frame so that its row order is the one kept; products
# missing from either frame are dropped by the inner join.
merged_df = pd.merge(
    test_df[["fdc_id", "mapped_ingredient"]],
    pred_df,
    how="inner",
    on="fdc_id",
)
merged_df.head()

Unnamed: 0,fdc_id,mapped_ingredient,description,response
0,167512,buttermilk biscuit,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",biscuits
1,167513,cinnamon roll,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon
2,167514,bread crumb,"Kraft Foods, Shake N Bake Original Recipe, Coa...",breadcrumb
3,167515,english muffin,"George Weston Bakeries, Thomas English Muffins",muffin
4,167516,buttermilk waffle,"Waffles, buttermilk, frozen, ready-to-heat",waffle


In [77]:
# Clean predictions and expected labels with the same normalizer so they are comparable.
pred_sets = list(map(normalize, merged_df["response"]))
test_sets = list(map(normalize, merged_df["mapped_ingredient"]))

# Evaluation frame: product identifiers plus the cleaned prediction / expected label.
new_df = pd.DataFrame(
    {
        "fdc_id": merged_df["fdc_id"],
        "description": merged_df["description"],
        "pred": pred_sets,
        "test": test_sets,
    }
)
new_df.head(300)

Unnamed: 0,fdc_id,description,pred,test
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",biscuits,buttermilk biscuit
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon,cinnamon roll
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",breadcrumb,bread crumb
3,167515,"George Weston Bakeries, Thomas English Muffins",muffin,english muffin
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",waffle,buttermilk waffle
...,...,...,...,...
295,167807,"Cherry juice, tart",juice,cherry juice
296,167808,"Raspberries, puree, seedless",raspberry,raspberry puree
297,167809,"Raspberries, puree, with seeds",raspberry puree,raspberry puree
298,167810,"Pork, fresh, composite of trimmed leg, loin, s...",pork,pork


In [78]:
# Score every row: the exact-match flag first, then the thresholded similarity.
new_df["accuracy"] = new_df.apply(func=accuracy, axis="columns")
new_df["levenshtein"] = new_df.apply(func=levenshtein_similarity, axis="columns")

In [79]:
# Exact-match accuracy: the mean of the 0/1 `accuracy` column.
print(f"Accuracy:                {new_df['accuracy'].mean():.4f} (Total Correct Rows / Total Rows)\n")
# The `levenshtein` column stores 0.0 for pairs that scored below the threshold,
# and those rows still count in the mean, so the text says "counted as 0"
# rather than "removed".
print(
    f"Levenshtein Similarity:  {new_df['levenshtein'].mean():.4f} "
    f"(Avg similarity with pairs scoring below 50% similarity counted as 0; "
    f"based on the minimal number of insertions, deletions, and substitutions "
    f"needed to transform one string into the other)"
)

Accuracy:                0.3800 (Total Correct Rows / Total Rows)

Levenshtein Similarity:  0.6282 (Avg similarity after removing pairs with <50% character overlap; based on the minimal number of insertions, deletions, and substitutions needed to transform one string into the other)
