In [1]:
import pandas as pd

In [2]:
def normalize(x):
    """Canonicalize a raw ingredient label so two labels can be compared exactly.

    Processing order:
      1. ``None`` or a float NaN (what pandas yields for an empty CSV cell)
         becomes the empty string instead of raising ``AttributeError``.
      2. Any other non-string value is converted with ``str``.
      3. Surrounding whitespace is stripped.
      4. If the text both starts with ``{`` and ends with ``}``, exactly one
         leading and one trailing brace are removed.
      5. Whitespace, then double quotes, then single quotes are stripped
         from both ends.

    Args:
        x: Raw label, normally a string such as ``'"biscuit mix"'``.

    Returns:
        str: The cleaned label; ``""`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    if not isinstance(x, str):
        x = str(x)

    x = x.strip()
    # Remove only the FIRST { and LAST }, and only when both are present.
    if x.startswith("{") and x.endswith("}"):
        x = x[1:-1]
    return x.strip().strip('"').strip("'")

In [3]:
# Load the predictions and the labelled test set from CSV.
pred_df = pd.read_csv("data/output/processed_products.csv")
test_df = pd.read_csv("data/test/TEST_food_branded.csv")

# Both files carry a "mapped_ingredient" column; rename the test one so the
# two label columns stay distinct after the merge.
test_df_renamed = test_df.rename(columns={"mapped_ingredient": "mapped_ingredient_test"})

# Inner join on fdc_id with the test frame as the left operand.
# Note: an inner join keeps only fdc_ids present in BOTH files.
merged_df = pd.merge(
    test_df_renamed[["fdc_id", "mapped_ingredient_test"]],
    pred_df,
    on="fdc_id",
    how="inner",
)
merged_df.head()

Unnamed: 0,fdc_id,mapped_ingredient_test,description,mapped_ingredient
0,167512,buttermilk biscuit,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...","""biscuit mix"""
1,167513,cinnamon roll,"Pillsbury, Cinnamon Rolls with Icing, refriger...","""cinnamon roll"""
2,167514,bread crumb,"Kraft Foods, Shake N Bake Original Recipe, Coa...","""breadcrumb"""
3,167515,english muffin,"George Weston Bakeries, Thomas English Muffins","""english muffin"""
4,167516,buttermilk waffle,"Waffles, buttermilk, frozen, ready-to-heat","""buttermilk"""


In [4]:
# Clean both label columns with normalize(); each result is a plain list of
# strings (despite the *_sets names, these are lists, not sets).
pred_sets = list(map(normalize, merged_df["mapped_ingredient"]))
test_sets = list(map(normalize, merged_df["mapped_ingredient_test"]))

# Side-by-side comparison frame: identifiers plus cleaned prediction / truth.
new_df = pd.DataFrame(
    dict(
        fdc_id=merged_df["fdc_id"],
        description=merged_df["description"],
        pred=pred_sets,
        test=test_sets,
    )
)
new_df.head(300)

Unnamed: 0,fdc_id,description,pred,test
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",biscuit mix,buttermilk biscuit
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon roll,cinnamon roll
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",breadcrumb,bread crumb
3,167515,"George Weston Bakeries, Thomas English Muffins",english muffin,english muffin
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",buttermilk,buttermilk waffle
...,...,...,...,...
295,167807,"Cherry juice, tart",cherry juice,cherry juice
296,167808,"Raspberries, puree, seedless",raspberry,raspberry puree
297,167809,"Raspberries, puree, with seeds",raspberry,raspberry puree
298,167810,"Pork, fresh, composite of trimmed leg, loin, s...",pork,pork


In [5]:
def accuracy(row):
    """Score a single row by exact match.

    Args:
        row: Mapping-like object with ``"pred"`` and ``"test"`` entries.

    Returns:
        int: 1 when ``row["pred"] == row["test"]``, otherwise 0.
    """
    is_match = row["pred"] == row["test"]
    return 1 if is_match else 0

In [6]:
# Exact-match score per row: 1 where pred equals test, else 0.
# Done as a vectorized column comparison rather than a row-wise apply, which
# also keeps the assignment valid when new_df has no rows.
new_df["accuracy"] = (new_df["pred"] == new_df["test"]).astype("int64")

In [11]:
# Show up to the first 300 rows of the comparison frame.
new_df.head(n=300)

Unnamed: 0,fdc_id,description,pred,test,accuracy
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",biscuit mix,buttermilk biscuit,0
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon roll,cinnamon roll,1
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",breadcrumb,bread crumb,0
3,167515,"George Weston Bakeries, Thomas English Muffins",english muffin,english muffin,1
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",buttermilk,buttermilk waffle,0
...,...,...,...,...,...
295,167807,"Cherry juice, tart",cherry juice,cherry juice,1
296,167808,"Raspberries, puree, seedless",raspberry,raspberry puree,0
297,167809,"Raspberries, puree, with seeds",raspberry,raspberry puree,0
298,167810,"Pork, fresh, composite of trimmed leg, loin, s...",pork,pork,1


In [8]:
# The mean of the accuracy column is the share of rows scored as correct.
mean_accuracy = new_df["accuracy"].mean()
print(f"Accuracy:  {mean_accuracy:.4f} (Total Correct Rows / Total Rows)")

Accuracy:  0.4633 (Total Correct Rows / Total Rows)


In [9]:
# Number of rows that were evaluated.
len(new_df.index)

300