In [None]:
#————————————————————

# Name: Data Cleansing and Deduplication of Openfoodfacts (V1)

# Purpose:


# Company: Allgeier Schweiz AG
# Author: Nicolas Rehder (nrehder@allgeier.ch)
# Created for: SDSC 2024
# Date Created: 22.01.2024
# Last Updated: 22.01.2024
# Python Version: 3.10.4

# General Sources:

# Additionals:

# Download Python packages (run the below command in terminal if packages have not yet been installed)
# pip install -r C:\Python\sdsc\requirements.txt

#————————————————————

In [3]:
# Import required libraries
import os
import io
import time
from io import StringIO
import json
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
from openai import AzureOpenAI
import json
from IPython.display import clear_output
from IPython.core.display import HTML
import requests
import random
import recordlinkage
from recordlinkage.index import Block
from recordlinkage.index import SortedNeighbourhood
import recordlinkage


In [4]:
# Load the raw Open Food Facts export (a tab-separated file despite the .csv extension)

path_input = r"C:\Python\data\en.openfoodfacts.org.products.csv"  # Change path if required
df = pd.read_csv(
    path_input,
    sep='\t',              # fields are tab-delimited
    on_bad_lines='skip',   # silently drop rows that cannot be parsed instead of aborting the load
    low_memory=False,      # read without chunked type inference, avoiding mixed-dtype columns
)

In [5]:
def clean_data(data_df):
    """Reduce the raw Open Food Facts table to a cleaned set of food products.

    The input frame is not modified. Processing steps:

    1. Keep the product name, allergens, category tags, country, Nutri-Score
       grade and the per-100g energy / macronutrient columns.
    2. Drop rows without a product name, repeated product names (first
       occurrence wins), and rows without category tags or an energy value.
    3. Keep rows whose category tags start with "en:" (case-insensitive), whose
       country is exactly "United Kingdom" or "United States", and whose name
       contains no character above U+00FF.
    4. Normalise the name: remove everything up to and including the first
       comma, remove "<digits>%" tokens, collapse whitespace runs to a single
       space and trim leading/trailing whitespace.
    5. Split the first three category tags into ``level_1``..``level_3``
       (leading "en:" removed) and drop ``categories_tags``.
    6. Turn empty strings into NaN, drop rows left without a name or without
       ``level_2``, and keep only rows whose ``level_3`` is one of the selected
       food groups and whose cleaned name is pure ASCII.

    Args:
        data_df: DataFrame containing at least the columns listed in step 1.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the retained columns
        (without ``categories_tags``) plus ``level_1``, ``level_2`` and
        ``level_3``. The result may have zero rows (for example when no row
        carries a selected third-level category); the columns are still present.

    Raises:
        KeyError: if one of the required input columns is missing.
    """
    keep_columns = ["product_name", "allergens", "categories_tags", "countries_en", "nutriscore_grade", "energy-kcal_100g", "fat_100g", "saturated-fat_100g", "carbohydrates_100g", "sugars_100g", "proteins_100g"]
    food_groups = ["fruits-and-vegetables-based-foods", "dairy-substitutes", "groceries", "condiments", "legumes-and-their-products", "simple-syrups", "cereals-and-potatoes"]
    level_columns = ["level_1", "level_2", "level_3"]

    # Selecting columns builds a new frame, so the caller's data is never touched.
    data_df = data_df[keep_columns]

    # NOTE(review): names are de-duplicated before the remaining filters run, so a
    # name whose first occurrence is filtered out below disappears entirely.
    # Left unchanged to preserve the existing row selection.
    data_df = data_df.dropna(subset=["product_name"])
    data_df = data_df.drop_duplicates(subset="product_name")
    data_df = data_df.dropna(subset=["categories_tags", "energy-kcal_100g"])
    data_df = data_df.loc[data_df["categories_tags"].str.contains("^en:", regex=True, case=False)]
    data_df = data_df.loc[data_df["countries_en"].isin(["United Kingdom", "United States"])]
    data_df = data_df.loc[~data_df["product_name"].str.match(r'.*[^\x00-\xFF]')]

    # Work on an explicit copy so the column assignment below does not target a
    # slice of the previous frame (avoids SettingWithCopyWarning).
    data_df = data_df.copy()
    names = data_df["product_name"]
    names = names.str.replace(r'(^.*?,)', '', regex=True)
    names = names.str.replace(r'(\d+)%', '', regex=True)
    names = names.str.replace(r'\s+', ' ', regex=True)
    # Trim the space left behind by the removals above; a name that consisted only
    # of a prefix then becomes "" and is dropped with the other empty names.
    data_df["product_name"] = names.str.strip()
    data_df = data_df.reset_index(drop=True)

    # Only the first three tags are used. reindex() guarantees that all three
    # level columns exist even when no row has that many tags (a plain [[0, 1, 2]]
    # selection raises KeyError in that case).
    levels = data_df["categories_tags"].str.split(",", n=3, expand=True)
    levels = levels.reindex(columns=[0, 1, 2])
    levels.columns = level_columns
    for column in level_columns:
        # A column created by reindex() is all-NaN float; cast to object so the
        # .str accessor is available.
        levels[column] = levels[column].astype(object).str.replace(r'^en:', '', regex=True)

    data_df = data_df.join(levels).drop(columns=["categories_tags"])

    data_df = data_df.replace("", float("nan"))
    data_df = data_df.dropna(subset=["product_name", "level_2"])
    data_df = data_df.loc[data_df["level_3"].isin(food_groups)]

    # astype(bool) keeps this a row mask when the frame is empty; an empty
    # object-dtype mask would be interpreted as a column selection.
    ascii_only = data_df["product_name"].map(str.isascii).astype(bool)
    data_df = data_df.loc[ascii_only]

    return data_df.reset_index(drop=True)

In [13]:
# Run the cleaning step on the loaded table; `openfoodfacts` holds the cleaned,
# not yet de-duplicated, products.
openfoodfacts = clean_data(df)

In [10]:
def deduplicate(data_df):
    """Remove near-duplicate products by fuzzy-matching ``product_name``.

    Candidate pairs are generated with a sorted-neighbourhood index on
    ``product_name`` and compared with the Jaro-Winkler string measure using a
    0.80 threshold. For every pair whose comparison value is 1, the row whose
    index label appears in the second level of the pair index is dropped.
    Only ``product_name`` takes part in the matching.

    The original author reports an output of roughly 1/5 of the input size
    (not verified here).

    Args:
        data_df: DataFrame with a ``product_name`` column.
            NOTE(review): rows are removed by index label, so a unique index
            is presumably expected - confirm for callers that do not reset it.

    Returns:
        A new DataFrame without the dropped rows, re-indexed from 0.
    """
    # Build the candidate pairs: only records that are close to each other when
    # sorted by product name are compared.
    pair_indexer = recordlinkage.Index()
    pair_indexer.add(SortedNeighbourhood(left_on='product_name', right_on='product_name'))
    candidate_pairs = pair_indexer.index(data_df)

    # Compare the names of each candidate pair. With a threshold set, the library
    # is documented to report 1 for similarities at or above it and 0 otherwise.
    name_comparer = recordlinkage.Compare()
    name_comparer.string('product_name', 'product_name', label='rl_product_name', threshold=0.80, method='jarowinkler')
    comparison = name_comparer.compute(candidate_pairs, data_df)

    # Keep the matched pairs and collect the labels from the second pair level.
    matched_pairs = comparison[comparison['rl_product_name'] >= 1]
    redundant_labels = matched_pairs.index.get_level_values(1)

    # Drop every row whose label was collected, then renumber the index.
    is_redundant = data_df.index.isin(redundant_labels)
    survivors = data_df[~is_redundant]
    return survivors.reset_index(drop=True)

In [11]:
# De-duplicate the already cleaned frame instead of running clean_data(df) a
# second time; clean_data builds new frames from its input, so the result is
# the same while the expensive cleaning pass runs only once.
openfoodfacts_dedup = deduplicate(openfoodfacts)

In [12]:
# Export the de-duplicated products as a tab-separated, UTF-8 encoded file
# without the DataFrame index column.
path_output = r"C:\Python\data\openfoodfacts.csv"  # Change path if required

openfoodfacts_dedup.to_csv(
    path_output,
    sep='\t',
    encoding='utf-8',
    index=False,
)
