<a href="https://colab.research.google.com/github/ChronoBoot/sante-publique-france-off-enhancer/blob/test/notebook/open_food_fact_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preparation for Open Food Facts

## File loading and main class

In [None]:
import pandas as pd

# Load the tab-separated Open Food Facts export into a DataFrame.
# low_memory=False makes pandas parse the file without internal chunking,
# which avoids mixed-type inference on columns.
off_df = pd.read_csv('fr.openfoodfacts.org.products.csv', delimiter = '\t', low_memory=False)

In [None]:
# Report how many rows were loaded into off_df.
print(f"Number of products {len(off_df)}")

In [None]:
class DataFrameField:
    """
    Metadata describing one DataFrame column: its name, dtype, missing-value
    percentage, fill percentage and whether it is categorical or numerical.
    """

    @staticmethod
    def process_na_percent(col, set_na_percent=False):
        """
        Compute the percentage of missing values in a column.

        Parameters:
        - col: Object exposing ``isna()`` whose result supports ``mean()``
          (e.g. a pandas Series).
        - set_na_percent (bool, optional): Unused; kept so existing callers
          that pass it keep working.

        Returns:
        - The mean of ``col.isna()`` multiplied by 100.
        """
        return col.isna().mean() * 100

    def __init__(self, name, dtype=None, na_percent=None, field_type=None):
        """
        Initialize a DataFrameField object with only the name required at creation.
        Other attributes can be set later.

        Parameters:
        - name (str): The name of the DataFrame column.
        - dtype (str, optional): The data type of the column. Defaults to None.
        - na_percent (float, optional): The percentage of missing (NA) values in the column. Defaults to None.
        - field_type (str, optional): Specifies whether the field is 'categorical' or 'numerical'. Defaults to None.
        """
        self.name = name
        self.dtype = dtype
        self.na_percent = na_percent
        # The percentage of non-missing values in the column
        self.fill_percent = 100 - na_percent if na_percent is not None else None
        self.field_type = field_type

    def __repr__(self):
        """
        Provide a string representation of the DataFrameField instance.
        """
        return (f"DataFrameField(name='{self.name}', dtype='{self.dtype}', "
                f"na_percent='{self.na_percent}', fill_percent='{self.fill_percent}',"
                f"field_type='{self.field_type}')")

    def display_info(self):
        """
        Print out the details of the DataFrameField instance.

        Attributes that are still None are skipped.
        """
        print(f"Column Name: {self.name}")
        # Compare against None rather than relying on truthiness: dtype
        # objects (e.g. numpy dtypes) can evaluate as falsy even though they
        # are set, which would silently hide the data type line.
        if self.dtype is not None:
            print(f"Data Type: {self.dtype}")
        if self.na_percent is not None:
            print(f"Percentage of Missing Values: {self.na_percent}%")
        if self.fill_percent is not None:
            print(f"Percent of Non-Missing Values: {self.fill_percent}")
        if self.field_type:
            print(f"Field Type: {self.field_type}")

    # Additional methods to set attributes after object creation
    def set_dtype(self, dtype):
        """Set the data type of the column."""
        self.dtype = dtype

    def set_na_percent(self, na_percent):
        """
        Set the missing-value percentage and keep fill_percent in sync.

        Parameters:
        - na_percent (float or None): Percentage of missing values. None
          resets both na_percent and fill_percent, mirroring __init__.
        """
        self.na_percent = na_percent
        self.fill_percent = 100 - na_percent if na_percent is not None else None

    def set_field_type(self, field_type):
        """Set the field type ('categorical' or 'numerical')."""
        self.field_type = field_type

    def process_and_set_na_percent(self, col):
        """
        Compute the missing-value percentage of ``col`` and store it,
        updating fill_percent accordingly.
        """
        na_percent = DataFrameField.process_na_percent(col)
        self.set_na_percent(na_percent)



## Step 1 : Cleaning and filtering of features and products

### Feature listing

Split the column attributes into two groups: categorical and numerical.

In [None]:
def feature_listing(off_df):
    """
    Build a DataFrameField for every column of ``off_df`` and split them by kind.

    Columns whose dtype equals 'int64' or 'float64' are tagged 'numerical';
    every other column is tagged 'categorical'. Each field carries the
    percentage of missing values measured on ``off_df``.

    Parameters:
    - off_df: DataFrame whose columns are described.

    Returns:
    - (categorical_columns, numerical_columns, df_fields) where df_fields is
      the categorical fields followed by the numerical ones.
    """
    numeric_dtype_names = ('int64', 'float64')
    fields_by_kind = {'categorical': [], 'numerical': []}

    for column_name, column_dtype in off_df.dtypes.items():
        missing_percent = DataFrameField.process_na_percent(off_df[column_name])

        is_numeric = any(column_dtype == dtype_name for dtype_name in numeric_dtype_names)
        kind = 'numerical' if is_numeric else 'categorical'

        fields_by_kind[kind].append(
            DataFrameField(column_name, column_dtype, missing_percent, field_type=kind)
        )

    categorical_fields = fields_by_kind['categorical']
    numerical_fields = fields_by_kind['numerical']

    return categorical_fields, numerical_fields, categorical_fields + numerical_fields

In [None]:
# Describe every column of the raw dataset and keep the three lists as
# notebook-level state for the following cells.
categorical_columns, numerical_columns, df_fields = feature_listing(off_df)

print("########## Categorical columns ##########")

# Each field is printed through DataFrameField.__repr__.
for col in categorical_columns :
  print(col)

print("#######################################")

print("##########  Numerical columns ##########")

for col in numerical_columns :
  print(col)

print("#######################################")

### Find a target

We look for the columns that have more than 50% of their values missing and recommend the ones with the most non-missing values.
We prefer a target that is as close to 50% missing values as possible, because with too few values the prediction would be harder and less reliable.

In [None]:
def find_target(columns):
    """
    Select the fields that qualify as target candidates.

    Parameters:
    - columns: Iterable of objects exposing a ``na_percent`` attribute.

    Returns:
    - A new list, in input order, of the fields whose ``na_percent`` is
      strictly greater than 50.
    """
    return [field for field in columns if field.na_percent > 50]

def get_recommanded_target(columns, nb_choice = 1):
    """
    Return the best target candidates, ordered by increasing missing percentage.

    Candidates are the fields selected by ``find_target``. At most
    ``nb_choice`` of them are returned; asking for more than are available
    returns all of them instead of failing on an exhausted candidate list.

    Parameters:
    - columns: Iterable of fields exposing a ``na_percent`` attribute.
    - nb_choice (int, optional): Maximum number of targets to return.
      Values of 0 or below yield an empty list. Defaults to 1.

    Returns:
    - A list of fields with the lowest ``na_percent`` first.
    """
    candidates = find_target(columns)

    # sorted() is stable, so fields with equal na_percent keep their input
    # order, which is the order repeated min() + remove() would produce.
    ranked = sorted(candidates, key=lambda candidate: candidate.na_percent)

    # Clamp at 0 so a negative nb_choice does not turn into a "drop the last
    # items" slice.
    return ranked[:max(nb_choice, 0)]


In [None]:
# List the categorical fields that qualify as target candidates.
candidates = find_target(categorical_columns)

nb_candidates = len(candidates)

print(f"Number of candidates : {nb_candidates}")

# Show the 10 candidates with the lowest missing-value percentage.
targets = get_recommanded_target(categorical_columns, 10)

for target in targets :
  print(target)

In [None]:
# Name of the column used as the target by the following cells.
target_name = "pnns_groups_1"

The chosen target is `pnns_groups_1`. PNNS most likely stands for Programme National Nutrition Santé, which categorizes foods.

### Remove rows without the target value

In [None]:
def filtered_row(df, target_name):
    """
    Keep only the rows that have a value for the target column.

    Parameters:
    - df: DataFrame to filter.
    - target_name: Label of the column that must not be missing.

    Returns:
    - The rows of ``df`` where ``target_name`` is not NA.
    """
    has_target = df[target_name].notna()
    return df[has_target]

In [None]:
# Drop the products that have no value for the target column.
filtered_off_df = filtered_row(off_df, target_name)
print(f"Number of product with PNNS indicated : {len(filtered_off_df)}")

### Separate target from dataset

In [None]:
def separate_target_from_dataset(target_name, df, df_fields):
    """
    Split the target column away from the dataset and from its field list.

    Parameters:
    - target_name: Label of the target column.
    - df: DataFrame containing the target column.
    - df_fields: Iterable of field descriptors exposing a ``name`` attribute.

    Returns:
    - (target_col, features_df, remaining_fields): the target column, ``df``
      without that column, and a new list of the fields whose name differs
      from ``target_name``. Neither ``df`` nor ``df_fields`` is modified.
    """
    features_df = df.drop(columns=target_name)
    remaining_fields = [
        descriptor for descriptor in df_fields if descriptor.name != target_name
    ]

    return df[target_name], features_df, remaining_fields

In [None]:
# Pull the target column out of the filtered dataset; filtered_off_df and
# df_fields are rebound to the versions without the target.
target_col, filtered_off_df, df_fields = separate_target_from_dataset(target_name, filtered_off_df, df_fields)

### Display the fill rates of the features of the dataset

In [None]:
def display_rate(df_fields, df=None):
    """
    Recompute and print the fill rate of every field.

    Each field's missing-value percentage is recomputed from the matching
    column of ``df`` (updating the field in place) before its fill rate is
    printed.

    Parameters:
    - df_fields: Iterable of fields exposing ``name``, ``fill_percent`` and
      ``process_and_set_na_percent(col)``.
    - df (optional): DataFrame whose columns are measured. When omitted, the
      notebook-level ``filtered_off_df`` is used, which is what this function
      always read before the parameter existed.
    """
    if df is None:
        # Backward-compatible fallback to the notebook global.
        df = filtered_off_df

    for field in df_fields:
        field.process_and_set_na_percent(df[field.name])
        print(f"Fill rate {field.name} : {field.fill_percent}")

In [None]:
# Refresh each field's missing/fill percentages and print the fill rates.
display_rate(df_fields)

### Feature selection

We select the features with more than 50% of non-missing values, which could be used for predicting our target.

In [None]:
def features_selection(df_fields):
    """
    Select the fields that are filled enough to be used as features.

    Parameters:
    - df_fields: Iterable of fields exposing a ``fill_percent`` attribute.

    Returns:
    - A new list, in input order, of the fields whose ``fill_percent`` is
      strictly greater than 50.
    """
    return [candidate for candidate in df_fields if candidate.fill_percent > 50]

In [None]:
# Keep the fields whose fill rate is above 50% and print them.
features = features_selection(df_fields)

for feature in features :
  print(feature)

The features that have been selected are as follows:

- ingredients_text
- additives
- additives_n
- ingredients_from_palm_oil_n
- ingredients_that_may_be_from_palm_oil_n
- energy_100g
- fat_100g
- saturated-fat_100g
- carbohydrates_100g
- sugars_100g
- fiber_100g
- proteins_100g
- salt_100g
- sodium_100g

They are focused on the ingredients or nutrition facts.

### Delete duplicates

We consider two products with the same name to be duplicates.

In [168]:
def delete_duplicates(df, identifier_name):
    """
    Remove the rows that share the same normalized identifier.

    The identifier is normalized by lower-casing it, stripping surrounding
    whitespace and removing spaces. The first row of each normalized value is
    kept, and the returned frame holds the normalized identifier.

    The work is done on a copy, so the DataFrame passed in is left untouched.
    This also avoids assigning into a frame that is itself a filtered slice
    of another one.

    Parameters:
    - df: DataFrame to deduplicate.
    - identifier_name: Label of the string column identifying a product.

    Returns:
    - A new DataFrame without duplicated identifiers.
    """
    normalized_df = df.copy()
    normalized_df[identifier_name] = (
        normalized_df[identifier_name].str.lower().str.strip().str.replace(" ", "")
    )
    # NOTE(review): rows with a missing identifier compare equal to each
    # other here, so only the first of them is kept - confirm this is wanted.
    return normalized_df.drop_duplicates(subset=identifier_name)

In [165]:
# Column whose value identifies a product when looking for duplicates.
identifier_name = 'product_name'

# Deduplicate the filtered dataset on that column.
filtered_off_no_duplicates = delete_duplicates(filtered_off_df, identifier_name)

### Main function to clean and filter features and products

In [171]:
def main(df):
    """
    Run the step 1 pipeline on ``df`` and print the resulting row counts.

    Steps, in order: describe the columns, list the target candidates, keep
    the rows that have the target, separate the target column, select the
    features and remove duplicated products.

    Parameters:
    - df: Raw DataFrame to clean and filter.

    Returns:
    - None. The row counts are printed.
    """
    target_name = "pnns_groups_1"
    identifier_name = 'product_name'

    categorical_fields, numerical_fields, all_fields = feature_listing(df)
    target_candidates = find_target(categorical_fields)

    rows_with_target = filtered_row(df, target_name)
    print(f"Number of lines with {target_name} : {len(rows_with_target)}")

    target_col, rows_with_target, all_fields = separate_target_from_dataset(
        target_name, rows_with_target, all_fields
    )

    # The display_rate(df_fields) call is left disabled here.
    # NOTE(review): without it the fill rates used below are the ones
    # feature_listing measured on the unfiltered df - confirm this is intended.
    selected_features = features_selection(all_fields)

    deduplicated_rows = delete_duplicates(rows_with_target, identifier_name)
    print(f"Number of lines with {target_name} and without duplicates: {len(deduplicated_rows)}")


In [172]:
# Run the whole step 1 pipeline on the raw dataset.
main(off_df)

## Step 2 : Identify and process incoherent values

## Step 3 : Identify and process missing values

## Step 4 : Perform univariate and bivariate analyses

## Step 5 : Perform a multivariate analysis