In [586]:
import pandas as pd
import csv
import numpy as np

In [587]:
# Notebook-wide pandas settings:
#   * render every column of a DataFrame instead of eliding the middle ones;
#   * disable the chained-assignment check (no SettingWithCopyWarning).
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [588]:
# Strings that read_csv should additionally parse as missing values (passed
# through `na_values` when the raw file is loaded).
# Two clean-ups compared with the previous list: 'NaN' was listed twice, and
# the last marker was written as 'null ' with a trailing space, which only
# matches fields that literally contain that space.
skip_na_values = [
    # dataset placeholders
    'unknown', 'NaN', '-', '...', 'not-applicable',
    # spellings pandas itself uses for missing data
    '#N/A', '#N/A N/A', '#NA',
    '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN',
    '<NA>', 'N/A', 'NA', 'NULL', 'None', 'n/a', 'nan', 'null',
]

In [589]:
# Columns to load from the raw file (handed to read_csv through `usecols`).
selected_cols = [
    # product descriptors, scores and grades
    'product_name', 'additives_n',
    'nutriscore_score', 'nutriscore_grade',
    'nova_group', 'pnns_groups_1', 'pnns_groups_2',
    'ecoscore_score', 'ecoscore_grade',
    'nutrient_levels_tags',
    # per-100g measurements
    'energy-kcal_100g',
    'fat_100g', 'saturated-fat_100g', 'trans-fat_100g', 'cholesterol_100g',
    'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g',
    'salt_100g', 'sodium_100g', 'alcohol_100g',
    'vitamin-a_100g', 'vitamin-d_100g', 'vitamin-c_100g',
    'potassium_100g', 'calcium_100g', 'iron_100g',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g',
    'nutrition-score-fr_100g',
]

In [590]:
# Stream the tab-separated raw file in chunks of 300,000 rows (with `chunksize`
# set, read_csv returns an iterable TextFileReader instead of one DataFrame).
# Each chunk is filtered as soon as it is read: a row is kept only when every
# column named in `filter_subset` has a value.
filter_subset=['product_name', 'nutrition-score-fr_100g', 'nutriscore_score', 'energy-kcal_100g', 'nova_group']
filtered_chunks = []
with pd.read_csv(
    './data.csv',
    sep='\t',
    quotechar='"',
    engine='c',
    on_bad_lines='skip',  # malformed lines are dropped instead of raising
    low_memory=False,
    usecols=selected_cols,
    na_values=skip_na_values,
    iterator=True,
    chunksize=300000,
) as reader:  # the context manager closes the underlying file when done
    for chunk in reader:
        filtered_chunks.append(chunk.dropna(subset=filter_subset, how='any'))

# Concatenate once at the end: calling pd.concat inside the loop copies all
# previously accumulated rows again on every iteration.
# If the reader produced no chunks, fall back to an empty frame as before.
df = pd.concat(filtered_chunks, ignore_index=True) if filtered_chunks else pd.DataFrame()

In [591]:
# Drop rows that repeat an earlier row exactly (all columns compared, first
# occurrence kept). `df` itself is not modified; the result is a new frame.
df_stage = df.drop_duplicates(subset=None, keep='first')

In [592]:
# Columns summarised in the central-tendency table below.
lst = [
    'fat_100g',
    'saturated-fat_100g',
    'trans-fat_100g',
    'cholesterol_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
    'salt_100g',
    'sodium_100g',
    'alcohol_100g',
    'vitamin-a_100g',
    'vitamin-d_100g',
    'vitamin-c_100g',
    'potassium_100g',
    'calcium_100g',
    'iron_100g',
    'ecoscore_score',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g',
    'nova_group',
]

# Mean, median and mode of each listed column (one table row per column).
stats_input = df_stage[lst]
mean_df = stats_input.mean()
median_df = stats_input.median()
# mode() may return several rows when values tie; only the first row is used.
mode_df = stats_input.mode().iloc[0]

central_tendencies = pd.DataFrame(
    {'Mean': mean_df, 'Median': median_df, 'Mode': mode_df}
)

central_tendencies

Unnamed: 0,Mean,Median,Mode
fat_100g,6993189.0,7.06,0.0
saturated-fat_100g,5.533334,2.0,0.0
trans-fat_100g,0.04598617,0.0,0.0
cholesterol_100g,0.04374451,0.0,0.0
carbohydrates_100g,3926935000000.0,18.0,0.0
sugars_100g,9403760000000.0,4.5,0.0
fiber_100g,1071414000000.0,1.6,0.0
proteins_100g,8.802015e+19,5.3,0.0
salt_100g,1.092805e+39,0.5326,0.0
sodium_100g,4.371219e+38,0.213,0.0


In [593]:
# Columns scanned for out-of-range values by the outlier filter.
cols_search_outliers = [
    'fat_100g',
    'saturated-fat_100g',
    'trans-fat_100g',
    'cholesterol_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
    'salt_100g',
    'sodium_100g',
    'alcohol_100g',
    'vitamin-a_100g',
    'vitamin-d_100g',
    'vitamin-c_100g',
    'potassium_100g',
    'calcium_100g',
    'iron_100g',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g',
]

In [594]:
# Flag every row in which at least one checked column is below 0 or above 100
# (presumably because these are amounts per 100 g — confirm with the data
# dictionary). Comparisons against NaN are False, so a missing value never
# flags a row by itself.
checked_values = df_stage[cols_search_outliers]
mask = (checked_values.lt(0) | checked_values.gt(100)).any(axis=1)

# Keep the unflagged rows and renumber them from 0 in a single expression.
# This yields a new frame instead of resetting the index of the filtered
# selection in place, so re-running the cell is safe and later column
# assignments work on a standalone DataFrame.
df_stage = df_stage.loc[~mask].reset_index(drop=True)

In [595]:
# Columns whose missing entries are later filled with the most frequent value.
cols_to_mode = [
    'nutriscore_grade',
    'ecoscore_grade',
    'additives_n',
    'nutrient_levels_tags',
]

In [596]:
# Columns whose missing entries are later replaced by the label 'unknown'.
cols_to_unknown = [
    'pnns_groups_1',
    'pnns_groups_2',
]

In [597]:
# Columns whose missing entries are later filled with the column median.
fill_median_list = [
    'fat_100g',
    'saturated-fat_100g',
    'trans-fat_100g',
    'cholesterol_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fiber_100g',
    'proteins_100g',
    'salt_100g',
    'sodium_100g',
    'alcohol_100g',
    'vitamin-a_100g',
    'vitamin-d_100g',
    'vitamin-c_100g',
    'potassium_100g',
    'calcium_100g',
    'iron_100g',
    'ecoscore_score',
    'nutriscore_score',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g',
]

In [598]:
# Convert the 'nutriscore_score' column to a numeric dtype; values that cannot
# be parsed as numbers become NaN (errors='coerce').
df_stage['nutriscore_score'] = pd.to_numeric(
    df_stage['nutriscore_score'],
    errors='coerce',
)

In [599]:
# Fill the remaining gaps, using one strategy per column group.

# Median group: replace missing values with the column median. A column with
# no non-missing values has a NaN median, so it is effectively left as is.
for col in fill_median_list:
    df_stage[col] = df_stage[col].fillna(df_stage[col].median())

# Mode group: replace missing values with the most frequent value. mode()
# returns an empty Series for a column with no non-missing values, and taking
# its first element would raise; such a column is skipped so this branch
# behaves like the median branch in that situation.
for col in cols_to_mode:
    col_modes = df_stage[col].mode()
    if col_modes.empty:
        continue
    df_stage[col] = df_stage[col].fillna(col_modes.iloc[0])

# Unknown group: mark missing entries with an explicit 'unknown' label.
df_stage[cols_to_unknown] = df_stage[cols_to_unknown].fillna(value='unknown')

In [604]:
# Write the cleaned table to disk as tab-separated text, omitting the index.
df_stage.to_csv(
    './food_data.csv',
    sep='\t',
    index=False,
)