### Identifier les valeurs aberrantes dans `products.csv`

Utiliser `pandas` pour charger et analyser les données du fichier CSV `products.csv`. 

Les anomalies suivantes sont recherchées :
- Doublons
- Valeurs négatives
- Valeurs supérieures à 100
- `saturated-fat_100g > fat_100g`
- `salt` et `sodium` non proportionnels
- Somme des colonnes nutritionnelles > 100 (avec tolérance d'arrondi)
- Somme des colonnes nutritionnelles != 100 (avec tolérance d'arrondi)

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

data_dir = Path("../../data")
file_path = data_dir / "products.csv"

df = pd.read_csv(file_path)

# Afficher les premières lignes pour vérifier le chargement des données
df.head()

  df = pd.read_csv(file_path)


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
0,0,3087,,,,,,,,100.0
1,1,4530,28.57,28.57,14.29,3.6,3.57,0.0,0.0,21.4
2,2,4559,17.86,0.0,17.86,7.1,17.86,0.635,0.25,38.435
3,3,16087,57.14,5.36,3.57,7.1,17.86,1.22428,0.482,7.26372
4,4,16094,1.43,,,5.7,8.57,,,84.3


In [2]:
# Identifier les doublons
duplicates = df[df.duplicated(keep=False)]
print(f"Nombre de doublons globaux : {len(duplicates)}")
print(duplicates)

duplicats_par_code = df[df.duplicated(subset=['code'], keep=False)]
print(f"Nombre de doublons de codes produits : {len(duplicats_par_code)}")
print(duplicats_par_code)

Nombre de doublons globaux : 0
Empty DataFrame
Columns: [Unnamed: 0, code, fat_100g, saturated-fat_100g, sugars_100g, fiber_100g, proteins_100g, salt_100g, sodium_100g, autre]
Index: []
Nombre de doublons de codes produits : 235
        Unnamed: 0         code  fat_100g  saturated-fat_100g  sugars_100g  \
6                6        16117       NaN                 NaN          NaN   
46              46        24600       NaN                 NaN          NaN   
62              62        31233      4.55                 NaN        23.64   
155            155        58001     46.43                5.36        10.71   
422            422   9800800056     26.92                9.62        44.23   
...            ...          ...       ...                 ...          ...   
320476      320476  94723500039      0.00                0.00        16.67   
320477      320477  94723500046      0.00                0.00        16.67   
320478      320478  94723500053      0.00                0.00        

In [3]:
# Identifier les valeurs négatives
negative_values = df[(df.select_dtypes(include=[np.number]) < 0).any(axis=1)]
print(f"Nombre de lignes avec des valeurs négatives : {len(negative_values)}")
negative_values

Nombre de lignes avec des valeurs négatives : 11


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
8582,8582,11213420608,0.0,0.0,-1.2,1.2,2.41,0.38354,0.151,97.05546
18209,18209,21130493432,0.8,0.0,-0.8,0.8,0.8,0.87376,0.344,97.18224
23784,23784,28400231053,33.33,13.33,0.0,-6.7,,6.43382,2.533,51.07318
33781,33781,36800416727,46.43,8.93,3.57,3.6,-3.57,0.99822,0.393,39.64878
115310,115310,4029816,0.0,,,,-500.0,25.4,10.0,564.6
117739,117739,608866999263,3.57,0.0,-3.57,3.6,7.14,0.9525,0.375,87.9325
146284,146284,789280259062,13.33,3.33,-6.67,6.7,,2.032,0.8,80.478
150858,150858,813922021028,6.25,1.25,-6.25,1.2,1.25,1.1938,0.47,94.6362
164030,164030,856336001538,21.43,3.57,-17.86,17.9,17.86,1.93294,0.761,54.40606
169119,169119,875208001230,0.0,,0.0,,-800.0,7.62,3.0,889.38


In [4]:
# Identifier les valeurs supérieures à 100
columns_to_check = ['fat_100g', 'saturated-fat_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g']
high_values = df[(df[columns_to_check] > 100).any(axis=1)]
print(f"Nombre de lignes avec des valeurs > 100 : {len(high_values)}")
high_values

Nombre de lignes avec des valeurs > 100 : 176


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
501,501,2124810206,0.00,,,,0.00,105.83418,41.667000,0.0
1435,1435,10300335290,45.71,8.57,5.71,5.7,17.14,870.85678,342.857000,0.0
3472,3472,11110825261,0.00,0.00,,,0.00,101.60000,40.000000,0.0
7037,7037,11161036074,0.00,,,,0.00,101.60000,40.000000,0.0
7983,7983,11213053264,0.77,0.00,2.31,3.8,4.62,781.53768,307.692000,0.0
...,...,...,...,...,...,...,...,...,...,...
303101,303101,8005305900255,101.00,15.00,0.00,,0.00,0.00000,0.000000,0.0
305054,305054,8032942610032,105.00,0.30,2.50,,12.00,0.01000,0.003937,0.0
305114,305114,8033224177274,0.00,,,,0.00,106.68000,42.000000,0.0
313506,313506,8710573641501,,550.00,3520.00,5380.0,430.00,0.01016,0.004000,0.0


In [6]:
# Vérifier si saturated-fat_100g > fat_100g
saturated_greater_fat = df[df['saturated-fat_100g'] > df['fat_100g']]
print(f"Nombre de lignes où 'saturated-fat_100g' > 'fat_100g' : {len(saturated_greater_fat)}")
saturated_greater_fat

Nombre de lignes où 'saturated-fat_100g' > 'fat_100g' : 354


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
2322,2322,11110372680,0.00,1.25,65.00,5.0,0.00,0.00000,0.000000,28.750000
3450,3450,11110823564,1.31,1.62,,,,0.06096,0.024000,96.985040
5839,5839,11150253642,9.21,21.05,47.37,2.6,2.63,0.43434,0.171000,16.534660
5840,5840,11150253659,9.21,21.05,47.37,2.6,2.63,0.43434,0.171000,16.534660
6320,6320,11150940290,1.79,5.36,0.00,0.0,3.57,1.04394,0.411000,87.825060
...,...,...,...,...,...,...,...,...,...,...
316996,316996,8851613101392,18.89,19.00,20.00,0.0,16.00,0.05000,0.019685,26.040315
317148,317148,8853662023236,0.00,2.00,8.50,,,0.75438,0.297000,88.448620
317432,317432,88936000463,0.88,4.42,1.77,0.0,15.93,0.89916,0.354000,75.746840
317655,317655,8935054704463,0.00,20.00,0.00,20.0,0.00,0.00000,0.000000,60.000000


In [7]:
# Vérifier si salt et sodium sont non proportionnels
salt_to_sodium_ratio = 2.54  # Rapport approximatif sel -> sodium
salt_sodium_mismatch = df[np.abs(df['salt_100g'] - df['sodium_100g'] * salt_to_sodium_ratio) > 0.1]
print(f"Nombre de lignes avec incohérence salt/sodium : {len(salt_sodium_mismatch)}")
salt_sodium_mismatch

Nombre de lignes avec incohérence salt/sodium : 8


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
21632,21632,24600017008,0.0,,,,0.0,102.0,40.0,0.0
32371,32371,36632038258,1.77,0.709,7.8,0.709,7.8,153.0,60.3,0.0
68008,68008,67275001088,0.0,,53.3,,0.667,84.7,33.3,0.0
109154,109154,96619911936,0.0,,,,0.0,107.0,42.0,0.0
163139,163139,855461003240,0.0,,33.3,0.0,0.0,84.7,33.3,0.0
224985,224985,3328481290718,30.0,14.6,49.2,3.85,6.92,41.0,16.2,0.0
232239,232239,3423720002432,23.0,3.33,26.7,0.741,6.3,211.0,83.0,0.0
313083,313083,8700001000923,59.4,50.0,6.25,15.6,6.25,79.4,31.2,0.0


In [8]:
# Vérifier si la somme des colonnes nutritionnelles dépasse 100 (tolérance : 1%)
nutrition_columns = ['fat_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g']
df['nutrition_sum'] = df[nutrition_columns].sum(axis=1, skipna=True)

over_100 = df[df['nutrition_sum'] > 101]
under_100 = df[df['nutrition_sum'] < 99]

print(f"Nombre de lignes avec une somme > 100 : {len(over_100)}")
print(f"Nombre de lignes avec une somme < 100 : {len(under_100)}")

# Lignes problématiques
over_100, under_100

Nombre de lignes avec une somme > 100 : 846
Nombre de lignes avec une somme < 100 : 316849


(        Unnamed: 0           code  fat_100g  saturated-fat_100g  sugars_100g  \
 87              87          34449     57.14                7.14         3.57   
 94              94          34784     56.67                5.00         6.67   
 501            501     2124810206      0.00                 NaN          NaN   
 878            878     8725247045     19.05               10.71        64.29   
 1035          1035     9349100105      0.00                0.00        60.00   
 ...            ...            ...       ...                 ...          ...   
 318501      318501  9104201471001     50.00               25.00        36.00   
 319821      319821  9313010000771      0.00                0.00       100.00   
 319825      319825   931302500027     32.80               12.40         1.00   
 319869      319869  9314488103506      0.00                0.00        82.10   
 320451      320451  9421025231179       NaN               10.40        57.40   
 
         fiber_100g  prote

In [9]:
# Résumé des anomalies
print(f"Résumé :")
print(f"- Doublons : {len(duplicates)}")
print(f"- Valeurs négatives : {len(negative_values)}")
print(f"- Valeurs > 100 : {len(high_values)}")
print(f"- 'saturated-fat_100g' > 'fat_100g' : {len(saturated_greater_fat)}")
print(f"- Incohérences salt/sodium : {len(salt_sodium_mismatch)}")
print(f"- Somme > 100 : {len(over_100)}")
print(f"- Somme < 100 : {len(under_100)}")

Résumé :
- Doublons : 0
- Valeurs négatives : 11
- Valeurs > 100 : 176
- 'saturated-fat_100g' > 'fat_100g' : 354
- Incohérences salt/sodium : 8
- Somme > 100 : 846
- Somme < 100 : 316849
