### Identifier les valeurs aberrantes dans `products.csv`

Utiliser `pandas` pour charger et analyser les données du fichier CSV `products.csv`. Repérer les valeurs aberrantes (ordre de grandeur : quelques centaines).

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
DATA_DIR = Path("../../data")
product_file_path = DATA_DIR / "products.csv"

In [3]:
product_df = pd.read_csv(product_file_path, low_memory=False)

In [4]:
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320772 entries, 0 to 320771
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Unnamed: 0          320772 non-null  int64  
 1   code                320749 non-null  object 
 2   fat_100g            243891 non-null  float64
 3   saturated-fat_100g  229554 non-null  float64
 4   sugars_100g         244971 non-null  float64
 5   fiber_100g          200886 non-null  float64
 6   proteins_100g       259922 non-null  float64
 7   salt_100g           255510 non-null  float64
 8   sodium_100g         255463 non-null  float64
 9   autre               320772 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 24.5+ MB


In [5]:
product_df.head(10)

Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
0,0,3087,,,,,,,,100.0
1,1,4530,28.57,28.57,14.29,3.6,3.57,0.0,0.0,21.4
2,2,4559,17.86,0.0,17.86,7.1,17.86,0.635,0.25,38.435
3,3,16087,57.14,5.36,3.57,7.1,17.86,1.22428,0.482,7.26372
4,4,16094,1.43,,,5.7,8.57,,,84.3
5,5,16100,18.27,1.92,11.54,7.7,13.46,,,47.11
6,6,16117,,,,,8.89,,,91.11
7,7,16124,18.75,4.69,15.62,9.4,14.06,0.1397,0.055,37.2853
8,8,16193,37.5,22.5,42.5,7.5,5.0,,,0.0
9,9,16513,100.0,7.14,,,,,,0.0


In [6]:
product_df.describe(include="all").round(2)

Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
count,320772.0,320749.0,243891.0,229554.0,244971.0,200886.0,259922.0,255510.0,255463.0,320772.0
unique,,320579.0,,,,,,,,
top,,70650800367.0,,,,,,,,
freq,,3.0,,,,,,,,
mean,160385.5,,12.73,5.13,16.0,2.86,7.08,2.03,0.8,65.86
std,92599.04,,17.58,8.01,22.33,12.87,8.41,128.27,50.5,32.09
min,0.0,,0.0,0.0,-17.86,-6.7,-800.0,0.0,0.0,0.0
25%,80192.75,,0.0,0.0,1.3,0.0,0.7,0.06,0.02,41.91
50%,160385.5,,5.0,1.79,5.71,1.5,4.76,0.58,0.23,75.67
75%,240578.25,,20.0,7.14,24.0,3.6,10.0,1.37,0.54,94.15


In [7]:
sorted_product_df = product_df.sort_values("fat_100g", ascending=False)
product_df.head(10)

Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
0,0,3087,,,,,,,,100.0
1,1,4530,28.57,28.57,14.29,3.6,3.57,0.0,0.0,21.4
2,2,4559,17.86,0.0,17.86,7.1,17.86,0.635,0.25,38.435
3,3,16087,57.14,5.36,3.57,7.1,17.86,1.22428,0.482,7.26372
4,4,16094,1.43,,,5.7,8.57,,,84.3
5,5,16100,18.27,1.92,11.54,7.7,13.46,,,47.11
6,6,16117,,,,,8.89,,,91.11
7,7,16124,18.75,4.69,15.62,9.4,14.06,0.1397,0.055,37.2853
8,8,16193,37.5,22.5,42.5,7.5,5.0,,,0.0
9,9,16513,100.0,7.14,,,,,,0.0


Articles avec un code manquant

In [8]:
mask = product_df["code"].isna()
print(mask.value_counts())
display(product_df[mask])

code
False    320749
True         23
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
189068,189068,,,,,,,,,100.0
189103,189103,,,,,,0.137,,,99.863
189109,189109,,,,,,,,,100.0
189119,189119,,,,,,0.122,,,99.878
189152,189152,,,,,,0.158,,,99.842
189160,189160,,,,,,0.156,,,99.844
189162,189162,,,,,,0.158,,,99.842
189168,189168,,,,,,0.12,,,99.88
189242,189242,,,,,,,,,100.0
189244,189244,,,,,,,,,100.0


In [12]:
negative_mask = (
    (product_df["fat_100g"] < 0)
    |
    (product_df["saturated-fat_100g"] < 0)
    |
    (product_df["sugars_100g"] < 0)
    |
    (product_df["fiber_100g"] < 0)
    |
    (product_df["proteins_100g"] < 0)
    |
    (product_df["salt_100g"] < 0)
    |
    (product_df["sodium_100g"] < 0)
    |
    (product_df["autre"] < 0)
)

In [9]:
columns_to_check = ["fat_100g",
    "saturated-fat_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    "sodium_100g",
    "autre",
]

negative_mask = (product_df[columns_to_check].fillna(0) < 0).any(axis=1)


In [13]:
negative_mask.value_counts()

False    320761
True         11
Name: count, dtype: int64

In [11]:
product_df[negative_mask]

Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
8582,8582,11213420608,0.0,0.0,-1.2,1.2,2.41,0.38354,0.151,97.05546
18209,18209,21130493432,0.8,0.0,-0.8,0.8,0.8,0.87376,0.344,97.18224
23784,23784,28400231053,33.33,13.33,0.0,-6.7,,6.43382,2.533,51.07318
33781,33781,36800416727,46.43,8.93,3.57,3.6,-3.57,0.99822,0.393,39.64878
115310,115310,4029816,0.0,,,,-500.0,25.4,10.0,564.6
117739,117739,608866999263,3.57,0.0,-3.57,3.6,7.14,0.9525,0.375,87.9325
146284,146284,789280259062,13.33,3.33,-6.67,6.7,,2.032,0.8,80.478
150858,150858,813922021028,6.25,1.25,-6.25,1.2,1.25,1.1938,0.47,94.6362
164030,164030,856336001538,21.43,3.57,-17.86,17.9,17.86,1.93294,0.761,54.40606
169119,169119,875208001230,0.0,,0.0,,-800.0,7.62,3.0,889.38


Valeurs plus grandes que 100

In [14]:
more_than_100_mask = (product_df[columns_to_check].fillna(0) > 100).any(axis=1)

In [16]:
print(more_than_100_mask.value_counts())
display(product_df[more_than_100_mask])

False    320593
True        179
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
501,501,2124810206,0.00,,,,0.00,105.83418,41.667000,0.0
1435,1435,10300335290,45.71,8.57,5.71,5.7,17.14,870.85678,342.857000,0.0
3472,3472,11110825261,0.00,0.00,,,0.00,101.60000,40.000000,0.0
7037,7037,11161036074,0.00,,,,0.00,101.60000,40.000000,0.0
7983,7983,11213053264,0.77,0.00,2.31,3.8,4.62,781.53768,307.692000,0.0
...,...,...,...,...,...,...,...,...,...,...
303101,303101,8005305900255,101.00,15.00,0.00,,0.00,0.00000,0.000000,0.0
305054,305054,8032942610032,105.00,0.30,2.50,,12.00,0.01000,0.003937,0.0
305114,305114,8033224177274,0.00,,,,0.00,106.68000,42.000000,0.0
313506,313506,8710573641501,,550.00,3520.00,5380.0,430.00,0.01016,0.004000,0.0


In [29]:
# product_df["fat_100g"] + product_df["saturated-fat_100g"] + ...

columns_to_sum = ["fat_100g",
    # "saturated-fat_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    # "sodium_100g",
    "autre",
]

product_df["sum"] = product_df[columns_to_sum].sum(axis=1)
different_than_100_mask = (
    (product_df["sum"] < 99)
    |
    (product_df["sum"] > 101)
)
print(different_than_100_mask.value_counts())

sum
False    172835
True     147937
Name: count, dtype: int64


In [26]:
display(product_df[different_than_100_mask])

Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre,sum
8,8,16193,37.50,22.50,42.5,7.5,5.00,,,0.0,115.000000
9,9,16513,100.00,7.14,,,,,,0.0,107.140000
13,13,16933,42.86,4.76,,38.1,19.05,0.09652,0.038000,0.0,104.904520
14,14,17497,48.48,9.09,,15.2,30.30,0.57658,0.227000,0.0,103.873580
25,25,18371,,,,,,96.15678,37.857000,0.0,134.013780
...,...,...,...,...,...,...,...,...,...,...,...
320356,320356,9403142000890,40.90,18.30,39.5,,9.00,0.15494,0.061000,0.0,107.915940
320392,320392,9415077412907,27.70,25.30,58.8,,3.30,0.29210,0.115000,0.0,115.507100
320451,320451,9421025231179,,10.40,57.4,0.0,10.10,45.00000,17.716535,0.0,140.616535
320578,320578,9556041130943,92.00,86.00,0.0,0.0,0.00,0.00000,0.000000,0.0,178.000000


Fat et saturated-fat

In [31]:
mask = product_df["fat_100g"] < product_df["saturated-fat_100g"]
print(mask.value_counts())
display(product_df[mask])


False    320418
True        354
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre,sum
2322,2322,11110372680,0.00,1.25,65.00,5.0,0.00,0.00000,0.000000,28.750000,98.750000
3450,3450,11110823564,1.31,1.62,,,,0.06096,0.024000,96.985040,98.356000
5839,5839,11150253642,9.21,21.05,47.37,2.6,2.63,0.43434,0.171000,16.534660,78.779000
5840,5840,11150253659,9.21,21.05,47.37,2.6,2.63,0.43434,0.171000,16.534660,78.779000
6320,6320,11150940290,1.79,5.36,0.00,0.0,3.57,1.04394,0.411000,87.825060,94.229000
...,...,...,...,...,...,...,...,...,...,...,...
316996,316996,8851613101392,18.89,19.00,20.00,0.0,16.00,0.05000,0.019685,26.040315,80.980315
317148,317148,8853662023236,0.00,2.00,8.50,,,0.75438,0.297000,88.448620,97.703000
317432,317432,88936000463,0.88,4.42,1.77,0.0,15.93,0.89916,0.354000,75.746840,95.226000
317655,317655,8935054704463,0.00,20.00,0.00,20.0,0.00,0.00000,0.000000,60.000000,80.000000


In [36]:
salt_mask = (
    (product_df["salt_100g"] < (product_df["sodium_100g"] * 2.535))
    |
    (product_df["salt_100g"] > (product_df["sodium_100g"] * 2.545))
)
print(salt_mask.value_counts())
display(product_df[salt_mask])

False    320124
True        648
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre,sum
181,181,4302544,1.110,0.333,1.110,2.220,10.00,1.520,0.6000,83.1070,99.0670
185,185,8237798,6.790,2.860,0.714,0.357,5.36,0.499,0.1960,83.2240,96.9440
702,702,79150,10.800,4.620,1.540,4.620,15.40,1.480,0.5850,60.9550,94.7950
977,977,9300003346,0.000,,,,0.00,2.810,1.1100,96.0800,98.8900
1351,1351,98724,26.300,2.630,15.800,5.260,5.26,0.134,0.0526,44.5634,97.3174
...,...,...,...,...,...,...,...,...,...,...,...
317576,317576,8906021120487,12.800,0.000,0.000,11.800,14.20,0.163,0.0640,60.9730,99.9360
317718,317718,8991002115101,12.500,12.500,60.000,,,0.191,0.0750,14.7340,87.4250
318292,318292,9019100218506,3.500,2.100,4.700,0.000,3.30,0.102,0.0400,86.2580,97.8600
318380,318380,90457388,42.500,15.000,37.500,2.500,7.50,0.191,0.0750,0.0000,90.1910


Bonus : reprendre l'étude d'une typologie de valeurs aberrantes avec Dask.

In [38]:
import dask.dataframe as dd

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [39]:
product_dask_df = dd.read_csv(product_file_path, dtype={"code": "object"})

In [44]:
product_dask_df.describe().compute()

Unnamed: 0.1,Unnamed: 0,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
count,320772.0,243891.0,229554.0,244971.0,200886.0,259922.0,255510.0,255463.0,320772.0
mean,160385.5,12.730379,5.129932,16.003484,2.862111,7.07594,2.028624,0.798815,65.861511
std,92599.044612,17.578747,8.014238,22.327284,12.867578,8.409054,128.269454,50.504428,32.091021
min,0.0,0.0,0.0,-17.86,-6.7,-800.0,0.0,0.0,0.0
25%,80192.75,0.0,0.0,1.3,0.0,0.7,0.0635,0.025,41.907242
50%,160385.5,5.0,1.79,5.71,1.5,4.76,0.58166,0.229,75.670259
75%,240578.25,20.0,7.14,24.0,3.6,10.0,1.37414,0.541,94.145336
max,320771.0,714.29,550.0,3520.0,5380.0,430.0,64312.8,25320.0,889.38


In [45]:
negative_mask = (product_dask_df[columns_to_check] < 0).any(axis=1)
neg_dask_df = product_dask_df[negative_mask]

In [46]:
print(neg_dask_df)

Dask DataFrame Structure:
              Unnamed: 0    code fat_100g saturated-fat_100g sugars_100g fiber_100g proteins_100g salt_100g sodium_100g    autre
npartitions=1                                                                                                                   
                   int64  string  float64            float64     float64    float64       float64   float64     float64  float64
                     ...     ...      ...                ...         ...        ...           ...       ...         ...      ...
Dask Name: getitem, 6 graph layers


In [47]:
neg_dask_df.compute()

Unnamed: 0.1,Unnamed: 0,code,fat_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,autre
8582,8582,11213420608,0.0,0.0,-1.2,1.2,2.41,0.38354,0.151,97.05546
18209,18209,21130493432,0.8,0.0,-0.8,0.8,0.8,0.87376,0.344,97.18224
23784,23784,28400231053,33.33,13.33,0.0,-6.7,,6.43382,2.533,51.07318
33781,33781,36800416727,46.43,8.93,3.57,3.6,-3.57,0.99822,0.393,39.64878
115310,115310,4029816,0.0,,,,-500.0,25.4,10.0,564.6
117739,117739,608866999263,3.57,0.0,-3.57,3.6,7.14,0.9525,0.375,87.9325
146284,146284,789280259062,13.33,3.33,-6.67,6.7,,2.032,0.8,80.478
150858,150858,813922021028,6.25,1.25,-6.25,1.2,1.25,1.1938,0.47,94.6362
164030,164030,856336001538,21.43,3.57,-17.86,17.9,17.86,1.93294,0.761,54.40606
169119,169119,875208001230,0.0,,0.0,,-800.0,7.62,3.0,889.38
