# Imports

In [None]:
import glob
import io
import os
import shutil
from tempfile import mkdtemp
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests

# Loading the dataset

In [None]:
# Download the zipped products export from AWS
print('load data from AWS')
zip_file_url = 'https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/parcours-data-scientist/P2/fr.openfoodfacts.org.products.csv.zip'
response = requests.get(zip_file_url)
# Fail here on an HTTP error rather than later, when ZipFile would choke on
# an error page that is not a zip archive.
response.raise_for_status()

# Unzip into a throwaway directory
tempDir = mkdtemp()
print(f'extract to temp dir: {tempDir}')

# try/finally guarantees the temp directory is removed even when the
# extraction or the CSV parsing raises.
try:
    with ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        zip_ref.extractall(tempDir)

    # Build the pattern with os.path.join: a hard-coded backslash separator
    # only matches on Windows and leaves the list empty elsewhere.
    csv_paths = glob.glob(os.path.join(tempDir, '*.csv'))
    if not csv_paths:
        raise FileNotFoundError(f'no .csv file found in archive {zip_file_url}')
    csv = csv_paths[0]

    # Read it in pandas (tab-separated); the listed columns are read as
    # strings instead of letting pandas infer their dtype.
    print('Read csv by pandas')
    df = pd.read_csv(csv, delimiter='\t',
                     parse_dates=True,
                     dtype={'code': 'str',
                            'manufacturing_places': 'str',
                            'manufacturing_places_tags': 'str',
                            'emb_codes': 'str',
                            'emb_codes_tags': 'str',
                            'cities': 'str',
                            'cities_tags': 'str',
                            'allergens': 'str',
                            'allergens_fr': 'str',
                            'traces': 'str',
                            'traces_tags': 'str',
                            'traces_fr': 'str',
                            'ingredients_from_palm_oil_tags': 'str',
                            'first_packaging_code_geo': 'str'
                           })
finally:
    # Delete temp directory (best effort: removal errors are ignored)
    print('Delete temp Dir')
    shutil.rmtree(tempDir, ignore_errors=True)

# Display head of dataframe
df.head()

In [None]:
# Store the numeric conversion back on the dataframe. Calling pd.to_numeric
# without assigning the result leaves the columns in df unchanged.
# errors='coerce' turns values that cannot be parsed as numbers into NaN.
df['created_t'] = pd.to_numeric(df['created_t'], errors='coerce')
df['last_modified_t'] = pd.to_numeric(df['last_modified_t'], errors='coerce')

# Cell output: the converted last_modified_t column
df['last_modified_t']

# Quick overview

In [None]:
# (number of rows, number of columns) of the dataframe as loaded
df.shape

## Remove empty and unused columns

In [None]:
# Names of the columns whose count of missing values exceeds 70% of the rows
missing_per_column = df.isna().sum(axis=0)
max_missing_rows = df.shape[0]*0.7
empties_columns = missing_per_column[missing_per_column > max_missing_rows].index.to_list()

In [None]:
# Columns to discard in addition to the mostly-empty ones:
# the record creator and the creation / last-modification timestamps.
unused_columns = [
    'creator',
    'created_t',
    'created_datetime',
    'last_modified_t',
    'last_modified_datetime',
]

In [None]:
# Working copy of df without the mostly-empty and the unused columns
df_subset = df.drop(columns=[*empties_columns, *unused_columns])

In [None]:
# (number of rows, number of columns) left after dropping columns
df_subset.shape

In [None]:
# Fraction of missing values per remaining column, most incomplete first
(
    df_subset
    .isna()
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Display the 'caprylic-acid_100g' column of the reduced dataframe.
# NOTE(review): df_subset no longer contains columns with more than 70%
# missing values; if this column was one of them, this lookup raises
# KeyError — confirm whether that is the intended demonstration.
df_subset.loc[:, 'caprylic-acid_100g']