# Imports

In [31]:
import io
import shutil
import glob
import requests
from zipfile import ZipFile
from tempfile import mkdtemp

import pandas as pd
import numpy as np

In [63]:
# Request zip on AWS
print('load data from AWS')
zip_file_url = 'https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/parcours-data-scientist/P2/fr.openfoodfacts.org.products.csv.zip'
response = requests.get(zip_file_url)

# Unzip file
tempDir = mkdtemp()
print(f'extract to temp dir: {tempDir}')

with ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
    zip_ref.extractall(tempDir)
    
csv = glob.glob(tempDir + '\*.csv')[0]

# Read it in pandas
print('Read csv by pandas')
df = pd.read_csv(csv, delimiter='\t', parse_dates=True, dtype={'code': 'str'})

# Delete temp directory
print('Delete temp Dir')
shutil.rmtree(tempDir, ignore_errors=True)

# Display head of dataframe
df.head()

load data from AWS
extract to temp dir: C:\Users\Papoun\AppData\Local\Temp\tmpsb6q070u
Read csv by pandas


  df = pd.read_csv(csv, delimiter='\t', parse_dates=True, dtype={'code': 'str'})


Delete temp Dir


Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,generic_name,quantity,...,ph_100g,fruits-vegetables-nuts_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g
0,3087,http://world-fr.openfoodfacts.org/produit/0000...,openfoodfacts-contributors,1474103866,2016-09-17T09:17:46Z,1474103893,2016-09-17T09:18:13Z,Farine de blé noir,,1kg,...,,,,,,,,,,
1,4530,http://world-fr.openfoodfacts.org/produit/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Banana Chips Sweetened (Whole),,,...,,,,,,,14.0,14.0,,
2,4559,http://world-fr.openfoodfacts.org/produit/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Peanuts,,,...,,,,,,,0.0,0.0,,
3,16087,http://world-fr.openfoodfacts.org/produit/0000...,usda-ndb-import,1489055731,2017-03-09T10:35:31Z,1489055731,2017-03-09T10:35:31Z,Organic Salted Nut Mix,,,...,,,,,,,12.0,12.0,,
4,16094,http://world-fr.openfoodfacts.org/produit/0000...,usda-ndb-import,1489055653,2017-03-09T10:34:13Z,1489055653,2017-03-09T10:34:13Z,Organic Polenta,,,...,,,,,,,,,,


# Fast Describe

In [42]:
df.shape

(320772, 162)

In [47]:
columns = (df.dtypes).reset_index()

In [51]:
columns.loc[columns.index.isin(np.array([0,3,5,19,20,24,25,26,27,28,35,36,37,38,39,48]))]

Unnamed: 0,index,0
0,code,object
3,created_t,object
5,last_modified_t,object
19,manufacturing_places,object
20,manufacturing_places_tags,object
24,emb_codes,object
25,emb_codes_tags,object
26,first_packaging_code_geo,object
27,cities,object
28,cities_tags,object


In [60]:
pd.to_numeric(df['created_t'], errors='coerce')

0         1.474104e+09
1         1.489070e+09
2         1.489070e+09
3         1.489056e+09
4         1.489056e+09
              ...     
320767    1.490631e+09
320768    1.489059e+09
320769    1.422099e+09
320770    1.492340e+09
320771    1.489073e+09
Name: created_t, Length: 320772, dtype: float64

In [53]:
df.loc[111061:111061, 'code']

111061    01032580700553083103005000171606011016123
Name: code, dtype: object

In [56]:
df.loc[~df['manufacturing_places'].isna(), 'manufacturing_places'].head()

174    Brossard Québec
175    Brossard Québec
177     United Kingdom
180    Brossard Québec
181    Brossard,Québec
Name: manufacturing_places, dtype: object

In [None]:
manufacturing_places