In [1]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
import tqdm
os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [2]:
from importlib import reload

In [3]:
import utils_aws

In [4]:
utils_aws = reload(utils_aws)

# Configurations

In [133]:
conf = {
    "S3_BUCKET": 'glovo-products-dataset-d1c9720d',
    "S3_KEY": 'glovo-foodi-ml.csv',
    "pth_dwn_samples": '/home/ec2-user/SageMaker/data/samples_raw',
    "pth_samples": '/home/ec2-user/SageMaker/data/samples',
    "pth_dwn_images": '/home/ec2-user/SageMaker/data/images',
    "pth_vocab": '/home/ec2-user/SageMaker/foodi-ml/.vocab_cache/foodiml_vocab.json',
    "pth_dataset_json": '/home/ec2-user/SageMaker/data/foodiml/dataset_foodiml.json',
}

# AWS

In [6]:
# AWS classes
aws_con = utils_aws.AWSConnector(conf["S3_BUCKET"])
awstools = utils_aws.AWSTools(aws_con)
aws_basics = utils_aws.AWSBasics(conf["S3_BUCKET"])

In [11]:
#import boto3
#s = boto3.Session(profile_name="some_profile_name")
#c = s.client("sts")
#c.get_caller_identity()

# Download csv (glovo-foodi-ml.csv)

In [7]:
# Download csv
success = aws_basics.download_obj(
    s3_key=conf["S3_KEY"],
    destination=conf["pth_dwn_samples"]
)

Key glovo-foodi-ml.csv not found in S3


In [41]:
# Download image to check permissions of ACL
aws_con.s3_client.download_file(
                conf["S3_BUCKET"],
                'dataset/BBCMSTS_0000056_1229920994.png',
                "/tmp/"
            )

ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden

# Create vocabulary

In [None]:
samples = pd.read_csv(os.path.join(conf['pth_dwn_samples'], conf["S3_KEY"]))

## 1) Create sentences

In [22]:
samples["sentence"] = \
    np.where(samples["product_name"], samples["product_name"].astype(str), "") + " " + \
    np.where(samples["collection_name"], samples["collection_name"].astype(str), "") + " " + \
    np.where(samples["product_description"], samples["product_description"].astype(str), "")

samples["sentence"] = samples["sentence"].str.lower()
sentences = samples["sentence"].values

## 2) Save as partitioned parquet

In [35]:
samples.rename(columns={'Unnamed: 0': 'idx'}, inplace=True)

In [43]:
samples.to_parquet(
    path=conf["pth_dwn_samples"],
    engine="pyarrow",
    index=False,
    partition_cols=["subset"],
)

In [44]:
samples_train = pd.read_parquet(path=os.path.join(conf["pth_dwn_samples"],'subset=train'), engine="pyarrow")

### 1.1) Fit tokenizer

In [26]:
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

In [27]:
from retrieval.data.tokenizer import Tokenizer

In [28]:
tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
vocab = tokenizer.fit(sentences)

100%|██████████| 2887444/2887444 [06:43<00:00, 7162.46it/s]


In [56]:
len(vocab.word2idx)

245967

In [31]:
%%time
# Saving vocabulary
tokenizer.save(conf['pth_vocab'])

CPU times: user 328 ms, sys: 3.99 ms, total: 332 ms
Wall time: 331 ms


### 1.2) Creating dataset parquet

In [183]:
final_samples = samples[["sentence", "s3_path", "subset"]].copy()
final_samples.rename(columns={"sentence": "caption"}, inplace=True)
final_samples.rename(columns={"subset": "split"}, inplace=True)
final_samples = final_samples.reset_index()
final_samples.rename(columns={"index": "img_id"}, inplace=True)

In [186]:
final_samples.to_parquet(
    path=conf["pth_samples"],
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)

In [187]:
#valid_samples = pd.read_parquet(path=os.path.join(conf["pth_samples"],'split=val'), engine="pyarrow")

In [188]:
valid_samples.head()

Unnamed: 0,img_id,caption,s3_path
0,2021210,aceitunas nucete verde rell fco 330grgr encurt...,dataset/HLKTVWG_0011772_943266918.png
1,2021211,combo comal burrito combo comal burrito a tu g...,dataset/YJWRKVT_0014688_307104954.png
2,2021212,"салями пицца сыр моцарелла, колбаса салями , п...",dataset/SKJGFSV_0000017_1315254831.png
3,2021213,powerade fruit punch 20 oz hidratantes nan,dataset/BBYJKXD_0001594_775792642.png
4,2021214,douglas make up nail mask contour nail care 10...,dataset/NZTCKFL_0088223_1867228388.png


In [198]:
dev_samples = final_samples.head(50000).copy()
# test-bucket-glovocds
dev_samples["s3_path"] = "artifacts/002/CUG/images/0000000_0000000_1422139257.png"

In [199]:
dev_samples.to_parquet(
    path=conf["pth_samples"]+"_dev",
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)

In [200]:
conf["pth_samples"]+"_dev"

'/home/ec2-user/SageMaker/data/samples_dev'

In [201]:
from collections import defaultdict
image_ids = list(dev_samples["img_id"].values)
img_dict = {}
annotations = defaultdict(list)
for _, row in tqdm.tqdm(dev_samples.iterrows()):
    img_dict[row["img_id"]] = row["s3_path"]
    annotations[row["img_id"]].extend([row["caption"]])

50000it [00:04, 12345.29it/s]


In [202]:
image_ids[:5]

[0, 1, 2, 3, 4]

In [204]:
img_dict[0]

'artifacts/002/CUG/images/0000000_0000000_1422139257.png'

In [205]:
len(annotations)

50000

In [206]:
row["caption"]

'albenzol susx100mgx5mlx20ml parasitos - lombrices nan'

In [207]:
len(annotations)

50000

In [208]:
list(annotations.keys())[:5]

[0, 1, 2, 3, 4]

In [209]:
s3_key = img_dict[0]
bucket_name = "test-bucket-glovocds"

In [210]:
# Boto 3
session = boto3.Session()
s3_resource = session.resource('s3')
bucket = s3_resource.Bucket(bucket_name)