In [1]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
import tqdm
# Work from the notebooks folder; presumably this is what lets the local
# `utils_aws` module imported in a later cell be found — TODO confirm.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [2]:
from importlib import reload

In [3]:
import utils_aws

In [4]:
# Re-import utils_aws so edits made to the module on disk are picked up
# without restarting the kernel.
utils_aws = reload(utils_aws)

# Configurations

In [37]:
# Notebook-wide settings: the S3 source (bucket + object key) and the local
# paths used for the downloaded samples, the images, the tokenizer vocabulary
# and the generated dataset JSON.
conf = dict(
    S3_BUCKET="glovo-products-dataset-d1c9720d",
    S3_KEY="glovo-foodi-ml.csv",
    pth_dwn_samples="/home/ec2-user/SageMaker/data/samples",
    pth_dwn_images="/home/ec2-user/SageMaker/data/images",
    pth_vocab="/home/ec2-user/SageMaker/foodi-ml/.vocab_cache/foodiml_vocab.json",
    pth_dataset_json="/home/ec2-user/SageMaker/data/foodiml/dataset_foodiml.json",
)

# AWS

In [6]:
# AWS classes
# All three helpers come from the project-local utils_aws module.
# AWSConnector and AWSBasics are each constructed from the configured bucket
# name; AWSTools receives the connector instance.
aws_con = utils_aws.AWSConnector(conf["S3_BUCKET"])
awstools = utils_aws.AWSTools(aws_con)
aws_basics = utils_aws.AWSBasics(conf["S3_BUCKET"])

In [11]:
# Disabled debugging snippet: builds a boto3 session from a named profile and
# asks the STS client for the caller identity. Kept commented out.
#import boto3
#s = boto3.Session(profile_name="some_profile_name")
#c = s.client("sts")
#c.get_caller_identity()

# Download CSV (glovo-foodi-ml.csv)

In [7]:
# Download csv
# Requests the object conf["S3_KEY"] with conf["pth_dwn_samples"] as the
# destination, through the project helper AWSBasics.download_obj. The return
# value is kept in `success` but is not inspected in any visible cell.
# NOTE(review): the recorded output of this cell reports the key as not found
# in S3 — confirm the bucket/key before relying on the read_csv cell.
success = aws_basics.download_obj(
    s3_key=conf["S3_KEY"],
    destination=conf["pth_dwn_samples"]
)

Key glovo-foodi-ml.csv not found in S3


In [41]:
# Download a single image object from the bucket into /tmp.
# boto3's download_file takes the full path of the local *file* to write, not
# a directory, so the destination is built from the base name of the S3 key.
# NOTE(review): the recorded run of this cell failed with a 403 on HeadObject,
# i.e. the credentials in use could not access this key — confirm permissions.
image_key = 'dataset/BBCMSTS_0000056_1229920994.png'
aws_con.s3_client.download_file(
    conf["S3_BUCKET"],
    image_key,
    os.path.join("/tmp", os.path.basename(image_key)),
)

ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden

# Create vocabulary

In [None]:
# Read the samples CSV from <pth_dwn_samples>/<S3_KEY>, i.e. the S3 key is
# reused as the local file name inside the samples folder.
samples = pd.read_csv(os.path.join(conf['pth_dwn_samples'], conf["S3_KEY"]))

## 1) Create sentences

In [22]:
# Build one free-text sentence per sample by joining product name, collection
# name and product description with single spaces.
# Missing values are replaced by "" *before* the string cast. NaN is truthy,
# so a truthiness test such as np.where(column, ...) lets it through and the
# cast then turns it into the literal text "nan".
samples["sentence"] = (
    samples["product_name"].fillna("").astype(str)
    + " "
    + samples["collection_name"].fillna("").astype(str)
    + " "
    + samples["product_description"].fillna("").astype(str)
)

# Lower-case every sentence.
samples["sentence"] = samples["sentence"].str.lower()
# Plain array of the sentences, used later to fit the tokenizer.
sentences = samples["sentence"].values

## 2) Save as partitioned parquet

In [35]:
# Give the unnamed first CSV column an explicit name ("idx"); the frame is
# modified in place.
samples.rename({'Unnamed: 0': 'idx'}, axis="columns", inplace=True)

In [43]:
# Export the samples with pyarrow as a parquet dataset rooted at the samples
# folder, one partition directory per value of the "subset" column, without
# writing the DataFrame index.
samples.to_parquet(
    conf["pth_dwn_samples"],
    partition_cols=["subset"],
    index=False,
    engine="pyarrow",
)

In [44]:
# Read back only the `subset=train` directory under the samples folder.
# NOTE(review): `samples_train` is rebound to an integer row count in a later
# cell of this notebook, so this DataFrame is lost after that point — consider
# a distinct name.
samples_train = pd.read_parquet(path=os.path.join(conf["pth_dwn_samples"],'subset=train'), engine="pyarrow")

### 1.1) Fit tokenizer

In [26]:
# Move to the repository root; presumably so the `retrieval` package imported
# in the next cell resolves — TODO confirm.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

In [27]:
from retrieval.data.tokenizer import Tokenizer

In [28]:
# Fresh tokenizer with no existing vocabulary file (vocab_path=None).
# With download_tokenizer=True the recorded run shows an NLTK "punkt" download
# being attempted.
tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
# Fit the tokenizer on the sentence array and keep the result in `vocab`.
# The recorded run processed ~2.9M sentences in roughly 7 minutes.
vocab = tokenizer.fit(sentences)

100%|██████████| 2887444/2887444 [06:43<00:00, 7162.46it/s]


In [31]:
%%time
# Saving vocabulary
# Calls Tokenizer.save with the configured vocabulary path; presumably this
# writes the fitted vocabulary as JSON — confirm in retrieval.data.tokenizer.
tokenizer.save(conf['pth_vocab'])

CPU times: user 328 ms, sys: 3.99 ms, total: 332 ms
Wall time: 331 ms


### 1.2) Create dataset JSON

In [76]:
def clean_special_characters(sentence):
    """Remove isolated special-character tokens from a tokenized sentence.

    Only tokens that are exactly one of the special characters are dropped;
    tokens that merely contain such a character (e.g. "a-b") are kept.

    Args:
        sentence: Mutable list of tokens. It is modified in place.

    Returns:
        The same list object, with every occurrence of each special-character
        token removed.
    """
    # Clean isolated characters
    remove_chars = ('.', '-', ')', '(', '[', ']', '{', '}', '?', '!', '€', '$', '#', '@', '*', '/')
    # list.remove() drops only the first match, so repeated punctuation would
    # survive. Filtering removes every occurrence, and slice assignment keeps
    # the in-place behaviour of returning the caller's own list.
    sentence[:] = [token for token in sentence if token not in remove_chars]
    return sentence

In [77]:
# Creates the folder to dump images
# (per the note above; the constructor only receives the configured images
# path as base_path — confirm the folder creation in utils_aws).
img_dwn_paral = utils_aws.ImageDownloaderParallelS3(
    base_path=conf['pth_dwn_images']
)
# Create iterable of jobs and modify img_path column
# NOTE(review): `samples` is rebound to the returned frame, so re-running this
# cell feeds already-modified data back into create_jobs — confirm that is safe.
jobs, samples = img_dwn_paral.create_jobs(samples)

In [78]:
# Skeleton of the dataset JSON: an (initially empty) list of per-image entries
# and the dataset name.
samples_dataset = dict(images=[], dataset="foodiml")

In [79]:
# Total number of rows, and how many of them go to training: 70% of the rows,
# truncated to an integer.
sample_size = len(samples)
samples_train = int(0.7 * sample_size)

In [80]:
# Report the planned split sizes: the training row count, and the remaining
# rows as validation.
print("Samples train: ", samples_train)
print("Samples valid: ", sample_size - samples_train)

Samples train:  5607
Samples valid:  2404


In [81]:
# Build one entry per sample for the dataset JSON: a single tokenized sentence
# plus the image file name and the split the sample belongs to.
# The split is decided by row position: the first `samples_train` rows are
# "train", the rest "val". A strict `position < samples_train` keeps exactly
# `samples_train` rows in train (a `> samples_train` test would keep one more),
# and counting positions keeps that true even if the index is not 0..n-1.
# NOTE: entries are appended, so re-running this cell without re-creating
# `samples_dataset` duplicates them.
for position, (i, row) in enumerate(
    tqdm.tqdm(samples.iterrows(), total=len(samples))
):
    raw_sentence = row["sentence"]
    # Keep only the last "/"-separated component of the image path.
    filename = row["img_path"].split("/")[-1]
    sentence_tokens = tokenizer.split_sentence(raw_sentence)
    split = "train" if position < samples_train else "val"
    samples_dataset["images"].append(
        {
            "imgid": i,
            "sentences": [
                {
                    "tokens": sentence_tokens,
                    "raw": raw_sentence,
                    "imgid": i,
                }
            ],
            "split": split,
            "filename": filename,
        }
    )

8011it [00:02, 3946.59it/s]


In [82]:
# Serialize the assembled dataset dictionary and write it to the configured
# dataset JSON path (dataset_foodiml.json).
with open(conf['pth_dataset_json'], "w") as json_file:
    json_file.write(json.dumps(samples_dataset))