In [2]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
import tqdm
# Switch the working directory to the project's notebooks folder.
# Presumably this is what lets the local `utils_aws` module be imported in a later cell — TODO confirm.
# NOTE(review): absolute, machine-specific path; the notebook only runs on a host with this layout.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [3]:
from importlib import reload

In [4]:
import utils_aws

In [5]:
# Re-import utils_aws so that edits made to the module on disk are picked up
# without restarting the kernel.
utils_aws = reload(utils_aws)

# Configurations

In [119]:
# Central settings for the notebook: S3 locations and local working paths.
conf = dict(
    # S3 bucket that every AWS helper is pointed at
    S3_BUCKET='test-bucket-glovocds',
    # key prefix under which the cities are listed and their CSVs are looked up
    S3K_imgs='artifacts/002/',
    # name of the per-city samples CSV inside the bucket
    S3_file_samples='training_data.csv',
    # local folder receiving the downloaded CSVs (and samples.parquet)
    pth_dwn_samples='/home/ec2-user/SageMaker/data/samples',
    # local base folder handed to the parallel image downloader
    pth_dwn_images='/home/ec2-user/SageMaker/data/images',
    # where the fitted tokenizer vocabulary is saved
    pth_vocab='/home/ec2-user/SageMaker/foodi-ml/.vocab_cache/foodiml_vocab.json',
    # where the final dataset JSON is written
    pth_dataset_json='/home/ec2-user/SageMaker/data/dataset_foodiml.json',
)

# AWS

In [7]:
# AWS classes
# Helper objects from the project-local utils_aws module, all bound to the configured bucket.
# NOTE(review): their internals are not visible from this notebook.
aws_con = utils_aws.AWSConnector(conf["S3_BUCKET"])
# Built on top of the connector; used below to list cities and download the per-city CSVs.
awstools = utils_aws.AWSTools(aws_con)
# Not referenced again in the cells shown in this notebook.
aws_basics = utils_aws.AWSBasics(conf["S3_BUCKET"])

# List cities

In [8]:
# List the cities available under the configured S3 key prefix
# (presumably one sub-folder per city code — TODO confirm in utils_aws).
l_cities = awstools.create_list_cities(conf['S3K_imgs'])
print(l_cities)

['BCN', 'CUG']


# Download samples CSV

In [9]:
# Download each city's samples CSV from S3 into the local samples folder.
# Judging by the log output, a city whose CSV key is missing in S3 is dropped from
# the returned list, and each file is saved locally as <CITY>.csv.
l_cities = awstools.downloading_city_csv(
    l_cities=l_cities,
    s3_key_prefix=conf['S3K_imgs'],
    csv_name=conf['S3_file_samples'],
    local_folder=conf['pth_dwn_samples'],
    verbose=True
)

Key artifacts/002/BCN/training_data.csv not found in S3
Removing from l_cities city BCN
City CUG correctly downloaded to /home/ec2-user/SageMaker/data/samples/CUG.csv


# Read and concatenate CSV

In [10]:
# Collect the CSV files present in the samples folder (anything not ending in .csv is ignored).
# os.listdir returns entries in arbitrary order, so the names are sorted to keep the
# concatenation order of the cities the same from one run/machine to the next.
l_csv = sorted(
    file_name
    for file_name in os.listdir(conf['pth_dwn_samples'])
    if file_name.endswith(".csv")
)

In [11]:
# Initialise `samples` as an empty frame so the name exists before the per-city CSVs are read.
samples = pd.DataFrame()

In [12]:
# Read every city CSV, tag its rows with the city code, and combine them into `samples`.
# The frames are collected in a list and concatenated once: calling pd.concat inside the
# loop re-copies all previous rows on every iteration, and re-running the cell would
# append the same cities a second time.
city_frames = []
for city_csv_file in tqdm.tqdm(l_csv):
    path_csv = os.path.join(conf['pth_dwn_samples'], city_csv_file)
    df_city_csv = pd.read_csv(path_csv)
    # City code = file name without the ".csv" suffix (e.g. "CUG.csv" -> "CUG")
    df_city_csv.insert(loc=0,
                       column="city",
                       value=city_csv_file.split(".csv")[0])
    city_frames.append(df_city_csv)

# ignore_index=True yields a single unique 0..n-1 index. Without it every city restarts
# at 0, and the index labels are later used as `imgid`, which would then collide.
# pd.concat raises on an empty list, so fall back to an empty frame when no CSV was found.
samples = pd.concat(city_frames, ignore_index=True) if city_frames else pd.DataFrame()

100%|██████████| 1/1 [00:00<00:00, 23.62it/s]


In [13]:
# Persist the combined samples next to the downloaded CSVs so later sections can
# reload them from samples.parquet instead of re-reading every CSV.
samples.to_parquet(os.path.join(conf['pth_dwn_samples'], "samples.parquet"), engine = "pyarrow")

# Download images specified in samples CSV

In [80]:
# Reload the combined samples written by the previous section.
# NOTE(review): the execution counts jump here (13 -> 80), which suggests this cell was
# used as a restart point for the image-download section — confirm before relying on it.
samples = pd.read_parquet(os.path.join(conf['pth_dwn_samples'], "samples.parquet"), engine = "pyarrow")

In [81]:
# Creates the folder to dump images
# (per the original author's note; the constructor lives in utils_aws and is not visible here).
# base_path is the local images directory from the configuration.
img_dwn_paral = utils_aws.ImageDownloaderParallelS3(
    base_path=conf['pth_dwn_images']
)

In [82]:
# Create iterable of jobs and modify img_path column
# `jobs` is what gets mapped over the worker pool below; `samples` is rebound to the
# frame returned by create_jobs (the original frame is no longer reachable under this name).
jobs, samples = img_dwn_paral.create_jobs(samples)

In [30]:
# make a process pool to do the work
# NOTE(review): the pool is only closed and joined in the following cell, so that cell
# must be run to release the worker processes.
pool = multiprocessing.Pool(
    multiprocessing.cpu_count(),  # processes: one worker per CPU reported by the OS
    img_dwn_paral.initialize,  # initializer: called once in each worker process when it starts
    (conf['S3_BUCKET'],)  # initargs: the bucket name is passed to the initializer
)

In [31]:
%%time
pool.map(img_dwn_paral.download_images, jobs)
pool.close()
pool.join()

CPU times: user 48.9 ms, sys: 18.9 ms, total: 67.8 ms
Wall time: 1min 29s


# Create vocabulary

## Create sentences

In [94]:
# Build one lower-cased caption per sample: "<product_name> <collection_name> <product_descr>".
# Missing values are replaced explicitly with "". A truthiness test (np.where(col, col, ""))
# does not catch NaN, because NaN is truthy, and NaN + " " then fails with a TypeError.
# The single-space separators are kept even when a field is empty, as before.
samples["sentence"] = (
    samples["product_name"].fillna("").astype(str) + " " +
    samples["collection_name"].fillna("").astype(str) + " " +
    samples["product_descr"].fillna("").astype(str)
).str.lower()

In [95]:
# Array of the sentence strings; this is what the tokenizer is fitted on below.
sentences = samples["sentence"].values

## Fit tokenizer

In [15]:
# Move to the repository root; presumably so the `retrieval` package imported in the
# next cell can be resolved — TODO confirm.
# NOTE(review): absolute, machine-specific path.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

In [17]:
from retrieval.data.tokenizer import Tokenizer

In [18]:
# Fresh tokenizer with no saved vocabulary (vocab_path=None); the following cell shows it
# starts with only the four special tokens. In the recorded run, download_tokenizer=True
# fetched the NLTK "punkt" package.
tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [20]:
# Inspect the word -> index mapping before fitting (only the special tokens at this point).
tokenizer.vocab.word2idx

{'<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3}

In [37]:
# Fit the tokenizer on all sample sentences; the returned value is kept in `vocab`,
# which is not used again in the cells shown here.
# NOTE(review): this cell's execution count (37) precedes the sentence-building cells
# (94/95), so the recorded run relied on an earlier definition of `sentences`.
# On a fresh kernel, run the notebook top to bottom.
vocab = tokenizer.fit(sentences)

100%|██████████| 8011/8011 [00:01<00:00, 7161.87it/s]


In [42]:
# Save the fitted vocabulary to the path configured under conf['pth_vocab'].
tokenizer.save(conf['pth_vocab'])

# Create dataset JSON

In [102]:
def clean_special_characters(sentence):
    """Remove isolated punctuation/symbol tokens from a tokenized sentence.

    Only tokens that are exactly one of the listed characters are dropped; tokens that
    merely contain such a character (e.g. "c.o.d") are kept.

    Args:
        sentence: list of tokens. It is modified in place.

    Returns:
        The same list object, with every occurrence of the special-character tokens removed.
    """
    # Clean isolated characters
    remove_chars = ['.', '-', ')', '(','[', ']','{','}','?','!','€','$','#','@','*', '/']
    # list.remove() only deletes the first match, which left repeated tokens behind.
    # Filter every occurrence instead. Slice assignment keeps the in-place contract.
    sentence[:] = [token for token in sentence if token not in remove_chars]
    return sentence

In [110]:
# Skeleton of the dataset file: an (initially empty) list of image entries
# plus the dataset name.
samples_dataset = dict(images=[], dataset="foodiml")

In [112]:
# Build one dataset entry per sample row: the tokenized caption, the raw caption, and the
# image file name (last component of img_path). Every sample is placed in the "train" split.
# `imgid` is the DataFrame index label of the row, so it is unique only if the index is.
# The entries are collected in a fresh list and assigned at the end, so re-running the
# cell replaces the list instead of appending every entry a second time.
image_entries = []
# total= lets tqdm show a real progress bar; iterrows() alone has no length.
for i, row in tqdm.tqdm(samples.iterrows(), total=len(samples)):
    raw_sentence = row["sentence"]
    filename = row["img_path"].split("/")[-1]
    image_entries.append({
        "imgid": i,
        "sentences": [
            {
                "tokens": tokenizer.split_sentence(raw_sentence),
                "raw": raw_sentence,
                "imgid": i,
            }
        ],
        "split": "train",
        "filename": filename,
    })

samples_dataset["images"] = image_entries

8011it [00:02, 3724.56it/s]


In [121]:
# Save dataset_foodiml.json
# Serialise the whole dataset dict to the path configured under conf['pth_dataset_json'];
# the `with` block closes the file once the dump has finished.
with open(conf['pth_dataset_json'], "w") as f:
    json.dump(samples_dataset, f)