In [None]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
import tqdm
import shutil

os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [None]:
from importlib import reload
import utils_aws

In [None]:
utils_aws = reload(utils_aws)

# Configurations

In [None]:
PATH_DATA = '/home/ec2-user/SageMaker/dataset'
PATH_FOODI = '/home/ec2-user/SageMaker/foodi-ml'
DATASET_CSV = 'glovo-foodi-ml-dataset.csv'

conf = {
    "S3_BUCKET": 'glovo-products-dataset-d1c9720d',
    "S3_KEY_DATASET": DATASET_CSV,
    "LOCAL_RAW_DATASET": os.path.join(PATH_DATA, DATASET_CSV),
    "LOCAL_DATASET": os.path.join(PATH_DATA, 'samples'),
    "LOCAL_IMAGES": os.path.join(PATH_DATA, 'dataset'),
    "LOCAL_VOCAB": os.path.join(PATH_FOODI, '.vocab_cache/foodiml_vocab.json'),
}

# AWS

In [5]:
# AWS classes
#aws_con = utils_aws.AWSConnector(conf["S3_BUCKET"])
#awstools = utils_aws.AWSTools(aws_con)
#aws_basics = utils_aws.AWSBasics(conf["S3_BUCKET"])

# Download csv (glovo-foodi-ml-dataset.csv)

In [6]:
#command = f'aws s3 cp s3://{conf["S3_BUCKET"]}/{conf["S3_KEY_DATASET"]} {conf["LOCAL_RAW_DATASET"]} --no-sign-request'
#os.system(command)

0

# Read all samples

In [None]:
samples = pd.read_csv(os.path.join(conf['LOCAL_RAW_DATASET']))

## 1) Create sentences

In [7]:
samples["sentence"] = \
    np.where(samples["product_name"], samples["product_name"].astype(str), "") + " " + \
    np.where(samples["collection_section"], samples["collection_section"].astype(str), "") + " " + \
    np.where(samples["product_description"], samples["product_description"].astype(str), "")

samples["sentence"] = samples["sentence"].str.lower()
samples.rename(columns={'Unnamed: 0': 'idx'}, inplace=True)

### 1.1) Fit tokenizer

In [8]:
os.chdir(PATH_FOODI)
from retrieval.data.tokenizer import Tokenizer

In [9]:
# 1) Get all sentences 
sentences = samples["sentence"].values

# 2) Fit Tokenizer with senteces (CAREFUL, takes 6-7 min)
tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
# Fit tokenize
vocab = tokenizer.fit(sentences)

100%|██████████| 2887444/2887444 [06:49<00:00, 7048.54it/s]


In [11]:
len(vocab.word2idx)

245967

In [12]:
%%time
# 3) Saving vocabulary
tokenizer.save(conf['LOCAL_VOCAB'])

CPU times: user 330 ms, sys: 4.02 ms, total: 334 ms
Wall time: 342 ms


In [None]:
# 4) Load if already saved
tokenizer = tokenizer.load(conf['LOCAL_VOCAB'])
len(tokenizer.vocab)

### 1.2) Creating dataset parquet

In [13]:
# Keep only necessary columns for modelling
final_samples = samples[["sentence", "s3_path", "subset"]].reset_index().copy()
final_samples.rename(columns={"sentence": "caption"}, inplace=True)
final_samples.rename(columns={"subset": "split"}, inplace=True)
final_samples.rename(columns={"index": "img_id"}, inplace=True)

In [14]:
# Remove previous version
if os.path.exists(conf["LOCAL_DATASET"]):
    shutil.rmtree(conf["LOCAL_DATASET"])

In [15]:
# Save as partitioned parquet
final_samples.to_parquet(
    path=conf["LOCAL_DATASET"],
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)

In [82]:
#valid_samples = pd.read_parquet(path=os.path.join(conf["LOCAL_DATASET"],'split=val'), engine="pyarrow")
#valid_samples.head()

### 1.3) Downloading images

In [76]:
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
from multiprocessing import Manager

In [59]:
# Create in images the train/test/valid split
for split in final_samples.split.unique():
    os.makedirs(os.path.join(conf["LOCAL_IMAGES"], split), exist_ok=True)

In [63]:
# TODO: change to all images (remove head)
img_samples = final_samples.head(1000)

In [64]:
img_samples.head()

Unnamed: 0,img_id,caption,s3_path,split
0,0,savital shampoo keratina y sabila 550ml (00001...,dataset/NZTCKFL_0017467_1193055503.png,train
1,1,chocolate layer cake™ promotions - free waffle...,dataset/YKKVTDF_0000672_1629430873.png,train
2,2,"асорті на компанію закуски балик домашній,груд...",dataset/ZFSHHGC_0000633_24100453.png,train
3,3,suav vivere hierbas f jazmin pouch 900cc suavi...,dataset/NZTCKFL_0025186_1197233181.png,train
4,4,leoncio y el doctor veterinario infantil nan,dataset/NZTCKFL_0061335_1296271395.png,train


In [65]:
def download_img(s3_path, local_pth):
    command = f'aws s3 cp s3://{conf["S3_BUCKET"]}/{s3_path} {local_pth} --no-sign-request'
    result = os.system(command)
    if result == 0:
        return True
    else:
        return False

In [68]:
%%time
# ----------------------------- #
#  Threading Download S3 images
# ----------------------------- #
THREADS = 8
d_download = Manager().dict()
images_dict = {}

# Request to Danzai API
with ProcessPoolExecutor(max_workers=THREADS) as executor:

    # Populate the images_dict dictionary
    for _, row in img_samples.iterrows():
        s3_path = row["s3_path"]
        train_test_split = row["split"]
        s3_img = s3_path.split("/")[-1]
        
        # Use the split to download the image to a specific directory
        local_pth = os.path.join(conf["LOCAL_IMAGES"], train_test_split, s3_img)
        
        images_dict[
            executor.submit(download_img, s3_path, local_pth)
        ] = s3_img

        # Start running threads
    for i_future in as_completed(images_dict):
        image_id = images_dict[i_future]
        try:
            d_download[image_id] = i_future.result()
        except Exception:
            d_download[image_id] = False

CPU times: user 208 ms, sys: 29.1 s, total: 29.3 s
Wall time: 1min 52s


In [69]:
dict_results = dict(d_download)

In [79]:
len(os.listdir(os.path.join(conf["LOCAL_IMAGES"], "train")))

992

----------

# GloVE (in case we want to retrain a new GloVE)

In [36]:
!pip install glove_python

Collecting glove_python
  Downloading glove_python-0.1.0.tar.gz (263 kB)
[K     |████████████████████████████████| 263 kB 35.6 MB/s eta 0:00:01
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25ldone
[?25h  Created wheel for glove-python: filename=glove_python-0.1.0-cp36-cp36m-linux_x86_64.whl size=781088 sha256=b49ad1beca1ce5af45cc41f82001a84929ae3124dea2679f996b910044c2fadd
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c2/34/66/a3adc1e41bd5cfe3aa8f75e34b42ca207f8b6e8171b9a4fd61
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [37]:
from glove import Corpus, Glove

In [38]:
#Creating a corpus object
corpus = Corpus() 

In [47]:
sentences = list(final_samples["caption"].values)

In [None]:
lines = []
for sentence in tqdm.tqdm(sentences[:50000]):
    lines.append(tokenizer.split_sentence(sentence))

In [50]:
%%time
#Training the corpus to generate the co occurence matrix which is used in GloVe
corpus.fit(lines, window=10)

CPU times: user 1.29 s, sys: 12.1 ms, total: 1.3 s
Wall time: 1.3 s


In [51]:
glove = Glove(no_components=5, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('/home/ec2-user/SageMaker/data/glove/glove-foodiml.pkl')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
