In [1]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
from tqdm import tqdm
import shutil

#os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [3]:
# NOTE(review): presumably the working directory is switched to the repository
# root so that the project-local `retrieval` package resolves on import -- confirm.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")
from retrieval.data.tokenizer import Tokenizer

# Exploration of execution

Execution 

```{bash}
cd /home/ec2-user/SageMaker/foodi-ml
source activate python3
export DATA_PATH=/home/ec2-user/SageMaker/dataset/
python run.py options/adapt/foodi-ml/i2t.yaml

#watch -n 1 "nvidia-smi"
```

# Configurations

In [4]:
# Root locations on the SageMaker instance and the name of the raw CSV export.
PATH_DATA = "/home/ec2-user/SageMaker/dataset/"
PATH_FOODI = "/home/ec2-user/SageMaker/foodi-ml"
DATASET_CSV = "glovo-foodi-ml-dataset.csv"

# Single lookup table for the S3 names and filesystem paths used by the cells below.
# Upper-case keys are derived from the constants above; the `pth_*` keys are
# spelled out as literal absolute paths.
conf = dict(
    S3_BUCKET="glovo-products-dataset-d1c9720d",
    S3_KEY_DATASET=DATASET_CSV,
    LOCAL_RAW_DATASET=os.path.join(PATH_DATA, DATASET_CSV),
    LOCAL_DATASET=os.path.join(PATH_DATA, "samples"),
    LOCAL_IMAGES=os.path.join(PATH_DATA, "dataset"),
    LOCAL_VOCAB=os.path.join(PATH_FOODI, ".vocab_cache/foodiml_vocab.json"),
    pth_dwn_samples="/home/ec2-user/SageMaker/dataset/",
    pth_dwn_images="/home/ec2-user/SageMaker/dataset/dataset/",
    pth_vocab="/home/ec2-user/SageMaker/foodi-ml/.vocab_cache/foodiml_vocab.json",
    pth_dataset_json="/home/ec2-user/SageMaker/dataset/dataset_foodiml.json",
)

# Read all samples

In [5]:
# Load the raw CSV export; the configured value is already a complete path.
samples = pd.read_csv(conf["LOCAL_RAW_DATASET"])

KeyboardInterrupt: 

In [102]:
# Show the s3_path value stored for the first row.
samples["s3_path"].iloc[0]

'/home/ec2-user/SageMaker/dataset/dataset/NZTCKFL_0017467_1193055503.png'

## 1) Create sentences

In [103]:
# Build one lower-cased caption per row from the three free-text columns.
# Missing values are mapped to "" before the string cast: the earlier
# np.where(column, column.astype(str), "") evaluated NaN as truthy, so absent
# fields ended up in the caption as the literal text "nan".
def _text_or_empty(column):
    """Return `column` cast to str, with missing values replaced by ""."""
    return column.fillna("").astype(str)


samples["sentence"] = (
    _text_or_empty(samples["product_name"])
    + " "
    + _text_or_empty(samples["collection_section"])
    + " "
    + _text_or_empty(samples["product_description"])
).str.lower()

# Give the unnamed CSV index column a name; reassigning (instead of
# inplace=True) keeps the cell safe to re-run.
samples = samples.rename(columns={"Unnamed: 0": "idx"})

### 1.1) Fit tokenizer

In [53]:
# Same chdir + import as near the top of the notebook, this time through the
# PATH_FOODI constant.
# NOTE(review): presumably needed so `retrieval` is importable from the repo root -- confirm.
os.chdir(PATH_FOODI)
from retrieval.data.tokenizer import Tokenizer

In [54]:
# 1) Get all sentences as an array
sentences = samples["sentence"].values

# 2) Create a tokenizer with no pre-built vocabulary (vocab_path=None).
#    The vocabulary is fitted on `sentences` in the following cell
#    (CAREFUL: fitting takes 6-7 min)
tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
# Fit the tokenizer on every sentence and keep the object it returns as `vocab`
vocab = tokenizer.fit(sentences)

100%|██████████| 2887444/2887444 [06:46<00:00, 7094.59it/s]


In [74]:
# Number of entries in the fitted vocabulary's word2idx mapping
len(vocab.word2idx)

245967

In [57]:
%%time
# 3) Save the vocabulary to the path configured under conf['LOCAL_VOCAB']
tokenizer.save(conf['LOCAL_VOCAB'])

CPU times: user 334 ms, sys: 0 ns, total: 334 ms
Wall time: 341 ms


In [75]:
# 4) Load the vocabulary from disk if it was already saved.
# NOTE(review): `load` is invoked on the existing instance and its return value
# replaces `tokenizer`; the object returned exposes `.vocab` (used below) --
# confirm it is a full Tokenizer.
tokenizer = tokenizer.load(conf['LOCAL_VOCAB'])
len(tokenizer.vocab)

245967

In [59]:
# Alternative way of loading: hand the saved vocabulary path to the constructor.
# conf["pth_vocab"] and conf["LOCAL_VOCAB"] resolve to the same file.
tokenizer_2 = Tokenizer(vocab_path=conf["pth_vocab"], download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
# Vocabulary size of the tokenizer created from the saved file
len(tokenizer_2.vocab)

245967

### 1.2) Creating dataset parquet

In [104]:
# Restrict to the columns needed for modelling. reset_index() turns the current
# row index into an "index" column, which becomes the image id.
final_samples = (
    samples[["sentence", "s3_path", "subset"]]
    .reset_index()
    .rename(
        columns={
            "sentence": "caption",
            "subset": "split",
            "index": "img_id",
        }
    )
)

In [105]:
# Delete the output of any earlier run so the parquet dataset is written from scratch
_previous_dataset_dir = conf["LOCAL_DATASET"]
if os.path.exists(_previous_dataset_dir):
    shutil.rmtree(_previous_dataset_dir)

In [106]:
# Write the samples as a parquet dataset with one partition per value of "split";
# the DataFrame index is not stored.
final_samples.to_parquet(
    conf["LOCAL_DATASET"], engine="pyarrow", index=False, partition_cols=["split"]
)

## Creating dataset json

In [6]:
# Read the samples back from the "samples" folder under the download directory
_samples_parquet_dir = os.path.join(conf["pth_dwn_samples"], "samples")
samples = pd.read_parquet(_samples_parquet_dir, engine="pyarrow")

In [7]:
# Show the s3_path value of the first row after the parquet reload
samples["s3_path"].iloc[0]

'/home/ec2-user/SageMaker/dataset/dataset/CBNGZLF_0013607_1049265587.png'

In [8]:
# List the columns of the reloaded samples
samples.columns

Index(['img_id', 'caption', 's3_path', 'split'], dtype='object')

## Samples for reproducibility

In [13]:
# Distinct values found in the split column
samples.split.unique()

['test', 'train', 'val']
Categories (3, object): ['test', 'train', 'val']

In [14]:
# Draw a fixed-size random subset from each split for the mock dataset.
# A fixed random_state makes the draw repeatable: without it every run of this
# cell picks different rows, so the mock dataset could not be reproduced.
MOCK_SAMPLE_SEED = 42

samples_train = samples[samples["split"] == "train"].sample(7000, random_state=MOCK_SAMPLE_SEED)
samples_eval = samples[samples["split"] == "val"].sample(1500, random_state=MOCK_SAMPLE_SEED)
samples_test = samples[samples["split"] == "test"].sample(1500, random_state=MOCK_SAMPLE_SEED)

In [15]:
# Stack the train / val / test draws into one frame
mock_dataset = pd.concat(objs=[samples_train, samples_eval, samples_test])

In [16]:
# Columns of the combined mock dataset
mock_dataset.columns

Index(['img_id', 'caption', 's3_path', 'split'], dtype='object')

In [20]:
# Copy the image behind every mock-dataset row into a dedicated folder.
mock_images_dir = "/home/ec2-user/SageMaker/dataset/mock_dataset"

# Make sure the folder exists first: when the destination is not an existing
# directory, shutil.copy treats it as a file name, so every image would be
# written to the same path and overwrite the previous one.
os.makedirs(mock_images_dir, exist_ok=True)

image_paths = mock_dataset.s3_path.to_list()
for item in image_paths:
    shutil.copy(item, mock_images_dir)

In [21]:
# Export the mock dataset table as CSV next to the dataset folder
mock_dataset.to_csv("/home/ec2-user/SageMaker/dataset/mock_dataset.csv")

In [9]:
##################################

In [120]:
# Skeleton of the JSON document: an empty list of image records plus the dataset name
samples_dataset = dict(images=[], dataset="foodiml")

In [123]:
# Number of rows exported to the JSON file. The earlier loop stopped once the
# index label exceeded 2500, i.e. it kept 2501 rows when the index is 0, 1, 2, ...
MAX_JSON_SAMPLES = 2501


def _to_image_record(img_idx, row):
    """Build the JSON record for one sample row.

    Args:
        img_idx: index label of the row, stored as the record's image id.
        row: a row of `samples`; its "caption", "s3_path" and "split" fields are read.

    Returns:
        dict with the keys "imgid", "sentences", "split" and "filename".
    """
    raw_sentence = row["caption"]
    return {
        # NOTE(review): the id is the DataFrame index label, not the "img_id"
        # column -- confirm this is what consumers of the file expect.
        "imgid": img_idx,
        "sentences": [
            {
                "tokens": tokenizer.split_sentence(raw_sentence),
                "raw": raw_sentence,
                "imgid": img_idx,
            }
        ],
        "split": row["split"],
        # Keep only the file name, dropping the directory part of the path
        "filename": row["s3_path"].split("/")[-1],
    }


# Limit by row count rather than by index label, so the cap also holds when the
# index is not a clean 0..n-1 range.
_json_rows = samples.head(MAX_JSON_SAMPLES)

# Assign the list in one go (instead of appending) so that re-running this cell
# does not duplicate records.
samples_dataset["images"] = [
    _to_image_record(i, row)
    for i, row in tqdm(_json_rows.iterrows(), total=len(_json_rows))
]
    

2501it [00:01, 2189.07it/s]


In [124]:
# Write the assembled dataset description to foodiml_json.json.
# NOTE(review): conf["pth_dataset_json"] points at dataset_foodiml.json instead --
# confirm which of the two files downstream code reads.
_json_output_path = "/home/ec2-user/SageMaker/dataset/foodiml_json.json"
with open(_json_output_path, "w") as json_file:
    json.dump(samples_dataset, json_file)

# GloVe (in case we want to retrain new GloVe embeddings)

In [87]:
!pip install glove_python



In [88]:
from glove import Corpus, Glove

In [89]:
# Create an empty Corpus object; it is fitted on the tokenised captions further down
corpus = Corpus()

In [92]:
# Columns available in samples
samples.columns

Index(['img_id', 'caption', 's3_path', 'split'], dtype='object')

In [93]:
# All captions as a plain Python list
sentences = samples["caption"].to_list()

In [94]:
# Split every caption into tokens; `lines` holds one token sequence per caption
lines = [tokenizer.split_sentence(caption) for caption in tqdm(sentences)]

100%|██████████| 2887444/2887444 [06:27<00:00, 7449.51it/s]


In [95]:
%%time
# Fit the corpus on the tokenised captions (window of 5) to build the
# co-occurrence matrix that GloVe is trained on
corpus.fit(lines, window=5)

CPU times: user 1min 24s, sys: 317 ms, total: 1min 24s
Wall time: 1min 24s


In [96]:
# Hyper-parameters of the GloVe run
GLOVE_DIM = 300  # no_components=300 as in abstract.yaml
GLOVE_EPOCHS = 30
GLOVE_THREADS = 4

# Train on the co-occurrence matrix, attach the word dictionary, then save the model
glove = Glove(no_components=GLOVE_DIM, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=GLOVE_EPOCHS, no_threads=GLOVE_THREADS, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('/home/ec2-user/SageMaker/dataset/glove-foodiml.pkl')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
