In [8]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
import tqdm
os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [1]:
from importlib import reload

In [2]:
import utils_aws

In [3]:
utils_aws = reload(utils_aws)

# Configurations

In [11]:
conf = {
    "S3_BUCKET": 'glovo-products-dataset-d1c9720d',
    "S3_KEY": 'glovo-foodi-ml.csv',
    "pth_samples_csv": '/home/ec2-user/SageMaker/data/glovo-foodi-ml.csv',
    "pth_dwn_samples": '/home/ec2-user/SageMaker/data/samples_raw',
    "pth_samples": '/home/ec2-user/SageMaker/data/samples',
    "pth_dwn_images": '/home/ec2-user/SageMaker/data/images',
    "pth_vocab": '/home/ec2-user/SageMaker/foodi-ml/.vocab_cache/foodiml_vocab.json',
    "pth_dataset_json": '/home/ec2-user/SageMaker/data/foodiml/dataset_foodiml.json',
}

# AWS

In [5]:
# AWS classes
aws_con = utils_aws.AWSConnector(conf["S3_BUCKET"])
awstools = utils_aws.AWSTools(aws_con)
aws_basics = utils_aws.AWSBasics(conf["S3_BUCKET"])

In [6]:
#import boto3
#s = boto3.Session(profile_name="some_profile_name")
#c = s.client("sts")
#c.get_caller_identity()

# Download csv (glovo-foodi-ml.csv)

In [7]:
# Download csv
success = aws_basics.download_obj(
    s3_key=conf["S3_KEY"],
    destination=conf["pth_dwn_samples"]
)

Key glovo-foodi-ml.csv not found in S3


In [41]:
# Download image to check permissions of ACL
aws_con.s3_client.download_file(
                conf["S3_BUCKET"],
                'dataset/BBCMSTS_0000056_1229920994.png',
                "/tmp/"
            )

ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden

# Read all samples

In [12]:
samples = pd.read_csv(os.path.join(conf['pth_samples_csv']))

## 1) Create sentences

In [13]:
samples["sentence"] = \
    np.where(samples["product_name"], samples["product_name"].astype(str), "") + " " + \
    np.where(samples["collection_name"], samples["collection_name"].astype(str), "") + " " + \
    np.where(samples["product_description"], samples["product_description"].astype(str), "")

samples["sentence"] = samples["sentence"].str.lower()
sentences = samples["sentence"].values

## 2) Save as partitioned parquet

In [14]:
samples.rename(columns={'Unnamed: 0': 'idx'}, inplace=True)

In [17]:
samples.to_parquet(
    path=conf["pth_dwn_samples"],
    engine="pyarrow",
    index=False,
    partition_cols=["subset"],
)

In [18]:
#samples_train = pd.read_parquet(path=os.path.join(conf["pth_dwn_samples"],'subset=train'), engine="pyarrow")

### 1.1) Fit tokenizer

In [29]:
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

In [30]:
from retrieval.data.tokenizer import Tokenizer

In [31]:
tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
vocab = tokenizer.fit(sentences)

100%|██████████| 2887444/2887444 [06:43<00:00, 7162.46it/s]


In [56]:
len(vocab.word2idx)

245967

In [31]:
%%time
# Saving vocabulary
tokenizer.save(conf['pth_vocab'])

CPU times: user 328 ms, sys: 3.99 ms, total: 332 ms
Wall time: 331 ms


### 1.2) Creating dataset parquet

In [33]:
tokenizer = tokenizer.load(conf['pth_vocab'])

In [35]:
len(tokenizer.vocab)

245967

In [19]:
final_samples = samples[["sentence", "s3_path", "subset"]].reset_index().copy()
final_samples.rename(columns={"sentence": "caption"}, inplace=True)
final_samples.rename(columns={"subset": "split"}, inplace=True)
final_samples.rename(columns={"index": "img_id"}, inplace=True)

In [21]:
final_samples.to_parquet(
    path=conf["pth_samples"],
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)

In [187]:
#valid_samples = pd.read_parquet(path=os.path.join(conf["pth_samples"],'split=val'), engine="pyarrow")

In [274]:
valid_samples.head()

Unnamed: 0,img_id,caption,s3_path
0,2021210,aceitunas nucete verde rell fco 330grgr encurt...,dataset/HLKTVWG_0011772_943266918.png
1,2021211,combo comal burrito combo comal burrito a tu g...,dataset/YJWRKVT_0014688_307104954.png
2,2021212,"салями пицца сыр моцарелла, колбаса салями , п...",dataset/SKJGFSV_0000017_1315254831.png
3,2021213,powerade fruit punch 20 oz hidratantes nan,dataset/BBYJKXD_0001594_775792642.png
4,2021214,douglas make up nail mask contour nail care 10...,dataset/NZTCKFL_0088223_1867228388.png


# [TEMPORARY] MEANWHILE WE DON'T HAVE ACCESS TO THE IMAGES ON S3

In [22]:
dev_samples = final_samples.sample(20000).copy()
# test-bucket-glovocds
dev_samples["s3_path"] = "artifacts/002/CUG/images/0000000_0000000_1422139257.png"

In [23]:
dev_samples.to_parquet(
    path=conf["pth_samples"]+"_dev",
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)

In [24]:
dev_samples

Unnamed: 0,img_id,caption,s3_path,split
2301714,2301714,acelga manojo un verduras nan,artifacts/002/CUG/images/0000000_0000000_14221...,val
2112158,2112158,dove baby kupka i šampon 400ml dečija nega nan,artifacts/002/CUG/images/0000000_0000000_14221...,val
238096,238096,"adal - молоко ультрапастеризованное 3,2% 1л 48...",artifacts/002/CUG/images/0000000_0000000_14221...,train
649841,649841,pulpa de rata cu piure si dulceata de ardei iu...,artifacts/002/CUG/images/0000000_0000000_14221...,train
2868405,2868405,tagliatelle al huevo la spiga 250 gr pasta fre...,artifacts/002/CUG/images/0000000_0000000_14221...,test
...,...,...,...,...
1339862,1339862,gallo bomba desayunos nuestra tradicional tort...,artifacts/002/CUG/images/0000000_0000000_14221...,train
2609476,2609476,toallas sanitarias plus rapisec nosotras 10 un...,artifacts/002/CUG/images/0000000_0000000_14221...,test
2643891,2643891,abóbora partida kg legumes nan,artifacts/002/CUG/images/0000000_0000000_14221...,test
2237929,2237929,chivas regal 12 years 750cc whisky nan,artifacts/002/CUG/images/0000000_0000000_14221...,val


In [27]:
dev_samples.loc[dev_samples["img_id"] == 649841, "caption"]

649841    pulpa de rata cu piure si dulceata de ardei iu...
Name: caption, dtype: object

# GloVE

In [36]:
!pip install glove_python

Collecting glove_python
  Downloading glove_python-0.1.0.tar.gz (263 kB)
[K     |████████████████████████████████| 263 kB 35.6 MB/s eta 0:00:01
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25ldone
[?25h  Created wheel for glove-python: filename=glove_python-0.1.0-cp36-cp36m-linux_x86_64.whl size=781088 sha256=b49ad1beca1ce5af45cc41f82001a84929ae3124dea2679f996b910044c2fadd
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c2/34/66/a3adc1e41bd5cfe3aa8f75e34b42ca207f8b6e8171b9a4fd61
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [37]:
from glove import Corpus, Glove

In [38]:
#Creating a corpus object
corpus = Corpus() 

In [47]:
sentences = list(final_samples["caption"].values)

In [None]:
lines = []
for sentence in tqdm.tqdm(sentences[:50000]):
    lines.append(tokenizer.split_sentence(sentence))

In [50]:
%%time
#Training the corpus to generate the co occurence matrix which is used in GloVe
corpus.fit(lines, window=10)

CPU times: user 1.29 s, sys: 12.1 ms, total: 1.3 s
Wall time: 1.3 s


In [51]:
glove = Glove(no_components=5, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('/home/ec2-user/SageMaker/data/glove/glove-foodiml.pkl')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [52]:
glove.

<glove.glove.Glove at 0x7f7b5c8c8cf8>