# FINAL PREPROCESS
1. Read CSV
2. Fit Tokenizer (if it is already fitted read it from disk)
3. Create a parquet file called samples
4. Save the parquet

In [63]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
from tqdm import tqdm
import shutil
from sklearn.model_selection import train_test_split

#os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [64]:
import torch
torch.__version__

'1.1.0'

In [65]:
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")
from retrieval.data.tokenizer import Tokenizer

# Exploration of execution

Execution 

```{bash}
cd /home/ec2-user/SageMaker/foodi-ml
source activate python3
export DATA_PATH=/home/ec2-user/SageMaker/dataset/
python run.py options/adapt/foodi-ml/i2t.yaml
python test.py options/adapt/foodi-ml/i2t.yaml

#watch -n 1 "nvidia-smi"
```

# Configurations

In [95]:
PATH_DATA = '/home/ec2-user/SageMaker/dataset/'
PATH_FOODI = '/home/ec2-user/SageMaker/foodi-ml'
DATASET_CSV = 'glovo-foodi-ml-dataset.csv'

conf = {
    "S3_BUCKET": 'glovo-products-dataset-d1c9720d',
    "S3_KEY_DATASET": DATASET_CSV,
    "LOCAL_RAW_DATASET": os.path.join(PATH_DATA, DATASET_CSV),
    "LOCAL_DATASET": os.path.join(PATH_DATA, 'samples'),
    "LOCAL_DATASET_SMALL": os.path.join(PATH_DATA, 'samples_small'),
    "LOCAL_IMAGES": os.path.join(PATH_DATA, 'dataset'),
    "LOCAL_VOCAB": os.path.join(PATH_FOODI, '.vocab_cache/foodiml_vocab.json'),
    "pth_dwn_samples": '/home/ec2-user/SageMaker/dataset/',
    "pth_dwn_images": '/home/ec2-user/SageMaker/dataset/dataset/',
    "pth_vocab": '/home/ec2-user/SageMaker/foodi-ml/.vocab_cache/foodiml_vocab.json',
    "pth_dataset_json": '/home/ec2-user/SageMaker/dataset/dataset_foodiml.json',
}

# Read all samples

In [97]:
samples = pd.read_csv(os.path.join(conf['LOCAL_RAW_DATASET']))

In [98]:
samples.shape

(2887444, 15)

In [99]:
samples.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,country_code,city_code,store_name,product_name,collection_section,product_description,subset,hash,aux_store,HIER,s3_path
0,0,0,0,0,CL,STG,AS_000,Savital Shampoo Keratina Y Sabila 550Ml (00001...,Shampoo,,train,3509449892161349181,True,False,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...
1,1,1,1,1,KE,NRK,Cold Stone Creamery Cart,Chocolate Layer Cake™,Promotions - Free waffles on every ice cream p...,Sweet Cream Ice Cream with Crumbles of Chocola...,train,1077526765743747663,False,True,/home/ec2-user/SageMaker/dataset/dataset/YKKVT...
2,2,2,2,2,UA,DNP,Khinkali & Khachapuri / Хинкали & Хачапури,Асорті на компанію,ЗАКУСКИ,"балик домашній,грудинка копчена,паштет з печін...",train,-4606644841517710049,False,True,/home/ec2-user/SageMaker/dataset/dataset/ZFSHH...
3,3,3,3,3,CL,STG,AS_001,Suav Vivere Hierbas F Jazmin Pouch 900Cc,Suavizantes,,train,-3420249778599023770,True,False,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...
4,4,4,4,4,CL,STG,AS_000,Leoncio Y El Doctor Veterinario,Infantil,,train,2295647601757404193,True,False,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...


In [100]:
# Check that images in the dataframe are refering to the correct path
samples["s3_path"].iloc[0]

'/home/ec2-user/SageMaker/dataset/dataset/NZTCKFL_0017467_1193055503.png'

There are some images (~30) that are missing in the S3 bucket but they appear in the CSV file. We remove them.

In [101]:
missing_images = []
for p in samples["s3_path"].unique():
    if not os.path.exists(p):
        print(p)
        missing_images.append(p)

/home/ec2-user/SageMaker/dataset/dataset/LNQBTMC_0092690_1654903940.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0040488_1098020161.png
/home/ec2-user/SageMaker/dataset/dataset/PPKWSJR_0004646_60587075.png
/home/ec2-user/SageMaker/dataset/dataset/BSCRBMV_0011116_1220714661.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0214351_1380823197.png
/home/ec2-user/SageMaker/dataset/dataset/BFDKZRG_0032872_617098369.png
/home/ec2-user/SageMaker/dataset/dataset/BFDKZRG_0036209_809223163.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0211327_1380809266.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0213966_1380821411.png
/home/ec2-user/SageMaker/dataset/dataset/PPKWSJR_0002867_60584254.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0206886_1380791874.png
/home/ec2-user/SageMaker/dataset/dataset/QPCYVVZ_0009860_1097897031.png
/home/ec2-user/SageMaker/dataset/dataset/FZKQYZG_0004222_1586626572.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0211294_1380809134.pn

In [102]:
samples = samples[~samples["s3_path"].isin(missing_images)]
samples.shape

(2887399, 15)

In [103]:
samples.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'country_code', 'city_code', 'store_name', 'product_name',
       'collection_section', 'product_description', 'subset', 'hash',
       'aux_store', 'HIER', 's3_path'],
      dtype='object')

In [104]:
samples = samples.drop_duplicates(subset=["product_name", "collection_section", "product_description", "hash"])

## 1) Create sentences

In [105]:
samples["product_description"].fillna("", inplace=True)
samples["sentence"] = \
    np.where(samples["product_name"], samples["product_name"].astype(str), "") + " " + \
    np.where(samples["collection_section"], samples["collection_section"].astype(str), "") + " " + \
    np.where(samples["product_description"], samples["product_description"].astype(str), "")

samples["sentence"] = samples["sentence"].str.lower()
samples.rename(columns={'Unnamed: 0': 'idx'}, inplace=True)

## 1.1) Fit Tokenizer (only if it is not fitted)

In [106]:
# 1) Get all sentences 
sentences = samples["sentence"].values

In [78]:
if not os.path.isfile(conf["LOCAL_VOCAB"]):
#if True:
    # 2) Fit Tokenizer with senteces (CAREFUL, takes 6-7 min)
    tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)
    # Fit tokenize
    vocab = tokenizer.fit(sentences)
    # 3) Saving vocabulary
    tokenizer.save(conf['LOCAL_VOCAB'])

In [79]:
# 4) Load if already saved
tokenizer = Tokenizer(vocab_path=conf["LOCAL_VOCAB"], download_tokenizer=True)
len(tokenizer.vocab)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


245967

## 2) Creating dataset parquet
In this section we create a parquet file that contains the DataFrame with all the captions and images of our dataset. This parquet file **will be consumed by our DataLoader class.**

In [80]:
# Keep only necessary columns for modelling
final_samples = samples[["sentence", "subset","country_code", "hash"]].reset_index().copy()
final_samples.rename(columns={"sentence": "caption"}, inplace=True)
final_samples.rename(columns={"subset": "split"}, inplace=True)
final_samples.rename(columns={"index": "img_id"}, inplace=True)

In [81]:
final_samples.split.unique()

array(['train', 'val', 'test'], dtype=object)

In [82]:
num_samples_train = (final_samples["split"] == "train").sum()
num_samples_train

1571554

### Action required: set subsample to True or False 

In [94]:
subsample = True # Set me
subsample_size = 10000
if subsample:
    final_samples_ES = final_samples[final_samples["country_code"]=="ES"]
    final_samples_ES = final_samples_ES.sample(subsample_size)
    train, val_test = train_test_split(final_samples_ES, test_size=int(subsample_size*0.3))
    val, test = train_test_split(val_test, test_size=int(val_test.shape[0]*0.5))
    print(train.shape, val.shape, test.shape)
    final_samples = pd.concat([train, val, test])

(7000, 5) (1500, 5) (1500, 5)


In [84]:
# Remove previous version
if os.path.exists(conf["LOCAL_DATASET_SMALL"]):
    shutil.rmtree(conf["LOCAL_DATASET_SMALL"])

In [96]:
# Save as partitioned parquet
final_samples.to_parquet(
    path=conf["LOCAL_DATASET_SMALL"],
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)

In [43]:
import pandas as pd
import shutil
final_samples = pd.read_parquet(conf["LOCAL_DATASET"])
final_samples.columns

Index(['img_id', 'caption', 'country_code', 'hash', 'split'], dtype='object')

In [21]:
destination_dir_root = "/home/ec2-user/SageMaker/dataset/spanish_subset/"

In [22]:
for p in final_samples["s3_path"].to_list():
    file_name = p.split("/")[-1]
    shutil.copy(p, os.path.join(destination_dir_root, file_name))

In [23]:
final_samples.to_csv("/home/ec2-user/SageMaker/dataset/spanish_subset.csv")

In [45]:
final_samples.shape

(10000, 6)

In [89]:

data = final_samples.copy()
data = data[["caption", "hash", "img_id","split"]]
data["cap_hash"] = data["caption"].apply(lambda x : hashlib.md5(str.encode(x)).hexdigest())

In [90]:
data["cap_hash"]

0          f18d61c2064a545d0abbbfb591cc5eaf
1          db7008e065490defad1f2288d4527348
2          f48378fd9223d9189ba5d6d4b2b614ac
3          64c0e1a808ba99af329ee4180e1ec1c2
4          4c0fb395efc59a9785a4b9c80e46cb8a
                         ...               
2120959    63fe259ea45b338029eeb694c8682baa
2120960    3a3fac15e1109d37050b18f6991cefd2
2120961    6501fd9506c5af42c4db9b9ed8652378
2120962    e60105c516a2bada0d4e3e71a79bdfb7
2120963    1865e08555a0bfe40231c8e6012c1d68
Name: cap_hash, Length: 2120964, dtype: object

In [93]:
valid_answers = {}
#data = pd.read_parquet("/home/ec2-user/SageMaker/dataset/samples")
data = data[data["split"]=="val"]
n = data.shape[0]
for i, row in tqdm(data.iterrows()):
    idxs_where_duplication = (data["cap_hash"] == row["cap_hash"]) | (data["hash"] == row["hash"])
    list_indexes_duplication = list(np.where(np.array(idxs_where_duplication.to_list()) == True)[0])
    valid_answers[row["img_id"]] = list_indexes_duplication

1162it [00:28, 40.78it/s]

KeyboardInterrupt: 

1162it [00:40, 29.03it/s]

In [36]:
valid_answers

{2454329: [0, 307866, 556709, 905909, 1099345, 1364996],
 2454330: [1, 618398, 943853, 1764810],
 2454332: [2],
 2454333: [3],
 2454334: [4],
 2454337: [5, 712734],
 2454340: [6,
  433913,
  488693,
  651519,
  990019,
  1112458,
  1115857,
  1334721,
  1359118,
  1469795,
  1512277,
  1650348,
  2063700],
 2454341: [7,
  106339,
  435985,
  599349,
  626551,
  1009452,
  1077714,
  1604874,
  1772681],
 2454343: [8],
 2454344: [9, 904899, 1838136],
 2454346: [10],
 2454347: [11],
 2454351: [12, 317095, 1625739, 1698063, 1769007],
 2454352: [13, 356499],
 2454354: [14, 479257, 730910, 916990, 1658203],
 2454355: [15],
 2454356: [16, 200310],
 2454357: [17],
 2454359: [18],
 2454361: [19],
 2454364: [20, 1833522, 1964237],
 2454365: [21],
 2454366: [22,
  71834,
  324439,
  328418,
  548643,
  1198214,
  1311305,
  1345777,
  1546494],
 2454367: [23],
 2454368: [24, 617877, 1434232, 1967411],
 2454369: [25],
 2454370: [26, 464338, 474167, 1050306, 1415492, 1507753, 1626902, 1854197],
 2