## DataLoader
Responsible for loading the data during the different phases (train, test). <br /> 
The data loader reads all files and, for each recipe, returns its instructions, ingredients and one corresponding image.

In [3]:
from __future__ import print_function
import torch.utils.data as data
from PIL import Image
import os
import sys
import pickle
import numpy as np
import lmdb
import torch
import torchvision.transforms as transforms


In [53]:
class Loader(data.Dataset):
    """Dataset yielding one (recipe, image) pair per index from an LMDB store.

    The recipe samples are read from ``<data_path>/<partition>_lmdb`` and the
    list of recipe keys from ``<data_path>/<partition>_keys.pkl``.

    For the ``'train'`` partition a pair is a match only when a uniform random
    draw exceeds ``self.mismtch`` (0.8); otherwise the image is taken from a
    different, randomly chosen recipe. For ``'val'`` and ``'test'`` the pair is
    always a match.

    NOTE: samples and keys are deserialized with ``pickle``, which can execute
    arbitrary code. Only point this class at dataset files you trust.
    """

    def default_loader(path):
        """Open ``path`` as an RGB image, or return a white 224x224 placeholder.

        This is deliberately a plain function (no ``self``): it is used as the
        default value of the ``loader`` argument of ``__init__`` and is stored
        and called as an ordinary callable.
        """
        try:
            return Image.open(path).convert('RGB')
        except Exception:
            # Best effort: an unreadable or missing image must not abort a
            # whole epoch, so a blank image is substituted instead.
            return Image.new('RGB', (224, 224), 'white')

    def __init__(self, img_path, transform=None,
                 loader=default_loader, square=False, data_path=None, partition=None):
        """Open the LMDB environment and load the recipe keys.

        Args:
            img_path: root directory of the image files.
            transform: optional callable applied to the loaded image.
            loader: callable mapping an image path to an image object.
            square: if truthy, passed as-is to ``img.resize`` before
                ``transform`` is applied.
            data_path: directory holding the LMDB folder and the keys pickle.
            partition: partition name used to build file names and image paths.

        Raises:
            Exception: if ``data_path`` or ``partition`` is ``None``.
        """
        if data_path is None:
            raise Exception('No data path specified.')

        if partition is None:
            raise Exception('Unknown partition type %s.' % partition)
        self.partition = partition

        self.env = lmdb.open(os.path.join(data_path, partition + '_lmdb'), max_readers=1,
                             readonly=True, lock=False, readahead=False, meminit=False)

        with open(os.path.join(data_path, partition + '_keys.pkl'), 'rb') as f:
            self.ids = pickle.load(f)

        self.square = square
        self.imgPath = img_path
        self.mismtch = 0.8  # threshold of the uniform draw; above it the pair is a match
        self.maxInst = 20   # number of instruction rows in the returned tensor

        self.transform = transform
        self.loader = loader

    def _read_sample(self, recipe_id):
        """Fetch and unpickle the sample stored under ``recipe_id`` in LMDB."""
        with self.env.begin(write=False) as txn:
            serialized_sample = txn.get(recipe_id.encode())
        return pickle.loads(serialized_sample, encoding='latin1')

    def _image_path(self, image_id):
        """Build the on-disk path of ``image_id``.

        Images are located under ``<imgPath>/<partition>/`` in nested folders
        named after the first four characters of the image id.
        """
        shard_dirs = os.path.join(*image_id[:4])
        return os.path.join(self.imgPath, self.partition, shard_dirs, image_id)

    def __getitem__(self, index):
        """Return ``(recipe_id, image, instructions, ingredients, category, target)``.

        ``target`` is ``1`` when the image belongs to the recipe and ``-1`` when
        it was drawn from another recipe. ``instructions`` is a float tensor
        with ``self.maxInst`` rows (zero padded) and ``ingredients`` is a long
        tensor built from ``sample['ingrs']``.

        Raises:
            ValueError: if ``self.partition`` is not 'train', 'val' or 'test',
                or if a mismatched image is requested from a dataset holding
                fewer than two recipes.
        """
        recipId = self.ids[index]

        # During training a pair is a match only when the draw exceeds
        # self.mismtch, i.e. roughly 80 percent of the pairs are mismatches.
        if self.partition == 'train':
            match = np.random.uniform() > self.mismtch
        elif self.partition in ('val', 'test'):
            match = True
        else:
            raise ValueError('Partition name not well defined')

        target = 1 if match else -1

        sample = self._read_sample(recipId)

        if match:
            imgs = sample['imgs']
            if self.partition == 'train':
                # We choose from all the images in a recipe
                imgIdx = np.random.choice(len(imgs))
            else:
                imgIdx = 0
        else:
            # We randomly pick one non-matching recipe and use one of its images.
            if len(self.ids) < 2:
                # Without this guard the rejection loop below would never end.
                raise ValueError('Need at least two recipes to draw a mismatched image.')
            all_idx = range(len(self.ids))
            rndindex = np.random.choice(all_idx)
            while rndindex == index:
                rndindex = np.random.choice(all_idx)

            imgs = self._read_sample(self.ids[rndindex])['imgs']
            if self.partition == 'train':
                # Only the first five images of the other recipe are candidates.
                imgIdx = np.random.choice(min(5, len(imgs)))
            else:
                imgIdx = 0

        # Matching and non-matching images live in the same folder layout, so
        # both branches resolve the path the same way.
        img = self.loader(self._image_path(imgs[imgIdx]['id']))

        if self.square:
            img = img.resize(self.square)
        if self.transform is not None:
            img = self.transform(img)

        # Instructions: zero pad (or truncate) to exactly self.maxInst rows.
        instr_rows = np.asarray(sample['intrs'])
        n_rows = min(len(instr_rows), self.maxInst)
        padded = np.zeros((self.maxInst, instr_rows.shape[1]), dtype=np.float32)
        padded[:n_rows] = instr_rows[:n_rows]
        instrs = torch.FloatTensor(padded)

        # Ingredients
        ingrs = torch.LongTensor(sample['ingrs'].astype(int))

        category = sample['classes']

        return recipId, img, instrs, ingrs, category, target

    def __len__(self):
        """Return the number of recipe keys in this partition."""
        return len(self.ids)



## Generating train and test data
We generate the training data so that 80 percent of the image-recipe pairs are mismatched and only 20 percent of them are matched.  <br />
For the test data, we create only matched recipe-image pairs.  <br />
The batch size for train and test is 1.

In [98]:
# One sample per batch: the export loop below only reads element [0] of each batch.
batch_size = 1

params = {'batch_size': batch_size, 'shuffle': True, 'pin_memory': True}

# Per-channel mean and standard deviation used to normalize the image tensor.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

transform = transforms.Compose([
    transforms.Resize(256),  # rescale the image keeping the original aspect ratio
    transforms.CenterCrop(256),  # we get only the center of that rescaled
    transforms.RandomCrop(224),  # random crop within the center crop
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

training_set = Loader(data_path='../data', partition='train', img_path='../data', transform=transform)
training_generator = data.DataLoader(training_set, **params)

test_set = Loader(data_path='../data', partition='test', img_path='../data', transform=transform)
test_generator = data.DataLoader(test_set, **params)

# Write one pickled record per training sample, back to back, into a single
# file. The file is opened once (truncating any previous content) and closed
# by the ``with`` block even if iteration fails.
with open('../data/recipe2ingredients.pkl', 'wb') as f:
    for i, (recipeId, img, instrs, ingrs, category, target) in enumerate(training_generator):
        pickle.dump({'recipeId': recipeId[0], 'ingrs': ingrs[0]}, f)

        # Stop after the record with index 100, i.e. 101 records in total.
        if i == 100:
            break
    
   


In [99]:
# Read back every record that was pickled, one after another, into the file.
recipe2ingredients = []
with open('../data/recipe2ingredients.pkl', 'rb') as f:
    while True:
        try:
            recipe2ingredients.append(pickle.load(f))
        except EOFError:
            # pickle.load raises EOFError once all records have been consumed;
            # any other error (e.g. a corrupt record) is allowed to surface.
            print("finished reading file")
            break
    
    #ingrs = pickle.load(f)

finished reading file


## Sample of ingredient vector for one Recipe
The sample shown is the entry at list index 99.

In [101]:
# Show the record stored at list index 99 of the records read back from the file.
print(recipe2ingredients[99])


{'recipeId': '3dd0b17c87', 'ingrs': tensor([ 5824,  4514,  6595, 15206,   839,  1704,  3913,   466,  5552,  5393,
        27698,  1491, 22391,  2456, 27804,  3646, 17791,     1,     0,     0])}
