In [0]:
import numpy as np
import torch
from torch.utils.data import Dataset
import pandas as pd
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import json
import time
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import csv

In [0]:
np.random.seed(12345)

# run on the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class DataReader:
    NEGATIVE_TABLE_SIZE = 1e8

    def __init__(self, inputFileName, min_count):

        self.negatives = []
        self.discards = []
        self.negpos = 0

        self.word2id = dict()
        self.id2word = dict()
        self.sentences_count = 0
        self.token_count = 0
        self.word_frequency = dict()

        self.inputFileName = inputFileName
        self.read_words(min_count)
        self.initTableNegatives()
        self.initTableDiscards()
        
    def plot_frequency(self):
        pass
      
    def read_words(self, min_count):
        word_frequency = dict()
        for line in open(self.inputFileName, encoding="utf8"):
            line = line.split()
            if len(line) > 1:
                self.sentences_count += 1
                for word in line:
                    if len(word) > 0:
                        self.token_count += 1
                        word_frequency[word] = word_frequency.get(word, 0) + 1

                        if self.token_count % 1000000 == 0:
                            print("Read " + str(int(self.token_count / 1000000)) + "M words.")
        # show each word's frequency before the discard action
#         plot_frequency()
        wid = 0
        print()
        # w represents the word; c is the frequency of the word
        for w, c in word_frequency.items():
            if c < min_count:
                continue
            # if the counts of one word is less than min_count, then don't put this word in the vocabulary
            self.word2id[w] = wid
            self.id2word[wid] = w
            self.word_frequency[wid] = c
            wid += 1
        print("Total embeddings: " + str(len(self.word2id)))

    def initTableDiscards(self):
        t = 0.00001
        f = np.array(list(self.word_frequency.values())) / self.token_count
        # every ingredient's Probability to be discarded
        self.discards = np.sqrt(t / f) + (t / f)

    def initTableNegatives(self):
        pow_frequency = np.array(list(self.word_frequency.values())) ** 0.5
        words_pow = sum(pow_frequency)
        ratio = pow_frequency / words_pow
        count = np.round(ratio * DataReader.NEGATIVE_TABLE_SIZE)
        for wid, c in enumerate(count):
            self.negatives += [wid] * int(c)
        self.negatives = np.array(self.negatives)
        np.random.shuffle(self.negatives)

    def getNegatives(self, target, size):  # TODO check equality with target
        response = self.negatives[self.negpos:self.negpos + size]
        self.negpos = (self.negpos + size) % len(self.negatives)
        if len(response) != size:
            return np.concatenate((response, self.negatives[0:self.negpos]))
        return response

In [0]:
# -----------------------------------------------------------------------------------------------------------------

class Word2vecDataset(Dataset):
  # data is the object of class DataReader
    def __init__(self, data, window_size):
        self.data = data
        self.window_size = window_size
        self.input_file = open(data.inputFileName, encoding="utf8")

    def __len__(self):
        return self.data.sentences_count

    def __getitem__(self, idx):
        while True:
            line = self.input_file.readline()
            if not line:
                self.input_file.seek(0, 0)
                line = self.input_file.readline()

            if len(line) > 1:
                words = line.split()

                if len(words) > 1:
                    word_ids = [self.data.word2id[w] for w in words if
                                # according to the discard probabilty to decide keep this word or not so called: subsampling
                                w in self.data.word2id and np.random.rand() < self.data.discards[self.data.word2id[w]]]

                    boundary = np.random.randint(1, self.window_size)
                    # negative sampling
                    return [(u, v, self.data.getNegatives(v, 5)) for i, u in enumerate(word_ids) for j, v in
                            enumerate(word_ids[max(i - boundary, 0):i + boundary]) if u != v]

    @staticmethod
    def collate(batches):
        # u - center word
        all_u = [u for batch in batches for u, _, _ in batch if len(batch) > 0]
        # v - neighbor words
        all_v = [v for batch in batches for _, v, _ in batch if len(batch) > 0]
        all_neg_v = [neg_v for batch in batches for _, _, neg_v in batch if len(batch) > 0]

        return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)

In [0]:
# food2vector = drive.CreateFile({'id':'17OhgD4U6nOza3Tbt7l76HRaQCllESIBj'})
# food2vector.GetContentFile('food2vec.csv')

In [0]:

"""
    u_embedding: Embedding for center word.
    v_embedding: Embedding for neighbor words.
"""


class SkipGramModel(nn.Module):

    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)

        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)

        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)

        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def save_embedding(self, id2word, file_name):
        embedding = self.u_embeddings.weight.cpu().data.numpy()
        with open(file_name, 'w') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(['Ingredient', 'Vector'])
            for wid, w in id2word.items():
                vector = str(list(embedding[wid]))
                csv_writer.writerow([w,vector])
        return embedding, id2word

In [0]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# from data_reader import DataReader, Word2vecDataset
# from model import SkipGramModel


class Word2VecTrainer:
    def __init__(self, input_file, output_file, emb_dimension=100, batch_size=32, window_size=5, iterations=10,
                 initial_lr=0.001, min_count=5):

        self.data = DataReader(input_file, min_count)
        # dataset is the object of class Word2vecDataset
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=0, collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        # put model on the GPU
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension).to(device)


    def train(self):

        for iteration in range(self.iterations):

            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    # put training data on the GPU
                    pos_u = sample_batched[0].to(device)
                    pos_v = sample_batched[1].to(device)
                    neg_v = sample_batched[2].to(device)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                    loss.backward()
                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))
        return self.skip_gram_model
#     u_embeddings.weight, data.word2id, data.id2word
#         return self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)

In [0]:
from torch.utils.data import Dataset
# -----------------------------------------------------------------------------------------------------------------
import pandas as pd

class CuisinePredictDataset(Dataset):
  # data is the object of class DataReader
    def __init__(self, inputFileName, targetFileName, w2v):
        # one-hot representation
        self.target_array = self.get_target(targetFileName)
        self.input_file = open(inputFileName, encoding="utf8")
        self.food2id = w2v.data.word2id
        self.id2food = w2v.data.id2word
        self.input_data = []
        self.max_num_gredients = 0
        # the total number of cuisine's categories
#         self.cuisine_types = 0
        # get the input_data using the method below
        self.get_padded_ingredients_ids()

    def __len__(self):
        # return the number of training data
        return len(self.target_array)
    
    def get_target(self, target_file_path):
        cuisine_data = pd.read_csv(target_file_path)
        self.cuisine_types = int(cuisine_data.nunique())
        print(self.cuisine_types)
        cuisine_array = cuisine_data.to_numpy()
        # label encoder
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(cuisine_array.ravel())
        # binary encode
        onehot_encoder = OneHotEncoder(categories='auto',sparse=False)
        integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
        # integer encoded target
        self.target_integer_encoded = integer_encoded
        onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
        return onehot_encoded
#         return torch.from_numpy(onehot_encoded) # convert to tensor object

    def get_padded_ingredients_ids(self):
        while True:
            line = self.input_file.readline()
            # not the last line of the file
            if line:
                ingredients_list = line.split()
                # the number of ingredients would be changed, so 30 cannot be used
                ingredients_ids = [self.food2id[ingre] for ingre in ingredients_list if ingre in self.food2id]
                self.input_data.append(ingredients_ids)
                if self.max_num_gredients < len(ingredients_ids):
                    self.max_num_gredients = len(ingredients_ids)
            else:
                break
#         max_length = self.max_num_gredients
        print('The maximum number of the ingredients in each recipe is {}'.format(self.max_num_gredients))
        max_index = max(self.id2food.keys())
        self.input_data = [ingredient + [max_index]*(self.max_num_gredients - len(ingredient)) for ingredient in self.input_data]
                          
             
    def __getitem__(self, idx):
#         return self.input_data[idx], self.target_array.tolist()[idx]
        # get one training sample based on the index
        return [self.input_data[idx],self.target_array.tolist()[idx],self.target_integer_encoded.tolist()[idx]]
        
    @staticmethod
    def collate(batches):
        # ingredients
        all_ingredients = [batch[0] for batch in batches]
        # cusine labels one hot
        all_cusine_labels = [batch[1] for batch in batches]
        # cuisine labels integer encoded
        all_cusine_integer_encoded = [batch[2] for batch in batches]
        return torch.LongTensor(all_ingredients), torch.LongTensor(all_cusine_labels), torch.LongTensor(all_cusine_integer_encoded)

In [0]:
# after get the food2vector, do cuisine classification
class CuisineModel(nn.Module):
    def __init__(self, embedding_weight, num_class):
        super().__init__()
        self.vocab_size, self.embed_dim = embedding_weight.shape
        # padding the last row
        zero_plus = torch.zeros(1,self.embed_dim)
        self.embedding_weight = torch.cat((embedding_weight, zero_plus),dim=0)
        # embedding layer, don't update but when the weight updates the performance of the model is pretty bad!
        # freeze the embedding weight -> better result
        self.embedding = nn.EmbeddingBag.from_pretrained(self.embedding_weight, freeze=False, mode='mean')
        # fully connected layer
        self.fc = nn.Linear(self.embed_dim, num_class)
        self.init_weights()
        # the active function of the last layer: softmax
        self.soft_max = nn.Softmax(dim=1)

    def init_weights(self):
        initrange = 0.5
        # iunitialize the weights of fcl
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, ingredients_indexes):
        ingredients_embedding = self.embedding(ingredients_indexes)
        ingredients_fc = self.fc(ingredients_embedding) 
        return self.soft_max(ingredients_fc)

In [0]:
class CuisineModelCNN(nn.Module):
  
    def __init__(self, embedding_weight, num_class, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.vocab_size, self.embed_dim = embedding_weight.shape
        zero_plus = torch.zeros(1,self.embed_dim)
        self.embedding_weight = torch.cat((embedding_weight, zero_plus),dim=0)
        # size should be 20 by 100
        self.embedding = nn.Embedding.from_pretrained(self.embedding_weight,freeze=True) 
#         self.embedding = nn.Embedding(self.vocab_size + 1, self.embed_dim,sparse=True)
        # static cnn: the height of kernel doesn't change
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=(3,self.embed_dim), stride=1, padding=0)
#         self.conv2 = nn.Conv2d(in_channels=18, out_channels=1, kernel_size=1, stride=1, padding=0)
#         self.maxpool2d = nn.MaxPool2d(2, stride=2)
        self.maxpool1d = nn.MaxPool1d(18)
        # fully connected layer
        self.fc1 = nn.Linear(8, num_class)
#         self.fc2 = nn.Linear(16, num_class)
#         self.fc2 = nn.Linear(1024, num_class)
#         self.init_weights()
        # the active function of the last layer: softmax
        self.soft_max = nn.Softmax(dim=1)

#     def init_weights(self):
#         initrange = 0.5
#         # iunitialize the weights of fcl
#         self.fc1.weight.data.uniform_(-initrange, initrange)
#         self.fc1.bias.data.zero_()
# #         self.fc2.weight.data.uniform_(-initrange, initrange)
# #         self.fc2.bias.data.zero_()

    def forward(self, ingredients_indexes):
        ingredients_embedding = self.embedding(ingredients_indexes)
        # reshape
        ingredients_embedding = ingredients_embedding.view(-1, 1,20,100)
#         ingredients_embedding = torch.reshape(ingredients_embedding, (1, 1, 20, 100))
        x = self.conv1(ingredients_embedding) # 32 18 18 1
#         x = torch.reshape(x, (1, 1, 18, 18))
#         x = self.conv2(x)
#         x = x.view(-1, 1,18,18)
#         x = self.conv2(x) # 32 1 16 16
#         x = self.maxpool2d(x) # 32 1 8 8 
#         print(x.shape) torch.Size([32, 1, 8, 8])
#         print(x.view(-1,18).shape) [32,18]
        x = x.view(-1,8,18)
        x = self.maxpool1d(x)
        x = self.fc1(x.view(-1,8))
#         ingredients_fc = self.fc2(x)
#         print(ingredients_fc.shape) torch.Size([32, 20])
        return self.soft_max(x)

In [0]:
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import statistics
from google.colab import files

def train_cuisine_model(train_data, validation_data, model, criterion, optimizer, batch_size, save_path):
    since = time.time()
    loss_list = []
    corrects_list = []
   
    for i, (ingredients_indexes, cuisine_label, cuisine_label_in) in enumerate(train_data):
            optimizer.zero_grad()
            ingredients_indexes, cuisine_label, cuisine_label_in = ingredients_indexes.to(device), cuisine_label.to(device), cuisine_label_in.to(device)
            outputs = model(ingredients_indexes)
            batch_size, _ = outputs.shape
            # the target is supposed to be 1-d
            loss = criterion(outputs, cuisine_label_in.squeeze())
            loss.backward()
            optimizer.step()
            _, predicts = torch.max(outputs, 1)
            if i %  29== 0:
              running_loss = loss.item()
              running_corrects = torch.sum(predicts == cuisine_label_in.squeeze()).float()
              print('Iteration time: {} loss: {:.3f} accuracy: {: .2f}'.format(i+1, running_loss, running_corrects / batch_size))
              loss_list.append(running_loss)
              corrects_list.append(running_corrects)
  #               break # debug
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    torch.save(model.state_dict(), save_path)
    print('Model has been saved successfully in {}.'.format(save_path))
#     files.download(save_path) 
    # test
#     total_accuracy = []
#     for j, (ingredients_indexes, cuisine_label, cuisine_label_in) in enumerate(validation_data):
#       ingredients_indexes, cuisine_label_in = ingredients_indexes.to(device), cuisine_label_in.to(device)
#       with torch.no_grad():
#         outputs = model(ingredients_indexes)
#         _, predicts = torch.max(outputs, 1)
#         accuracy = torch.sum(predicts == cuisine_label_in.squeeze()).float() / batch_size
#         total_accuracy.append(accuracy)
#         if j % 29 == 0:
#           print('Iteration time: {} accuracy: {: .2f}'.format(j+1, accuracy))
# #           break # debug
#     # j represents the total number of batches
#     print('Validation accuracy is {:.3f}'.format(sum(total_accuracy)/(j+1)))
    return model, loss_list, corrects_list

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
cuisine_target = drive.CreateFile({'id':'1HJYeL_lSseHqh94Lo5EvOPayhSOFKMrQ'})
cuisine_target.GetContentFile('cuisine_target.csv')
# https://drive.google.com/open?id=1HJYeL_lSseHqh94Lo5EvOPayhSOFKMrQ
ingredients = drive.CreateFile({'id':'1ZiaYobyIm00GhxFdcgtuw7P1BSFQXBli'})
ingredients.GetContentFile('ingredients.txt')
# https://drive.google.com/open?id=

Use original data instead of processed data

In [0]:
original_cuisine_target = drive.CreateFile({'id':'1E4y5JuLQpeqlbRCxef3eOt0sZRjQdC-E'})
original_cuisine_target.GetContentFile('original_cuisine_target.csv')
# https://drive.google.com/open?id=1E4y5JuLQpeqlbRCxef3eOt0sZRjQdC-E
original_ingredients = drive.CreateFile({'id':'18fbpzRaAwvsieVch0ADiHqg7mtJggzaP'})
original_ingredients.GetContentFile('original_ingredients.txt')
# https://drive.google.com/open?id=18fbpzRaAwvsieVch0ADiHqg7mtJggzaP

In [0]:
targetFileName = 'cuisine_target.csv'
inputFileName = 'ingredients.txt'

In [0]:
raw_targetFileName = 'original_cuisine_target.csv'
raw_inputFileName = 'original_ingredients.txt'
# w2v = Word2VecTrainer(input_file=raw_inputFileName, output_file="food2vec.csv") # Total embeddings: 3337
w2v = Word2VecTrainer(input_file=inputFileName, output_file="food2vec.csv") # Total embeddings: 2332
skip_gram_model = w2v.train()
# the embedding size is 3257 by 100


Total embeddings: 3257


  0%|          | 0/1205 [00:00<?, ?it/s]




Iteration: 1


 45%|████▍     | 541/1205 [00:02<00:03, 211.98it/s]

 Loss: 4.113688501556179


 85%|████████▌ | 1025/1205 [00:05<00:00, 210.24it/s]

 Loss: 3.961316169195889


100%|██████████| 1205/1205 [00:06<00:00, 200.24it/s]
  2%|▏         | 22/1205 [00:00<00:05, 213.57it/s]




Iteration: 2


 44%|████▎     | 526/1205 [00:02<00:03, 213.29it/s]

 Loss: 3.626473694863305


 85%|████████▌ | 1030/1205 [00:04<00:00, 205.29it/s]

 Loss: 3.4560466094762736


100%|██████████| 1205/1205 [00:05<00:00, 211.41it/s]
  2%|▏         | 19/1205 [00:00<00:06, 183.11it/s]




Iteration: 3


 45%|████▌     | 543/1205 [00:02<00:03, 209.87it/s]

 Loss: 3.2132203954147878


 85%|████████▍ | 1023/1205 [00:04<00:00, 209.30it/s]

 Loss: 3.1855466239184134


100%|██████████| 1205/1205 [00:05<00:00, 209.07it/s]
  2%|▏         | 21/1205 [00:00<00:05, 208.28it/s]




Iteration: 4


 44%|████▍     | 530/1205 [00:02<00:03, 208.31it/s]

 Loss: 2.9940048089776434


 86%|████████▌ | 1038/1205 [00:04<00:00, 212.45it/s]

 Loss: 2.9978273762330074


100%|██████████| 1205/1205 [00:05<00:00, 210.35it/s]
  2%|▏         | 22/1205 [00:00<00:05, 212.27it/s]




Iteration: 5


 45%|████▍     | 541/1205 [00:02<00:03, 214.73it/s]

 Loss: 2.8729544364083566


 85%|████████▌ | 1029/1205 [00:04<00:00, 213.06it/s]

 Loss: 2.8946842429742965


100%|██████████| 1205/1205 [00:05<00:00, 211.38it/s]
  2%|▏         | 23/1205 [00:00<00:05, 221.26it/s]




Iteration: 6


 44%|████▎     | 527/1205 [00:02<00:03, 213.74it/s]

 Loss: 2.788097362627227


 86%|████████▌ | 1034/1205 [00:04<00:00, 213.16it/s]

 Loss: 2.8509355317018614


100%|██████████| 1205/1205 [00:05<00:00, 212.42it/s]
  2%|▏         | 23/1205 [00:00<00:05, 221.96it/s]




Iteration: 7


 44%|████▎     | 527/1205 [00:02<00:03, 215.55it/s]

 Loss: 2.762638279150128


 86%|████████▌ | 1034/1205 [00:04<00:00, 211.74it/s]

 Loss: 2.8001600524059493


100%|██████████| 1205/1205 [00:05<00:00, 212.54it/s]
  2%|▏         | 22/1205 [00:00<00:05, 214.49it/s]




Iteration: 8


 44%|████▍     | 529/1205 [00:02<00:03, 212.87it/s]

 Loss: 2.7448284428717074


 86%|████████▌ | 1038/1205 [00:04<00:00, 216.66it/s]

 Loss: 2.7919773767416447


100%|██████████| 1205/1205 [00:05<00:00, 215.02it/s]
  2%|▏         | 23/1205 [00:00<00:05, 225.88it/s]




Iteration: 9


 44%|████▎     | 527/1205 [00:02<00:03, 214.30it/s]

 Loss: 2.7200831933248666


 86%|████████▌ | 1032/1205 [00:04<00:00, 212.66it/s]

 Loss: 2.7741753168438525


100%|██████████| 1205/1205 [00:05<00:00, 213.08it/s]
  2%|▏         | 22/1205 [00:00<00:05, 207.64it/s]




Iteration: 10


 45%|████▌     | 544/1205 [00:02<00:03, 212.92it/s]

 Loss: 2.699772932827364


 85%|████████▌ | 1028/1205 [00:04<00:00, 210.72it/s]

 Loss: 2.740092924537403


100%|██████████| 1205/1205 [00:05<00:00, 210.84it/s]


In [0]:
# batch_size = 12 needs 70 mins to finised the training process
batch_size = 32
learning_rate = 0.01
criterion = nn.CrossEntropyLoss()
# num_class = dataset.cuisine_types
dataset = CuisinePredictDataset(inputFileName=inputFileName, targetFileName=targetFileName, w2v=w2v)

20
The maximum number of the ingredients in each recipe is 20


In [0]:
 # ratio 7:3 split dataset into 7 portions for training and the rest of 3 for testing
split_ratio = 0.8
train_len = int(len(dataset) * split_ratio)
train_data, validation_data = random_split(dataset, [train_len, len(dataset)-train_len])

In [0]:
 print('{} samples for training and {} samples for validation'.format(len(train_data),len(validation_data)))

30844 samples for training and 7711 samples for validation


In [0]:
# len(skip_gram_model.u_embeddings.weight.cpu())

Fully connected layer

In [0]:
# cuisine_model = CuisineModel(embedding_weight=skip_gram_model.u_embeddings.weight.cpu(), num_class=dataset.cuisine_types).to(device)

In [0]:
cuisine_model

CuisineModel(
  (embedding): EmbeddingBag(3258, 100, mode=mean)
  (fc): Linear(in_features=100, out_features=20, bias=True)
  (soft_max): Softmax(dim=1)
)

Convolutional

In [0]:
cuisine_model_cnn = CuisineModelCNN(embedding_weight=skip_gram_model.u_embeddings.weight.cpu(), batch_size = batch_size, num_class=dataset.cuisine_types).to(device)

In [0]:
cuisine_model_cnn

CuisineModelCNN(
  (embedding): Embedding(3258, 100)
  (conv1): Conv2d(1, 8, kernel_size=(3, 100), stride=(1, 1))
  (maxpool1d): MaxPool1d(kernel_size=18, stride=18, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=8, out_features=20, bias=True)
  (soft_max): Softmax(dim=1)
)

In [0]:
# dataset.input_data[1]
len(dataset.input_data) #38555 is the true number of training samples; while the length of input_data is 19278, which is apparently incorrect

38555

In [0]:
# SGD as the optimizer
optimizer = torch.optim.SGD(cuisine_model_cnn.parameters(), lr=learning_rate, momentum=0.9)

In [0]:
train_data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=dataset.collate)

In [0]:
validation_data = DataLoader(validation_data, batch_size=batch_size, collate_fn=dataset.collate)

In [0]:
save_path='cuisine_weight_cnn_nopretrain.pth'

In [0]:
# why list out of range? how does the __item__ function work?
cuisine_model_cnn, loss, corrects = train_cuisine_model(train_data, validation_data, cuisine_model_cnn, criterion, optimizer, batch_size=batch_size, save_path=save_path)

In [0]:
files.download(save_path) 

In [0]:
cuisine_model_cnn.load_state_dict(torch.load(save_path))

<All keys matched successfully>

In [0]:
cuisine_model_cnn.to(device)

CuisineModelCNN(
  (embedding): Embedding(2333, 100)
  (conv1): Conv2d(1, 8, kernel_size=(3, 100), stride=(1, 1))
  (conv2): Conv2d(8, 1, kernel_size=(1, 1), stride=(1, 1))
  (maxpool2d): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (maxpool1d): MaxPool1d(kernel_size=18, stride=18, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=8, out_features=20, bias=True)
  (soft_max): Softmax(dim=1)
)

In [0]:
model = cuisine_model_cnn

In [0]:
total_accuracy = []
for j, (ingredients_indexes, cuisine_label, cuisine_label_in) in enumerate(validation_data):
      ingredients_indexes, cuisine_label_in = ingredients_indexes.to(device), cuisine_label_in.to(device)
      with torch.no_grad():
        outputs = model(ingredients_indexes)
        _, predicts = torch.max(outputs, 1)
        accuracy = torch.sum(predicts == cuisine_label_in.squeeze()).float() / batch_size
        total_accuracy.append(accuracy)
        if j % 29 == 0:
          print('Iteration time: {} accuracy: {: .2f}'.format(j+1, accuracy))
    # j represents the total number of batches
print('Validation accuracy is {:.3f}'.format(sum(total_accuracy)/(j+1)))
# fully connected layer 23% 
# Validation accuracy is 1.3% when update the embedding
# cnn 38.1%
# cnn iteration times: 10 count<10  53.1%
# cnn iteration times: 20 batch_size=16 52.3%

After get the food2vector, compare the result with webiste https://altosaar.github.io/food2vec/, make some progress to get better food2vec . using cnn，each user can be Regareded as a filter, the recipes he interacted can be formed as a channel map, try to predict the rating 