#### MST AIDS 2023-2024 (Département Génie Informatique)
#### Subject : The main purpose behind this lab is to get familiar with NLP language models using pre-trained GPT-2 model.
#### Realize by : Chibani Fahd
#### Topic : Generate recipes
#### Models : RNN, Bidirectional RNN, GRU and LSTM

# Part 2 Transformer (Text generation):

In [1]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [2]:
df=pd.read_csv("/kaggle/input/food-dataset/Food_Dataset.csv")
df

Unnamed: 0,TranslatedIngredients
0,"6 Karela (Bitter Gourd/ Pavakkai) - deseeded,S..."
1,"2-1 / 2 cups rice - cooked, 3 tomatoes, 3 teas..."
2,"1-1/2 cups Rice Vermicelli Noodles (Thin),1 On..."
3,"500 grams Chicken,2 Onion - chopped,1 Tomato -..."
4,"1 tablespoon chana dal, 1 tablespoon white ura..."
...,...
6292,2 cups Paneer (Homemade Cottage Cheese) - crum...
6293,1-1/2 cup Risotto - cooked risotto (recipe bel...
6294,"1 cup Quinoa,3/4 cup Sugar,1 teaspoon Cardamom..."
6295,150 grams Spring Onion (Bulb & Greens) - chopp...


In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [5]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv

class FoodDataset(Dataset):
    def __init__(self):
        super().__init__()

        self.food_list = []
        self.end_of_text_token = "<|endoftext|>"

        with open("/kaggle/input/food-dataset/Food_Dataset.csv") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            x = 0
            for row in csv_reader:
                joke_str = f"recepies:{row[0]}{self.end_of_text_token}"
                self.food_list.append(joke_str)

    def __len__(self):
       return len(self.food_list)

    def __getitem__(self, item):
        return self.food_list[item]

In [6]:
dataset = FoodDataset()
food_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [7]:
BATCH_SIZE = 2
EPOCHS = 10
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 400
from transformers import AdamW, get_linear_schedule_with_warmup
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [9]:
device

'cuda'

In [11]:
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

tmp_food_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):
    
    print(f"EPOCH {epoch} started" + '=' * 30)
    
    for idx,food in enumerate(food_loader):
        
        #################### "Fit as many recipes sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        food_tens = torch.tensor(tokenizer.encode(food[0])).unsqueeze(0).to(device)
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if food_tens.size()[1] > MAX_SEQ_LEN:
            continue
        
        #The first recipes sequence in the sequence
        if not torch.is_tensor(tmp_food_tens):
            tmp_food_tens = food_tens
            continue
        else:
            #The next recipes does not fit in so we process the sequence and leave the last recipes 
            #as the start for next sequence 
            if tmp_food_tens.size()[1] + food_tens.size()[1] > MAX_SEQ_LEN:
                work_food_tens = tmp_food_tens
                tmp_food_tens = food_tens
            else:
                #Add the recipes to sequence, continue and try to add more
                tmp_food_tens = torch.cat([tmp_food_tens, food_tens[:,1:]], dim=1)
                continue
        ################## Sequence ready, process it trough the model ##################
            
        outputs = model(work_food_tens, labels=work_food_tens)
        loss, logits = outputs[:2]                        
        loss.backward()
        sum_loss = sum_loss + loss.detach().data
                       
        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0    
            batch_count += 1
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0
    
    # Store the model after each epoch to compare the performance of them
torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_food_{epoch}.pt"))

sum loss 602.9174194335938
sum loss 531.919677734375
sum loss 475.10040283203125
sum loss 426.5258483886719
sum loss 392.7735900878906
sum loss 363.0346374511719
sum loss 333.07049560546875
sum loss 325.109619140625
sum loss 306.9200439453125
sum loss 293.888671875
sum loss 277.667724609375
sum loss 271.6908264160156
sum loss 266.42974853515625
sum loss 263.4397277832031
sum loss 257.0422058105469
sum loss 254.81268310546875
sum loss 255.25730895996094
sum loss 248.59228515625
sum loss 243.5207977294922
sum loss 242.1642303466797
sum loss 235.9222869873047
sum loss 233.20223999023438
sum loss 232.08453369140625
sum loss 230.86383056640625
sum loss 229.44541931152344
sum loss 224.79991149902344
sum loss 223.1371612548828
sum loss 223.06190490722656
sum loss 221.6030731201172
sum loss 222.78575134277344
sum loss 217.31240844726562
sum loss 213.71771240234375
sum loss 212.77481079101562
sum loss 209.7014617919922
sum loss 214.14166259765625
sum loss 217.23526000976562
sum loss 211.1446533

In [17]:
MODEL_EPOCH = 9

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_joker_{MODEL_EPOCH}.pt")
model.load_state_dict(torch.load(model_path))

food_output_file_path = f'generated_{MODEL_EPOCH}.food'

model.eval()
if os.path.exists(food_output_file_path):
    os.remove(food_output_file_path)
    
food_num = 0
with torch.no_grad():
   
        for food_idx in range(4):
        
            food_finished = False

            cur_ids = torch.tensor(tokenizer.encode("Recipes :")).unsqueeze(0).to(device)

            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n) #Randomly(from the topN probability distribution) select the next word
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word to the running sequence

                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    food_finished = True
                    break

            
            if food_finished:
                
                food_num = food_num + 1
                
                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = tokenizer.decode(output_list)

                with open(food_tens_output_file_path, 'a') as f:
                    f.write(f"{output_text} \n\n")