# PRE-PROCESSING

In [None]:
!pip install transformers

In [2]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

In [3]:
le = LabelEncoder()

Loading Dataset

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/cosylabiiit/recipe-knowledge-mining/master/ar_gk_train.tsv',sep = '\t',names = ['token','tag'])
df.head(5)

Unnamed: 0,token,tag
0,,
1,4,QUANTITY
2,cloves,UNIT
3,garlic,NAME
4,,


Checking whether any token have more than 1 word and if so then merging those words as we have a single tag for all.

In [5]:
col = df.loc[df['token'].apply(lambda x: len(str(x).split())>=2)]
print(col)
print(col["tag"].unique())

       token       tag
309    1 1/2  QUANTITY
420    1 1/2  QUANTITY
448    1 1/2  QUANTITY
585    1 1/2  QUANTITY
796    2 1/2  QUANTITY
...      ...       ...
43599  1 1/2  QUANTITY
43624  2 1/2  QUANTITY
43644  1 1/4  QUANTITY
43773  1 1/2  QUANTITY
43821  1 1/4  QUANTITY

[370 rows x 2 columns]
['QUANTITY' 'O']


In [7]:
df['token'] = df['token'].apply(lambda x: ''.join(str(x).split()))


In [9]:
col = df.loc[df['token'].apply(lambda x: len(str(x).split())>=2)]
print(col)

Empty DataFrame
Columns: [token, tag]
Index: []


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43852 entries, 0 to 43851
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   token   43852 non-null  object
 1   tag     37240 non-null  object
dtypes: object(2)
memory usage: 685.3+ KB


Doing some checking on the word **whole**. We got the token **"whole"** with 3 tags: UNIT, O, NAME

In [6]:
df["tag"][df["token"] == "whole"]

361      UNIT
1323     UNIT
1484     UNIT
2699     UNIT
3599     UNIT
4217     UNIT
4523     UNIT
5268     UNIT
5375        O
5649     UNIT
6453     UNIT
6500     UNIT
7580     UNIT
7730     UNIT
8090        O
8593     UNIT
9404     UNIT
9486     UNIT
20995       O
21006       O
21368       O
22558       O
23114       O
23240       O
23572       O
23693       O
23797       O
24358       O
24951       O
25534       O
25613       O
25956       O
26254       O
26288       O
26316       O
26558    NAME
27122       O
29835       O
29971       O
30254       O
30754       O
31001       O
31693       O
31720       O
31738       O
34829       O
36814       O
37302       O
42080       O
42262       O
43197       O
Name: tag, dtype: object

First encoding our tags with **label encoding** and later will encode them with **One hot encoding** when fitting in the model.

In [10]:
df['tag'] = le.fit_transform(df['tag'])
Y_train_labels = dict(zip(le.classes_, le.transform(le.classes_)))
print(Y_train_labels)

{'DF': 0, 'NAME': 1, 'O': 2, 'QUANTITY': 3, 'SIZE': 4, 'STATE': 5, 'TEMP': 6, 'UNIT': 7, nan: 8}


Merging the tokens and tags of each **sentence** together

In [11]:
df = df.replace({np.nan:None})

In [12]:
ings = []
tags = []
ing = []
tag = []
j=0
for idx,row in df.iterrows():
  if idx != 0:
    if row['tag'] == 8:
      tag = ' '.join(str(i) for i in tag)
      ing = ' '.join(str(i) for i in ing)
      tags.append(tag)
      ings.append(ing)
      ing = []
      tag = []
    else:
      ing.append(row['token'])
      tag.append(row['tag'])
      j+=1
    if idx == df.shape[0]-1:
      tag = ' '.join(str(i) for i in tag)
      ing = ' '.join(str(i) for i in ing)
      tags.append(tag)
      ings.append(ing)

In [13]:
ings[0],tags[0]

('4 cloves garlic', '3 7 1')

In [14]:
max_len = max([len(ing.split()) for ing in ings])
max_len

29

## BERT

Loading the pretrained bert model and tokenizer for getting **embedded vectors**.

In [15]:
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

A pipeline for getting the embedding vectors.
get_word_embedding **->** bert_text_preparation **->** get_bert_embeddings

In [16]:
def bert_text_preparation(text, tokenizer):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = marked_text.split()
    segments_ids = [1]*len(tokenized_text)
    while True:
      if len(tokenized_text)<max_len+2:
        tokenized_text.append('[PAD]')
        segments_ids.append(0)
      else:
        break
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

In [17]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    print("here4")
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2][1:]
    print("here5")
    token_embeddings = hidden_states[-1]
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    print("here6")
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [18]:
""" A function that takes the list of sentences as input and returns the embedded token as output.
    First calls the bert_text_preparation fn by passing the text and tokenizer(we haven't used this tokenizer).
    This fn returns the tokenized text with the tokens_tensors(a list with unique number for each token and 0 
    for padding) and segment_tensors(A boolean list in which 1 indicates that the word at this index is not a [PAD] token).
    Then it passses these tensors to the get_bert_embeddings fn which uses the bert model to get the token_embedding for each
    word in the sentence. Finally this embedding is returned and appended to target_word_embeddings.
"""
def get_word_embedding(ings):
  target_word_embeddings = []
  tokens = []
  for text in ings:
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    print(f"tok_text = {tokenized_text}\ntoken_tensor = {tokens_tensor}\n segment_tensor = {segments_tensors}")
    
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

    target_word_embeddings.append(list_token_embeddings)

    tokens.append(tokenized_text)
  return target_word_embeddings

Embedding vectors already saved at google drive. We just need to load them and use to train our model.

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
import pickle
target_word_embeddings = pickle.load(open('/content/drive/MyDrive/word_emb.sav','rb'))
# pickle.dump(target_word_embeddings,open('/content/drive/MyDrive/word_emb.sav','wb'))

In [21]:
target_word_embeddings = np.array(target_word_embeddings)


In [22]:
target_word_embeddings.shape

(6612, 31, 768)

Length found is 31 below. This is because the max len of our sentence is 29 and 2 tokens **[CLS]** and **[SEP]** are added as bert
need these tokens for producing embedding vectors.

In [23]:
twe = target_word_embeddings
len(twe[0])

31

In [24]:
ings = [ing.split() for ing in ings]
tags = [tag.split() for tag in tags]
nf = pd.DataFrame({'ingredients':ings,'tags':tags})
nf.head()

Unnamed: 0,ingredients,tags
0,"[4, cloves, garlic]","[3, 7, 1]"
1,"[2, tablespoons, vegetable, oil, ,, divided]","[3, 7, 1, 1, 2, 5]"
2,"[2, tablespoons, dried, marjoram]","[3, 7, 0, 1]"
3,"[1, large, red, onion, ,, 1/4-inch, slices, pu...","[3, 4, 1, 1, 2, 2, 2, 2, 2, 2]"
4,"[2, jalapeno, peppers, ,, seeded, and, minced]","[3, 1, 1, 2, 5, 2, 5]"


In [None]:
nf.iloc[0][1][1]

'7'

Getting the X and Y lists which will be used as input to training model. For PAD sequences we used **2** as 2 is the label encoded form of **'O'**.

In [25]:
X = []
Y = []
for idx,sent in enumerate(twe):
  x = []
  y = []
  for i,word in enumerate(sent):
    if i != 0 and i != len(nf.iloc[idx]['ingredients'])+1:
      x.append(word)
      if i <= len(nf.iloc[idx]['tags']):
        y.append(int(nf.iloc[idx]['tags'][i-1]))
      else:
        y.append(2) # FOR PAD SEQUENCE
  X.append(x)
  Y.append(y)

In [None]:
len(X[1])

29

Train test split in 80:20 ratio. Further using tf.keras.utils.to_categorically to one hot encode the y_train and y_test.

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state=1234)

In [None]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)


In [None]:
y_train = tf.keras.utils.to_categorical(y_train, 8)
y_test = tf.keras.utils.to_categorical(y_test, 8)

In [None]:
y_train[0][1]

array([0., 0., 0., 0., 0., 0., 0., 1.], dtype=float32)

## BILSTM MODEL

TRAINING PART (YOU DON'T NEED TO RUN THESE LINES OF CODE AS THE MODEL IS ALREADY TRAINED AND SAVED).

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional,Dense,Activation,LSTM,TimeDistributed,Dropout


In [None]:
len(X_train[0][0])

768

In [None]:
y_train

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0.

In [None]:
model = Sequential()
model.add(Bidirectional(LSTM(512, return_sequences=True), input_shape=(29, 768)))
model.add(Dropout(0.4))
model.add(Bidirectional(LSTM(512, return_sequences=True)))
model.add(Dropout(0.4))
model.add(TimeDistributed(Dense(8, activation = 'softmax')))
# model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train, epochs=5, verbose=1, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
print(len(X_test))
score = model.evaluate(X_test,y_test)

1323


In [None]:
model.save('/content/drive/MyDrive/LSTM.h5')

Checking on custom input data..

In [None]:
# docs = ['mix together Shan chicken tikka masala, lemon juice and 2 tablespoons oil.','apply on chicken pieces and marinate for 3 hours +.', 'put each portion of chicken on skewer and place it on low heat of charcoal / gasgrill']
docs = ['4 kg oil mix with vegetables','2 litre water or petrol']
embdn = get_word_embedding(docs) #Getting embedding vectors for each word.



here
tok_text = ['[CLS]', '4', 'kg', 'oil', 'mix', 'with', 'vegetables', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
token_tensor = tensor([[  101,  1018,  4705,  3514,  4666,  2007, 11546,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]])
 segment_tensor = tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]])
here4
here5
here6
here2
here3
here
tok_text = ['[CLS]', '2', 'litre', 'water', 'or', 'petrol', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

In [None]:
embdn = np.array(embdn)
embdn.shape

(2, 31, 768)

Loading the model

In [None]:
from tensorflow import keras
tr_model = keras.models.load_model('/content/drive/MyDrive/LSTM.h5')

Preparing data by removing embedding for the [CLS] and [SEP] tags

In [None]:
test_data = []
for idx,sent in enumerate(embdn):
  t_data = []
  for i,word in enumerate(sent):
    if i != 0 and i != len(docs[idx].split())+1:
      t_data.append(word)
  test_data.append(t_data)
test_data = np.array(test_data)
test_data.shape

(2, 29, 768)

In [None]:
prediction = tr_model.predict(test_data)

In [None]:
prediction = np.array(prediction)
prediction.shape
prediction[0][0]

array([2.3338766e-06, 1.3139248e-05, 2.6589728e-04, 9.9966002e-01,
       1.9896781e-05, 1.0133885e-05, 1.0853347e-06, 2.7463186e-05],
      dtype=float32)

In [None]:
tags = []
for idx,sent in enumerate(prediction):
  tag = []
  for i,word in enumerate(sent):
    if i<len(docs[idx].split()):
      idx_max = np.argmax(np.array(word))
      tag.append(idx_max)
  tags.append(tag)
tags

[[3, 7, 1, 1, 2, 1], [3, 7, 1, 2, 1]]

In [None]:
Y_train_labels = dict(zip(le.transform(le.classes_),le.classes_))
Tags = []
for sent in tags:
  Tags.append([Y_train_labels[w] for w in sent])
a = zip(docs,Tags)
print(list(a))

[('4 kg oil mix with vegetables', ['QUANTITY', 'UNIT', 'NAME', 'NAME', 'O', 'NAME']), ('2 litre water or petrol', ['QUANTITY', 'UNIT', 'NAME', 'O', 'NAME'])]
