In [1]:
# working ner

import pandas as pd
import numpy as np
import json
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from sklearn.metrics import accuracy_score
import torch
from torcheval.metrics.functional import multilabel_accuracy
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import time
import re

import ast
from tqdm import tqdm

if torch.cuda.is_available():
    device = torch.device("cuda") 
else:
    device = torch.device("cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
pretrained = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(pretrained)

MAX_LEN = 300
LEARNING_RATE = 3e-05

In [3]:
data = pd.read_csv("test_food.csv")
data['tag'] = data['tag'].apply(ast.literal_eval)

In [4]:
def tag_tokens(original, ids, og_tag):
    tokens = (tokenizer.convert_ids_to_tokens([int(i) for i in ids]))
    tagt = [0]
    tsplit = original.split()
    sent_ind = 0
    token_ind = 0
    puncs = ';:.,()-"\''
    # tag tokens
    while sent_ind < (len(tsplit)):
        while token_ind < len(tokens):
            if tokens[token_ind]=='<s>':
                token_ind += 1
                continue
            temp_t = tokens[token_ind]
            temp_t = temp_t.replace('Ġ', '')
            if temp_t in puncs:
                tagt.append(0)
                token_ind += 1
            elif temp_t in tsplit[sent_ind]:
                tagt.append(og_tag[sent_ind])
                token_ind += 1
            else:
                break
        sent_ind += 1
    tagt += [0]
    return tagt

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['sentences']
        self.targets = dataframe['tag']
        self.max_len = max_len

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        temp_target = tag_tokens(text, ids, self.targets[index])
        target = temp_target+[0]*(MAX_LEN-len(temp_target))


        return torch.tensor(ids, dtype=torch.long).to(device), torch.tensor(mask, dtype=torch.long).to(device), torch.tensor(token_type_ids, dtype=torch.long).to(device), torch.LongTensor(target).to(device)
        

In [5]:
id_pipeline = lambda x: x
mask_pipeline = lambda x: x
token_pipeline = lambda x: x
target_pipeline = lambda x: x
def colbatch(batch):
    id_list, mask_list, token_type_id_list, target_list = [], [], [], []
    for (_id, _mask, _tokentype, _target) in batch:
        id_list.append(id_pipeline(_id))
        mask_list.append(mask_pipeline(_mask))
        token_type_id_list.append(token_pipeline(_tokentype))
        target_list.append(target_pipeline(_target))
    id_list = torch.stack(id_list)
    mask_list = torch.stack(mask_list)
    token_type_id_list = torch.stack(token_type_id_list)
    target_list = torch.stack(target_list)
    return id_list.to(device), mask_list.to(device), token_type_id_list.to(device), target_list.to(device)

In [6]:
test_customset = CustomDataset(data, tokenizer, MAX_LEN)
test_loader = DataLoader(test_customset, batch_size=1, shuffle=False, collate_fn=colbatch)

In [30]:
id2label = {0:'None', 1:'B-food', 2:'I-food'}
label2id = {v:k for i,(k, v) in enumerate(id2label.items())}
model_c = transformers.AutoModelForTokenClassification.from_pretrained(model_name, 
                                                                        num_labels=3,
                                                                        id2label=id2label,
                                                                        label2id=label2id)

torch.Size([1, 300])

In [31]:
id2label = {0:'O', 1:'B-food', 2:'I-food'}
label2id = {v:k for i,(k, v) in enumerate(id2label.items())}

In [32]:
model_f = AutoModelForTokenClassification.from_pretrained(pretrained,
                                                          num_labels=3,
                                                          id2label=id2label,
                                                          label2id=label2id)
model_f.to(device)
# print(model_f)
def count_parameters(modelr):
    return sum(p.numel() for p in modelr.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model_f):,} trainable parameters')

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 81,530,115 trainable parameters


In [5]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min
optimizer = torch.optim.Adam(params =  model_f.parameters(), lr=LEARNING_RATE)

In [33]:
model1,_,_,loss1 = load_ckp('best_model_f.pt', model_f, optimizer)
model2,_,_,loss2 = load_ckp('current_checkpoint_f.pt', model_f, optimizer)

In [12]:
print(loss1, loss2)

0.002158557913219991 0.002385193007745533


In [13]:
model1.to(device)
def predict(model, loader):
    model.eval()
    with torch.no_grad():
        for batch_idx, (ids, mask, tokentype, targets) in enumerate(loader):
            outputs = model(input_ids=ids, attention_mask=mask, token_type_ids=tokentype, labels=targets)
            
            loss, tr_logits = outputs.loss, outputs.logits
            # valid_loss += loss.item()
            # print((tr_logits))
            # compute training accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            print(flattened_predictions)

In [14]:
# predict(model1, test_loader)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0,
        0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

KeyboardInterrupt: 

In [42]:
from transformers import pipeline
pipe = pipeline(task="ner", model=model1.to("cpu"), binary_output=True, tokenizer=tokenizer, aggregation_strategy='first')

In [43]:
# pipe("In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoons, drop and shape into 30 clusters on wax paper. Let stand until firm, about 30 minutes.")
pipe("180ml fresh orange juice; 125ml fresh lime juice; Medium handful coarsely chopped fresh oregano leaves; 6 cloves garlic, coarsely chopped; 1 tsp ground cumin; 60ml canola oil; 4 (230g) pork chops, butterflied and thinly lb.ed; Salt and freshly ground black pepper; 8 (0.5-cm) thick slices Swiss cheese; 8 (0.5-cm) thick slices boiled ham; 2 sour dill pickles, thinly sliced (need about 16 slices); 2 tbsps chopped coriander leaves; 60ml olive oil")
# pipe("1) Heat the barbecue to high. 2) Whisk together 120 ml orange juice, 60 ml lime juice, 3 tbsps oregano, the garlic, cumin, and canola oil in a large baking dish. Add the pork and turn to coat. Cover and let marinate for at least 15 minutes and up to 2 hours in the refrigerator. 3) Remove the pork from the marinade and pat dry. Place the chops on a flat surface, cut-side up and season with salt and pepper. Place 1 slice of cheese, 2 slices of ham, a few slices of pickle and another slice of cheese on 1 half of the chop. Fold over brush the top with oil and season with salt and pepper. Repeat with remaining ingredients. 4) Place the chops on the barbecue, oil side down and grill until golden brown, 3.5 minutes. Flip the chops over and continue cooking until the bottom is golden brown and the cheese has melted, 2.5 minutes longer. 5) Whisk the remaining orange and lime juices, oregano, and the coriander with the olive oil and salt and pepper, to taste. Spoon over the chops and serve.")
# pipe("Add 1/8 teaspoon salt and shortening.")

[{'entity_group': 'food',
  'score': 0.999652,
  'word': ' orange juice;',
  'start': 12,
  'end': 25},
 {'entity_group': 'food',
  'score': 0.9996369,
  'word': ' lime juice;',
  'start': 38,
  'end': 49},
 {'entity_group': 'food',
  'score': 0.9993527,
  'word': ' oregano leaves;',
  'start': 88,
  'end': 103},
 {'entity_group': 'food',
  'score': 0.899761,
  'word': ' garlic,',
  'start': 113,
  'end': 120},
 {'entity_group': 'food',
  'score': 0.9991276,
  'word': ' ground cumin;',
  'start': 145,
  'end': 158},
 {'entity_group': 'food',
  'score': 0.9599212,
  'word': ' canola oil;',
  'start': 164,
  'end': 175},
 {'entity_group': 'food',
  'score': 0.9997432,
  'word': ' pork chops,',
  'start': 185,
  'end': 196},
 {'entity_group': 'food',
  'score': 0.99811196,
  'word': ' butterflied',
  'start': 197,
  'end': 208},
 {'entity_group': 'food',
  'score': 0.9994885,
  'word': ' Salt',
  'start': 227,
  'end': 231},
 {'entity_group': 'food',
  'score': 0.99917984,
  'word': ' gro

In [45]:
from evaluate import load
r_squared = load('r_squared')
r_squared.compute(predictions=[1, 2, 3, 4], references=[0.9, 2.1, 3.2, 3.8])

0.98

# NER PIPELINE

In [96]:
from transformers import pipeline
pipe = pipeline(task="ner", model=model1.to("cpu"), binary_output=True, tokenizer=tokenizer)
def ner_pipe(test_sent):
    out = pipe(test_sent)
    temp = ""
    switch = False
    for i in out:
        if i['entity'] != "None":
            temp += (i['word'])
        elif temp:
            print(temp.replace("Ġ", " ").strip())
            temp = ""
    if temp:
        print(temp.replace("Ġ", " ").strip())
tsent = ("In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoons, drop and shape into 30 clusters on wax paper. Let stand until firm, about 30 minutes.")
ner_pipe(tsent)

brown sugar
nuts
evaporated milk
butter
margarine
vanilla
cereal


In [19]:
tr_accuracy = 0
train_loss= []
train_acc= []
with torch.no_grad():
    model1.to(device)
    for batch_idx, (ids, mask, tokentype, targets) in enumerate(test_loader):
        outputs = model1(input_ids=ids, attention_mask=mask, token_type_ids=tokentype, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model1.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        # print(outputs)
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
        train_loss.append(loss.item())
        train_acc.append(tmp_tr_accuracy)
        
    # print(torch.argmax(outputs.logits, axis=1))

In [21]:
tlen = len(test_loader)
print(tr_accuracy/tlen)
print(np.mean(train_loss))

0.9913656615559
0.0024432526265883428


In [25]:
plosses= []
plogits = []
with torch.no_grad():
    model1.to(device)
    for batch_idx, (ids, mask, tokentype, targets) in enumerate(test_loader):
        if batch_idx%100==0:
            print(batch_idx, end='\r')
        outputs = model1(input_ids=ids, attention_mask=mask, token_type_ids=tokentype, labels=targets)
        # loss, tr_logits = outputs.loss, outputs.logits
        plosses.append(outputs.loss)
        plogits.append(outputs.logits)

7700

In [42]:
preds = [torch.argmax(p.view(-1, model1.num_labels), axis=1) for p in plogits]

In [45]:
preds = [p.tolist() for p in preds]

In [50]:
ptags = data['tag'].tolist()

In [61]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
# target_names = ['class 0', 'class 1', 'class 2']
mlb = MultiLabelBinarizer()
# tagsml = [j for i in ptags for j in i]
predsml = [j for i in preds for j in i]

ValueError: Found input variables with inconsistent numbers of samples: [119828, 2312400]

In [64]:
tagsml = []
for _,(_,_,_,t) in enumerate(test_loader):
    tagsml.append(t)

In [72]:
ttt = [p.tolist()[0] for p in tagsml]

In [75]:
tags_stacked = [j for i in ttt for j in i]

In [77]:
target_names = ['O', 'Food-B', 'Food-I']
print(classification_report(tags_stacked, predsml, target_names=target_names))


              precision    recall  f1-score   support

           O       1.00      1.00      1.00   2283356
      Food-B       0.97      0.98      0.97     24047
      Food-I       0.92      0.89      0.91      4997

    accuracy                           1.00   2312400
   macro avg       0.96      0.96      0.96   2312400
weighted avg       1.00      1.00      1.00   2312400



In [100]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import json

toker = AutoTokenizer.from_pretrained("Dizex/InstaFoodRoBERTa-NER")
model_NER = AutoModelForTokenClassification.from_pretrained("Dizex/InstaFoodRoBERTa-NER")

pipe_NER = pipeline("ner", model=model_NER, tokenizer=tokenizer, binary_output=True)

In [132]:
# iids = tokenizer(test_sent)['input_ids']
# iids = torch.tensor([iids], dtype=torch.long)
# iids
model_NER.to(device)
pllm_out = []
with torch.no_grad():
    # for i, (iids, _,)
    # out = model_NER(input_ids = iids)
    for batch_idx, (ids, mask, tokentype, targets) in enumerate(test_loader):
        if batch_idx%100==0:
            print(batch_idx, end='\r')
        outputs = model_NER(input_ids=ids, attention_mask=mask, token_type_ids=tokentype)
        pllm_out.append([torch.argmax(p.view(-1, model1.num_labels), axis=1) for p in outputs.logits][0].tolist())
    # out.logits

7700

In [139]:
pllm_out_stacked = [j for i in pllm_out for j in i]
print(classification_report(tags_stacked, pllm_out_stacked, target_names=target_names))

              precision    recall  f1-score   support

           O       1.00      0.90      0.95   2283356
      Food-B       0.10      0.96      0.18     24047
      Food-I       0.20      0.86      0.33      4997

    accuracy                           0.90   2312400
   macro avg       0.43      0.91      0.49   2312400
weighted avg       0.99      0.90      0.94   2312400



In [119]:
next(iter(test_loader))

(tensor([[    0,  1121,  2576,     9,   508,  3023,   361,  3023,   132,    12,
           3796, 14814,  8847,     6,  2504,   112,    73,   176,     9,  4884,
          12652,     4,  3107,    19,   195, 17082, 19485,     8,   299,    19,
            112,    73,   176,  9039, 16037,  7134, 12652,     4,  3107,    19,
            112,    73,   176,  4436, 16916,  2722,  7134,     6,  7230,  4477,
           2961,     6,  3558,    19,  7134,     4, 24138,     6, 13526,     6,
             11, 10088,  4176, 12941,   291,     7,   389,   728,     4,     2,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,   

In [107]:
out = pipe_NER("1 c. firmly packed brown sugar, 0.5 c. evaporated milk, 0.5 tsp. vanilla, 0.5 c. broken nuts (pecans), 2 Tbsp. butter or margarine, 3.5 c. bite size shredded rice biscuits")
for p in out:
    print(p)
test_sent = ("1 c. firmly packed brown sugar, 0.5 c. evaporated milk, 0.5 tsp. vanilla, 0.5 c. broken nuts (pecans), 2 Tbsp. butter or margarine, 3.5 c. bite size shredded rice biscuits")
tokenizer.convert_ids_to_tokens([int(i) for i in tokenizer(test_sent)['input_ids']])
# print(tokenizer(test_sent)['input_ids'])

{'entity': 'B-FOOD', 'score': 0.9998061, 'index': 6, 'word': 'Ġbrown', 'start': 19, 'end': 24}
{'entity': 'I-FOOD', 'score': 0.9995023, 'index': 7, 'word': 'Ġsugar', 'start': 25, 'end': 30}
{'entity': 'B-FOOD', 'score': 0.9823137, 'index': 16, 'word': 'Ġmilk', 'start': 50, 'end': 54}
{'entity': 'B-FOOD', 'score': 0.9998919, 'index': 23, 'word': 'Ġvanilla', 'start': 65, 'end': 72}
{'entity': 'B-FOOD', 'score': 0.9999249, 'index': 31, 'word': 'Ġnuts', 'start': 88, 'end': 92}
{'entity': 'B-FOOD', 'score': 0.9999355, 'index': 33, 'word': 'pec', 'start': 94, 'end': 97}
{'entity': 'B-FOOD', 'score': 0.99996436, 'index': 34, 'word': 'ans', 'start': 97, 'end': 100}
{'entity': 'B-FOOD', 'score': 0.9999305, 'index': 40, 'word': 'Ġbutter', 'start': 111, 'end': 117}
{'entity': 'B-FOOD', 'score': 0.9999244, 'index': 42, 'word': 'Ġmarg', 'start': 121, 'end': 125}
{'entity': 'B-FOOD', 'score': 0.99993443, 'index': 43, 'word': 'arine', 'start': 125, 'end': 130}
{'entity': 'B-FOOD', 'score': 0.9999249,

['<s>',
 '1',
 'Ġc',
 '.',
 'Ġfirmly',
 'Ġpacked',
 'Ġbrown',
 'Ġsugar',
 ',',
 'Ġ0',
 '.',
 '5',
 'Ġc',
 '.',
 'Ġevapor',
 'ated',
 'Ġmilk',
 ',',
 'Ġ0',
 '.',
 '5',
 'Ġtsp',
 '.',
 'Ġvanilla',
 ',',
 'Ġ0',
 '.',
 '5',
 'Ġc',
 '.',
 'Ġbroken',
 'Ġnuts',
 'Ġ(',
 'pec',
 'ans',
 '),',
 'Ġ2',
 'ĠT',
 'bsp',
 '.',
 'Ġbutter',
 'Ġor',
 'Ġmarg',
 'arine',
 ',',
 'Ġ3',
 '.',
 '5',
 'Ġc',
 '.',
 'Ġbite',
 'Ġsize',
 'Ġshredded',
 'Ġrice',
 'Ġbiscuits',
 '</s>']

In [151]:
out = pipe("1 c. firmly packed brown sugar, 0.5 c. evaporated milk, 0.5 tsp. vanilla, 0.5 c. broken nuts (pecans), 2 Tbsp. butter or margarine, 3.5 c. bite size shredded rice biscuits")
for p in out:
    print(p)

{'entity_group': 'food', 'score': 0.99923706, 'word': ' brown', 'start': 19, 'end': 24}
{'entity_group': 'food', 'score': 0.9984503, 'word': ' nuts', 'start': 88, 'end': 92}
{'entity_group': 'food', 'score': 0.99988854, 'word': ' butter', 'start': 111, 'end': 117}
{'entity_group': 'food', 'score': 0.66647196, 'word': ' margarine,', 'start': 121, 'end': 131}
{'entity_group': 'food', 'score': 0.99954844, 'word': ' rice biscuits', 'start': 158, 'end': 171}


In [152]:
model1.to("cpu")

ner_pipe("1 c. firmly packed brown sugar, 0.5 c. evaporated milk, 0.5 tsp. vanilla, 0.5 c. broken nuts (pecans), 2 Tbsp. butter or margarine, 3.5 c. bite size shredded rice biscuits")
# pipe("1 c. firmly packed brown sugar, 0.5 c. evaporated milk, 0.5 tsp. vanilla, 0.5 c. broken nuts (pecans), 2 Tbsp. butter or margarine, 3.5 c. bite size shredded rice biscuits")

KeyError: 'entity'