<a href="https://colab.research.google.com/github/Dhruv-Limbani/Indian-Food-Recommendation-based-on-Ingredients/blob/master/NER_for_Ingredients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

https://github.com/taisti/TASTEset/tree/main

In [None]:
# Upgrade spaCy and install the spaCy/transformers integration.
# NOTE(review): both installs are unpinned. The captured log shows spaCy 3.8.0
# replacing 3.7.6 and pip reporting a conflict with en-core-web-sm 3.7.1
# (which requires spacy<3.8.0) - consider pinning versions for reproducibility.
!pip install -U spacy
!pip install spacy_transformers

Collecting spacy
  Downloading spacy-3.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Downloading spacy-3.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.0/29.0 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.6
    Uninstalling spacy-3.7.6:
      Successfully uninstalled spacy-3.7.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.7.1 requires spacy<3.8.0,>=3.7.2, but you have spacy 3.8.0 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.8.0
Collecting spacy_transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Colle

In [None]:
# Import the libraries used to build the spaCy training corpus
# (packages themselves are installed in the preceding cell).
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json
import os

# Print the spaCy version that is actually active in this kernel
print(spacy.__version__)

# Show the GPU attached to this runtime (training later passes --gpu-id 0)
!nvidia-smi

3.8.0
Fri Sep 13 02:15:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                              

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to the project folder so that relative paths
# such as 'TASTEset.csv' resolve inside it
%cd "/content/drive/MyDrive/Custom_Ner_For_Ingredients"

Mounted at /content/drive
/content/drive/MyDrive/Custom_Ner_For_Ingredients


In [None]:
# nervaluate supplies the Evaluator class imported by the utilities cell.
# NOTE(review): unpinned; the captured log shows version 0.2.0 being installed.
!pip install nervaluate

Collecting nervaluate
  Downloading nervaluate-0.2.0-py3-none-any.whl.metadata (15 kB)
Downloading nervaluate-0.2.0-py3-none-any.whl (13 kB)
Installing collected packages: nervaluate
Successfully installed nervaluate-0.2.0


In [None]:
import pandas as pd
import os
import json
import spacy
from spacy.training import biluo_tags_to_offsets, offsets_to_biluo_tags
from nervaluate import Evaluator


# Shared spaCy pipeline: the conversion helpers below call it to tokenize a
# recipe so character spans and per-token tags can be mapped onto each other.
NLP = spacy.load('en_core_web_sm')
# Entity labels handed to nervaluate's Evaluator as the set of tags to score.
ENTITIES = ["FOOD", "QUANTITY", "UNIT", "PROCESS", "PHYSICAL_QUALITY", "COLOR",
            "TASTE", "PURPOSE", "PART"]


def prepare_data(taste_set, entities_format="spans"):
    """
    Load TASTEset recipes together with their entity annotations.

    :param taste_set: TASTEset as pd.DataFrame or a path to the TASTEset CSV.
    The table must provide the columns "ingredients" (recipe text) and
    "ingredients_entities" (a JSON list of dicts with the keys "start", "end"
    and "type").
    :param entities_format: the format of entities. If equal to 'bio', entities
    will be of the following format: [[B-FOOD, I-FOOD, O, ...], [B-UNIT, ...]]
    and each recipe is replaced by its list of tokens.
    If equal to 'spans', entities will be of the following format:
    [[(0, 6, FOOD), (10, 15, PROCESS), ...], [(0, 2, UNIT), ...]]
    :return: list of recipes and corresponding list of entities
    :raises ValueError: if taste_set is neither a DataFrame nor an existing path
    """

    assert entities_format in ["bio", "spans"],\
        'You provided incorrect entities format!'
    if isinstance(taste_set, pd.DataFrame):
        df = taste_set
    elif isinstance(taste_set, str) and os.path.exists(taste_set):
        df = pd.read_csv(taste_set)
    else:
        raise ValueError('Incorret TASTEset format!')

    all_recipes = df["ingredients"].to_list()
    raw_annotations = df["ingredients_entities"].to_list()
    all_entities = []

    # Walk the rows by position rather than by index label: the lists above are
    # positional, so label-based lookups break on filtered, re-indexed or
    # duplicate-labelled DataFrames.
    for position, raw in enumerate(raw_annotations):
        entities = [(entity_dict["start"], entity_dict["end"],
                     entity_dict["type"])
                    for entity_dict in json.loads(raw)]

        if entities_format == "bio":
            tokenized_recipe, entities = span_to_bio(all_recipes[position],
                                                     entities)
            all_recipes[position] = tokenized_recipe

        all_entities.append(entities)

    return all_recipes, all_entities


def bio_to_biluo(bio_entities):
    """
    Convert a BIO tag sequence into the BILUO scheme.

    A tag becomes "U-" (B- not followed by an I- tag) or "L-" (I- not followed
    by an I- tag); every other tag is copied unchanged.

    :param bio_entities: list of BIO entities, eg. ["O", "B-FOOD", "I-FOOD",
    "B-PROCESS"]
    :return: list of BILUO entities, eg. ["O", "B-FOOD", "L-FOOD", "U-PROCESS"]
    """
    biluo_entities = []
    last_idx = len(bio_entities) - 1

    for entity_idx, cur_entity in enumerate(bio_entities):
        # Look one tag ahead; the final tag has no successor.
        next_entity = bio_entities[entity_idx + 1] if entity_idx < last_idx \
            else ""
        entity_continues = next_entity.startswith("I-")

        # Only the two-character prefix is swapped (plain slicing, no regex),
        # so a label that happens to contain "B-" or "I-" is left intact.
        if cur_entity.startswith("B-") and not entity_continues:
            biluo_entities.append("U-" + cur_entity[2:])
        elif cur_entity.startswith("I-") and not entity_continues:
            biluo_entities.append("L-" + cur_entity[2:])
        else:  # O, or a B-/I- tag whose entity continues on the next token
            biluo_entities.append(cur_entity)

    return biluo_entities


def biluo_to_span(recipe, biluo_entities):
    """
    Translate per-token BILUO tags into character-offset spans.

    :param recipe: recipe text, tokenized here with the module-level NLP
    pipeline
    :param biluo_entities: list of BILUO entities, eg. ["O", "B-FOOD",
    "L-FOOD", "U-PROCESS"], one per token of the recipe
    :return: list of span entities, eg. [(span_start, span_end, "FOOD"),
    (span_start, span_end, "PROCESS")]
    """
    # NOTE(review): span_to_biluo tokenizes recipe.replace("\n", " ") whereas
    # the text is tokenized as given here - confirm the tags passed in were
    # produced against the same tokenization.
    return biluo_tags_to_offsets(NLP(recipe), biluo_entities)


def bio_to_span(recipe, bio_entities):
    """
    Translate per-token BIO tags into character-offset spans by going through
    the BILUO representation.

    :param recipe: recipe text the tags belong to
    :param bio_entities: list of BIO entities, eg. ["O", "B-FOOD", "I-FOOD",
    "B-PROCESS"]
    :return: list of span entities, eg. [(span_start, span_end, "FOOD"),
    (span_start, span_end, "PROCESS")]
    """
    return biluo_to_span(recipe, bio_to_biluo(bio_entities))


def span_to_biluo(recipe, span_entities):
    """
    Tokenize a recipe and express its character spans as per-token BILUO tags.

    :param recipe: recipe text
    :param span_entities: list of span entities, eg. [(span_start, span_end,
    "FOOD"), (span_start, span_end, "PROCESS")]
    :return: tuple of (tokenized recipe, list of BILUO entities), eg.
    ["O", "B-FOOD", "L-FOOD", "U-PROCESS"]
    """
    # Newlines are swapped for single spaces before tokenizing; the swap is
    # one character for one character, so the span offsets keep pointing at
    # the same text.
    single_line = recipe.replace("\n", " ")
    doc = NLP(single_line)
    tags = offsets_to_biluo_tags(doc, span_entities)
    tokens = [tok.text for tok in doc]
    return tokens, tags


def biluo_to_bio(biluo_entities):
    """
    Collapse BILUO tags into the BIO scheme ("L-" becomes "I-", "U-" becomes
    "B-").

    :param biluo_entities: list of BILUO entities, eg. ["O", "B-FOOD", "L-FOOD",
    "U-PROCESS"]
    :return: list of BIO entities, eg. ["O", "B-FOOD", "I-FOOD", "B-PROCESS"]
    """
    bio_entities = []
    for tag in biluo_entities:
        without_last = tag.replace("L-", "I-")
        bio_entities.append(without_last.replace("U-", "B-"))
    return bio_entities


def span_to_bio(recipe, span_entities):
    """
    Tokenize a recipe and express its character spans as per-token BIO tags.

    :param recipe: recipe text
    :param span_entities: list of span entities, eg. [(span_start, span_end,
    "FOOD"), (span_start, span_end, "PROCESS")]
    :return: tuple of (tokenized recipe, list of BIO entities), eg.
    ["O", "B-FOOD", "I-FOOD", "B-PROCESS"]
    """
    tokens, biluo_tags = span_to_biluo(recipe, span_entities)
    return tokens, biluo_to_bio(biluo_tags)


def spans_to_prodigy_spans(list_of_spans):
    """
    Convert (start, end, label) tuples into the dict-based span format
    expected by nervaluate: {"label": ..., "start": ..., "end": ...}.

    :param list_of_spans: one list of (start, end, label) tuples per document
    :return: one list of span dicts per document, in the same order
    """
    return [
        [{"label": label, "start": begin, "end": stop}
         for begin, stop, label in document_spans]
        for document_spans in list_of_spans
    ]


def evaluate_predictions(true_entities, pred_entities, entities_format):
    """
    Score predicted entities against the gold entities with nervaluate and
    keep only the "strict" results.

    :param true_entities: list of true entities
    :param pred_entities: list of predicted entities
    :param entities_format: format of provided entities. If equal to 'bio',
    entities are expected of the following format: [[B-FOOD, I-FOOD, O, ...],
    [B-UNIT, ...]]. If equal to 'spans', entities are expected of the following
    format: [[(0, 6, FOOD), (10, 15, PROCESS), ...], [(0, 2, UNIT), ...]]
    :return: dict mapping each entity label to its "strict" results, plus the
    key "all" holding the overall "strict" results
    """

    assert entities_format in ["bio", "spans"],\
        'You provided incorrect entities format!'

    if entities_format == "spans":
        true_entities = spans_to_prodigy_spans(true_entities)
        pred_entities = spans_to_prodigy_spans(pred_entities)

        evaluator = Evaluator(true_entities, pred_entities, tags=ENTITIES)
    else:
        evaluator = Evaluator(true_entities, pred_entities, tags=ENTITIES,
                              loader="list")

    # Some nervaluate releases return extra items after the first two (result
    # indices); the starred target accepts both the 2-tuple and longer forms.
    results, results_per_tag, *_ = evaluator.evaluate()

    strict_per_tag = {entity: tag_results["strict"]
                      for entity, tag_results in results_per_tag.items()}
    strict_per_tag["all"] = results["strict"]
    return strict_per_tag

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
import pandas as pd

# Preview the raw TASTEset table. The file is read without index_col so that
# "ingredients" stays a regular column next to "ingredients_entities", which is
# the layout prepare_data reads the same file with.
df = pd.read_csv('TASTEset.csv')
df.head()

Unnamed: 0_level_0,ingredients_entities
ingredients,Unnamed: 1_level_1
5 ounces rum\n4 ounces triple sec\n3 ounces Tia Maria\n20 ounces orange juice\n,"[{""start"": 0, ""end"": 1, ""type"": ""QUANTITY"", ""e..."
"2 tubes cinnamon roll, refrigerated, with icing\n4 tablespoons butter, melted\n6 eggs\n½ cup milk\n2 teaspoons cinnamon\n2 teaspoons vanilla\n1 cup maple syrup\n","[{""start"": 0, ""end"": 1, ""type"": ""QUANTITY"", ""e..."
4 ripe coconuts\n1 cup evaporated milk\n1 cup gin\n3 tablespoons sugar (optional)\n1 teaspoon ground cinnamon\n1⁄2 teaspoon freshly grated nutmeg\n,"[{""start"": 0, ""end"": 1, ""type"": ""QUANTITY"", ""e..."
1 sheet graham cracker (broken in half)\n2 pieces milk chocolate candy bars\n1 marshmallows\n6 -8 fresh blueberries\n2 thick strawberry slices\n,"[{""start"": 0, ""end"": 1, ""type"": ""QUANTITY"", ""e..."
1 (8 ounce) package crescent rolls\n8 slices deli ham\n4 teaspoons prepared mustard\n1 cup shredded swiss cheese or 1 cup cheddar cheese\n2 tablespoons sesame seeds (optional)\n,"[{""start"": 0, ""end"": 1, ""type"": ""QUANTITY"", ""e..."


In [None]:
# Load the recipe texts and their annotations; with the default
# entities_format="spans" each annotation is a (start, end, label) tuple.
recipes, entities = prepare_data("TASTEset.csv")

In [None]:
# Inspect the first recipe alongside its (start, end, label) spans
recipes[0], entities[0]

('5 ounces rum\n4 ounces triple sec\n3 ounces Tia Maria\n20 ounces orange juice\n',
 [(0, 1, 'QUANTITY'),
  (2, 8, 'UNIT'),
  (9, 12, 'FOOD'),
  (13, 14, 'QUANTITY'),
  (15, 21, 'UNIT'),
  (22, 32, 'FOOD'),
  (33, 34, 'QUANTITY'),
  (35, 41, 'UNIT'),
  (42, 51, 'FOOD'),
  (52, 54, 'QUANTITY'),
  (55, 61, 'UNIT'),
  (62, 74, 'FOOD')])

In [None]:
# Reshape the data into [text, {'entities': [[start, end, label], ...]}] pairs,
# the layout that get_spacy_doc unpacks.
TD = []
for i, recipe_text in enumerate(recipes):
  span_lists = [list(span) for span in entities[i]]
  TD.append([recipe_text, {'entities': span_lists}])

In [None]:
# Sanity-check the first converted example
TD[0]

['5 ounces rum\n4 ounces triple sec\n3 ounces Tia Maria\n20 ounces orange juice\n',
 {'entities': [[0, 1, 'QUANTITY'],
   [2, 8, 'UNIT'],
   [9, 12, 'FOOD'],
   [13, 14, 'QUANTITY'],
   [15, 21, 'UNIT'],
   [22, 32, 'FOOD'],
   [33, 34, 'QUANTITY'],
   [35, 41, 'UNIT'],
   [42, 51, 'FOOD'],
   [52, 54, 'QUANTITY'],
   [55, 61, 'UNIT'],
   [62, 74, 'FOOD']]}]

In [None]:
def get_spacy_doc(file, data):
  """
  Convert annotated examples into a spaCy DocBin.

  :param file: writable text file object used as an error log
  :param data: iterable of [text, {'entities': [[start, end, label], ...]}]
  :return: DocBin holding one Doc per example whose entities could be set
  """
  # Create a blank spaCy pipeline (tokenizer only)
  nlp = spacy.blank('en')
  db = DocBin()

  # Iterate over the examples passed in, not the notebook-level TD list;
  # otherwise every call would serialize the complete dataset.
  for text, annotations in data:
    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in annotations['entities']:
      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best effort: record the failure and move on to the next annotation
        file.write(str([start, end]) + "    " + repr(exc) + "    " + str(text) + "\n")
        continue

      if span is None:
        # Log annotations whose offsets do not line up with token boundaries
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # Best effort: the whole example is skipped, but the reason is logged
      file.write(repr(exc) + "    " + str(text) + "\n")

  return db

In [None]:
# Split the annotated data into training and testing sets
from sklearn.model_selection import train_test_split
# A fixed random_state keeps the split identical across kernel restarts
train, test = train_test_split(TD, test_size=0.2, random_state=42)

# Display the number of items in the training and testing sets
print(len(train), len(test))

# Open a file to log errors during annotation processing; the with-block
# closes it even if the conversion fails part-way through
with open('/content/drive/MyDrive/Custom_Ner_For_Ingredients/train_file.txt','w') as file:
  # Create spaCy DocBin objects for training and testing data
  db = get_spacy_doc(file, train)
  db.to_disk('/content/drive/MyDrive/Custom_Ner_For_Ingredients/train_data.spacy')

  db = get_spacy_doc(file, test)
  db.to_disk('/content/drive/MyDrive/Custom_Ner_For_Ingredients/test_data.spacy')



In [None]:
# Expand base_config.cfg into a complete training config (default_config.cfg)
!python -m spacy init fill-config /content/drive/MyDrive/Custom_Ner_For_Ingredients/base_config.cfg /content/drive/MyDrive/Custom_Ner_For_Ingredients/default_config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/Custom_Ner_For_Ingredients/default_config.cfg
You can now add your data and train your pipeline:
python -m spacy train default_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train a spaCy NER model on GPU 0 using the filled config and the serialized data.
# NOTE(review): test_data.spacy is passed as --paths.dev, so the held-out split
# doubles as the development set - confirm whether a separate final test set is wanted.
!python -m spacy train /content/drive/MyDrive/Custom_Ner_For_Ingredients/default_config.cfg  --output /content/drive/MyDrive/Custom_Ner_For_Ingredients/output  --paths.train /content/drive/MyDrive/Custom_Ner_For_Ingredients/train_data.spacy  --paths.dev /content/drive/MyDrive/Custom_Ner_For_Ingredients/test_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/Custom_Ner_For_Ingredients/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 178kB/s]
config.json: 100% 481/481 [00:00<00:00, 3.91MB/s]
vocab.json: 100% 899k/899k [00:00<00:00, 3.41MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 2.29MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 4.10MB/s]
  _torch_pytree._register_pytree_node(
model.safetensors: 100% 499M/499M [00:09<00:00, 51.6MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast(self._mixed_precision):
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer'

In [None]:
# Load the model-best pipeline from the training output directory
nlp = spacy.load('/content/drive/MyDrive/Custom_Ner_For_Ingredients/output/model-best')


  _torch_pytree._register_pytree_node(
  self._model.load_state_dict(torch.load(filelike, map_location=device))


In [None]:
# Sample ingredient list; items are separated by commas with no following space
strg = "6 Karela (Bitter Gourd/ Pavakkai) - deseeded,Salt - to taste,1 Onion - thinly sliced,3 tablespoon Gram flour (besan),2 teaspoons Turmeric powder (Haldi),1 tablespoon Red Chilli powder,2 teaspoons Cumin seeds (Jeera),1 tablespoon Coriander Powder (Dhania),1 tablespoon Amchur (Dry Mango Powder),Sunflower Oil - as required"
# Run the sample text through the loaded spaCy NER model.
# NOTE(review): the captured output contains entities such as "to taste,1" that
# straddle two items; the disabled normalization below is one candidate remedy - TODO evaluate.
# strg = strg.replace("-", " ").replace(","," ")
doc = nlp(strg)

# Iterate through the named entities recognized by the model
for ent in doc.ents:
  # Print the recognized text and its corresponding label
  print(ent.text, "  ->>>>  ", ent.label_)

6   ->>>>   QUANTITY
Karela   ->>>>   FOOD
deseeded   ->>>>   PROCESS
Salt   ->>>>   FOOD
to taste,1   ->>>>   PURPOSE
Onion   ->>>>   FOOD
thinly sliced,3   ->>>>   PROCESS
tablespoon   ->>>>   UNIT
Gram flour   ->>>>   FOOD
teaspoons   ->>>>   UNIT
Turmeric powder   ->>>>   FOOD
tablespoon   ->>>>   UNIT
Red Chilli powder,2   ->>>>   FOOD
teaspoons   ->>>>   UNIT
Cumin seeds   ->>>>   FOOD
tablespoon   ->>>>   UNIT
Coriander Powder   ->>>>   FOOD
tablespoon   ->>>>   UNIT
Amchur   ->>>>   FOOD
Oil   ->>>>   FOOD


In [None]:
# Save the loaded pipeline to the final_model directory on Drive
nlp.to_disk("/content/drive/MyDrive/Custom_Ner_For_Ingredients/final_model")