In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Natural Language Toolkit (nltk) 

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
user_intent="I want 10g fats content, 60g calorie content, 500g sugar content. The dish should be non veg."

### Remove punctuation

In [4]:
import string
string.punctuation
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
punc=remove_punctuation(user_intent)
print(punc)

I want 10g fats content 60g calorie content 500g sugar content The dish should be non veg


### Convert to lower

In [5]:
low=punc.lower()
print(low)

i want 10g fats content 60g calorie content 500g sugar content the dish should be non veg


### Tokenization

In [10]:
import re

def tokenization(text):
    result = word_tokenize(text)
    return result


tokens = tokenization(low)
print(tokens)


['i', 'want', '10g', 'fats', 'content', '60g', 'calorie', 'content', '500g', 'sugar', 'content', 'the', 'dish', 'should', 'be', 'non', 'veg']


### Stopwords Removal

In [11]:
#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
#applying the function
remove_stop=remove_stopwords(tokens)
print(remove_stop)

['want', '10g', 'fats', 'content', '60g', 'calorie', 'content', '500g', 'sugar', 'content', 'dish', 'non', 'veg']


### POS- Tagging

In [12]:
final = nltk.pos_tag(tokens)
print(final)

[('i', 'NN'), ('want', 'VBP'), ('10g', 'CD'), ('fats', 'NNS'), ('content', 'JJ'), ('60g', 'CD'), ('calorie', 'NN'), ('content', 'NN'), ('500g', 'CD'), ('sugar', 'NN'), ('content', 'NN'), ('the', 'DT'), ('dish', 'NN'), ('should', 'MD'), ('be', 'VB'), ('non', 'JJ'), ('veg', 'NN')]


### Adjective-Noun Extraction
This code uses NLTK to extract noun phrases from tagged text based on a defined grammar pattern. It identifies pairs of adjectives/CD and nouns or hyphenated nouns within the extracted noun phrases and prints them.

In [13]:

from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser  # Import RegexpParser



# Define a grammar pattern for NP (noun phrase) that consists of an optional adjective (JJ) or cardinal number (CD) followed by a noun (NN) or a hyphenated noun (NN-NN)
grammar = r'NP: {<JJ|CD>?<NN|NNS>-<NN|NNS>|<JJ|CD>?<NN|NNS>}'

# Create a RegexpParser with the defined grammar
chunk_parser = RegexpParser(grammar)

# Parse the tagged text to extract noun phrases
tree = chunk_parser.parse(final)

# Initialize an empty list to store the adjective/CD-noun or hyphenated noun pairs
pairs = []

# Traverse the parse tree to find noun phrases and extract adjectives/CD and nouns or hyphenated nouns
for subtree in tree.subtrees():
    if subtree.label() == 'NP':
        # Get the words in the noun phrase
        words = [word for word, pos in subtree.leaves()]
        # Find and extract the adjective/CD (if it exists)
        adj_cd = [word for word, pos in subtree.leaves() if pos in ['JJ', 'CD']]
        # Find and extract the noun or hyphenated noun
        noun = ' '.join(word for word, pos in subtree.leaves() if pos in ['NN', 'NNS'])
        if noun:
            # If adjective/CD and noun or hyphenated noun are found, add them to the pairs list
            if adj_cd:
                pairs.append((adj_cd, noun))
            else:
                # If there is no adjective or CD, add a placeholder (None)
                pairs.append((None, noun))

# Now, pairs contains the pairs of adjectives/CD or hyphenated nouns with nouns
for pair in pairs:
    print(pair)


(None, 'i')
(['10g'], 'fats')
(['60g'], 'calorie')
(None, 'content')
(['500g'], 'sugar')
(None, 'content')
(None, 'dish')
(['non'], 'veg')


### Converting to input form
The code extracts nutritional information and veg-nonveg status from pairs, updating a dictionary and printing the results.

In [14]:
# Initialize your dictionary with keys and set their values to None
nutrition_data = {
    "Calories": None,
    "FatsContent": None,
    "CholesterolContent": None,
    "SodiumContent": None,
    "CarbohydrateContent": None,
    "FiberContent": None,
    "SugarContent": None,
    "ProteinContent": None,
    "veg-nonveg": None,
    "ingredients": None
}


# Variable to track if "veg" or "non" is found in the pairs
veg_found = False
non_found = False

# Update the dictionary values based on the pairs
for adjective_cd, noun in pairs:
    if noun and adjective_cd:
        noun_lower = noun.lower()  # Convert to lowercase for case-insensitive comparison
        for key in nutrition_data.keys():
            if noun_lower in key.lower():
                nutrition_data[key] = adjective_cd[0]
        if "veg" in noun_lower:
            veg_found = True
        elif "non" in noun_lower:
            non_found = True

# Update "veg-nonveg" based on the presence of "veg" or "non"
if veg_found and not non_found:
    nutrition_data["veg-nonveg"] = "veg"
elif non_found and not veg_found:
    nutrition_data["veg-nonveg"] = "nonveg"

# Print the updated dictionary
print("Updated Nutrition Data:")
for key, value in nutrition_data.items():
    print(f"{key}: {value}")

Updated Nutrition Data:
Calories: 60g
FatsContent: 10g
CholesterolContent: None
SodiumContent: None
CarbohydrateContent: None
FiberContent: None
SugarContent: 500g
ProteinContent: None
veg-nonveg: veg
ingredients: None
