In [1]:
from gensim.models import Word2Vec,FastText
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from tensorflow import keras
import re
from nltk.stem import WordNetLemmatizer


from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding,LSTM,Dense,Input,Bidirectional,Attention,Concatenate,TimeDistributed

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)
# Seed the global NumPy and TensorFlow random generators with a fixed value.
np.random.seed(42)
tf.random.set_seed(42)


In [3]:
# Load the train and dev splits; each file holds one JSON record per line.
main_train = pd.read_json(
    "dataset/PIZZA_train.json",
    lines=True,
)
df_dev = pd.read_json(
    "dataset/PIZZA_dev.json",
    lines=True,
)

In [50]:
# Work on a random 100-row subset of the training frame.
# NOTE(review): no random_state is passed, so the rows drawn presumably follow
# the global NumPy seed set in the first cell — confirm before relying on
# specific index labels further down.
df_train = main_train.sample(100)

In [6]:
# Surface forms of the word "pizza". The same set is assigned again in the
# negation / stop-word cell further down.
pizza = {"pizza", "pizzas", "pie", "pies"}

In [12]:
def clean_text(text):
    """Normalise raw text for the downstream preprocessing steps.

    Every character that is neither a word character nor an apostrophe is
    turned into a space, runs of whitespace are collapsed to one space, and
    the result is lowercased and trimmed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    without_punct = re.sub(r"[^\w']", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_punct)
    return single_spaced.lower().strip()

In [13]:
# Module-level WordNet lemmatizer instance used by lemma() below.
lemmatizer = WordNetLemmatizer()

def lemma(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(lemmatizer.lemmatize(token) for token in text.split())

In [7]:
# Contracted fragments mapped to their expansion. A matched fragment is
# replaced by a space followed by the expansion (e.g. "don't" -> "do not").
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}

# Negations whose stem changes when contracted. Handling only the "n't"
# suffix would turn these into non-words ("ca not", "wo not", "sha not"),
# so they are replaced as whole words instead.
IRREGULAR_CONTRACTIONS = {
    "can't": "can not",
    "won't": "will not",
    "shan't": "shall not",
}

# Compiled once at definition time instead of on every call. The irregular
# whole-word forms are tried first; the suffix keys are ordered longest-first
# so a short key can never pre-empt a longer one that shares its prefix.
_CONTRACTION_PATTERN = re.compile(
    r"\b(?P<irregular>"
    + "|".join(re.escape(key) for key in IRREGULAR_CONTRACTIONS)
    + r")\b|(?P<suffix>"
    + "|".join(re.escape(key) for key in sorted(CONTRACTIONS, key=len, reverse=True))
    + r")"
)


def _expand_contraction(match):
    """Return the replacement text for one contraction match."""
    irregular = match.group("irregular")
    if irregular is not None:
        # Whole word replaced, so no separating space is needed.
        return IRREGULAR_CONTRACTIONS[irregular]
    # The fragment is glued to the previous word; insert a space before it.
    return " " + CONTRACTIONS[match.group("suffix")]


def expnad_abb2(text):
    """Expand English contractions in ``text``.

    Matching is case-sensitive against lowercase keys, so the text should
    already be lowercased (``clean_text`` does this).

    Args:
        text: Input string, possibly containing contractions.

    Returns:
        The string with every known contraction expanded, e.g.
        ``"i don't"`` -> ``"i do not"`` and ``"i can't"`` -> ``"i can not"``.
    """
    return _CONTRACTION_PATTERN.sub(_expand_contraction, text)


In [55]:
# Lookup table from English number words to their integer value.
word_to_num = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
    "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
    "fourteen": 14, "fifteen": 15, "sixteen": 16, "seventeen": 17,
    "eighteen": 18, "nineteen": 19, "twenty": 20,
    "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60,
    "seventy": 70, "eighty": 80, "ninety": 90,
    "hundred": 100,
    "thousand": 1000, "million": 1000000, "billion": 1000000000,
}


def words_to_number(word):
    """Convert an English number phrase to an integer.

    The phrase is split on whitespace and hyphens, so ``"twenty-one"``,
    ``"one hundred twenty-three"`` and ``"one-hundred"`` are all handled by
    the same code path. The phrase is only treated as a number when *every*
    part is a known number word; otherwise ``None`` is returned. This keeps
    hyphenated non-numbers such as ``"thin-crust"`` or ``"twenty-ounce"``
    from being turned into ``0`` / ``20``.

    The function does not validate English grammar: parts are combined
    additively, with "hundred" multiplying the running group and
    "thousand"/"million"/"billion" closing it.

    Args:
        word: A string containing a single token or a short phrase.

    Returns:
        The integer value, or ``None`` when ``word`` is not a number phrase.
    """
    parts = word.lower().replace("-", " ").split()
    if not parts or any(part not in word_to_num for part in parts):
        return None

    total = 0  # value of the already closed thousand/million/billion groups
    group = 0  # value of the group currently being accumulated
    for part in parts:
        value = word_to_num[part]
        if value == 100:
            # A bare "hundred" counts as one hundred.
            group = max(group, 1) * 100
        elif value > 100:
            # Scale word: close the current group (a bare scale counts once).
            total += max(group, 1) * value
            group = 0
        else:
            group += value
    return total + group

def standardize_numbers(sentence):
    """Replace number words in ``sentence`` with their digit form.

    The sentence is split on whitespace; each token that
    ``words_to_number`` recognises is replaced by the decimal string of the
    returned value, every other token is kept unchanged.

    Args:
        sentence: Input string.

    Returns:
        The tokens joined by single spaces.
    """
    standardized_tokens = []
    for token in sentence.split():
        # Convert once per token; the result is reused for both the check
        # and the replacement.
        number = words_to_number(token)
        standardized_tokens.append(token if number is None else str(number))
    return " ".join(standardized_tokens)





In [14]:
# Tokens treated as negation cues; std_negation() rewrites each of them to "not".
negation_words = {
    "no", "not", "none", "never", "without", "avoid", "neither",
    "nor", "hate", "hold", "lack", "any", "nothing",
}
pizza = {"pizza", "pizzas", "pie", "pies"}

# Conjunctions that are kept out of the stop-word list.
stop_negation_words = {"and", "but"}

# Start from NLTK's English stop words, keep the negation cues, the
# conjunctions above and "all", then additionally drop a few ordering verbs.
stop_words = set(stopwords.words("english"))
stop_words -= negation_words | stop_negation_words | {'all'}
stop_words |= {"would", "like", "get", "want"}
# stop_words.update(pizza)


def remove_stopwords(text):
    """Drop every token whose lowercase form is in ``stop_words``.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The remaining tokens (original casing) joined by single spaces.
    """
    return " ".join(
        token for token in text.split() if token.lower() not in stop_words
    )

In [15]:
def std_negation(text):
    """Rewrite every negation cue in ``text`` to the single token ``not``.

    A token counts as a cue when it is a member of ``negation_words``; the
    comparison is case-sensitive.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The tokens joined by single spaces, with cues replaced by ``not``.
    """
    return " ".join(
        'not' if token in negation_words else token for token in text.split()
    )

In [56]:
# Build the model input column from the raw source text. The steps run in
# this order: clean -> number words to digits -> expand contractions ->
# drop stop words -> normalise negation cues.
df_train['train.IN'] = (
    df_train['train.SRC']
    .apply(clean_text)
    .apply(standardize_numbers)
    .apply(expnad_abb2)
    .apply(remove_stopwords)
    .apply(std_negation)
)
# The target parse only gets its number words converted to digits
# (presumably so entity text extracted from it matches the input column).
df_train['train.TOP'] = df_train['train.TOP'].apply(standardize_numbers)

In [17]:
# One pattern per entity label. Each matches the text that follows
# "(LABEL " up to, but not including, the whitespace before the closing
# parenthesis, e.g. "(TOPPING balsamic glaze )" -> "balsamic glaze".
# The look-behind must be fixed-width, hence the literal "\(LABEL\s" prefix.
toppings_regex = re.compile(r'(?<=\(TOPPING\s)[^)]*(?=\s)')
number_regex = re.compile(r'(?<=\(NUMBER\s)[^)]*(?=\s)')
size_regex = re.compile(r'(?<=\(SIZE\s)[^)]*(?=\s)')
quantity_regex = re.compile(r'(?<=\(QUANTITY\s)[^)]*(?=\s)')
style_regex = re.compile(r'(?<=\(STYLE\s)[^)]*(?=\s)')
drink_type_regex = re.compile(r'(?<=\(DRINKTYPE\s)[^)]*(?=\s)')
volume_regex = re.compile(r'(?<=\(VOLUME\s)[^)]*(?=\s)')
container_type_regex = re.compile(r'(?<=\(CONTAINERTYPE\s)[^)]*(?=\s)')

In [40]:
def _tag_entities(x, y, entity_regex, tag):
    """Wrap every entity found in ``y`` that also occurs in ``x`` with a tag.

    Entities are extracted from ``y`` with ``entity_regex`` and de-duplicated,
    then all of them are substituted in ``x`` in a single pass. Compared with
    repeated ``str.replace`` calls this avoids two problems:

    * an entity that appears several times in ``y`` no longer gets wrapped
      once per occurrence (``<N> <N> 1 </N> </N>``);
    * an entity no longer matches inside a longer token (``1`` inside ``10``),
      because a match must not be adjacent to a word character.

    Args:
        x: The text to annotate.
        y: The parse string the entities are extracted from.
        entity_regex: Compiled pattern whose ``findall`` yields entity strings.
        tag: Tag name without angle brackets, e.g. ``"T"``.

    Returns:
        ``x`` with each entity occurrence rewritten as ``<tag> entity </tag>``.
    """
    # Empty / whitespace-only matches are skipped: substituting an empty
    # string would insert tags between every character of ``x``.
    entities = {entity.strip() for entity in entity_regex.findall(y)}
    entities.discard("")
    if not entities:
        return x
    # Longest first so that a multi-word entity wins over one of its parts.
    ordered = sorted(entities, key=lambda entity: (-len(entity), entity))
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(entity) for entity in ordered) + r")(?!\w)"
    )
    return pattern.sub(lambda match: f"<{tag}> {match.group()} </{tag}>", x)


def parse_toppings(x,y):
    """Wrap topping entities of ``y`` found in ``x`` with ``<T> ... </T>``."""
    return _tag_entities(x, y, toppings_regex, "T")


def parse_number(x,y):
    """Wrap number entities of ``y`` found in ``x`` with ``<N> ... </N>``."""
    return _tag_entities(x, y, number_regex, "N")


def parse_size(x,y):
    """Wrap size entities of ``y`` found in ``x`` with ``<S> ... </S>``."""
    return _tag_entities(x, y, size_regex, "S")


def parse_quantity(x,y):
    """Wrap quantity entities of ``y`` found in ``x`` with ``<Q> ... </Q>``."""
    return _tag_entities(x, y, quantity_regex, "Q")


def parse_style(x,y):
    """Wrap style entities of ``y`` found in ``x`` with ``<ST> ... </ST>``."""
    return _tag_entities(x, y, style_regex, "ST")


def parse_drink_type(x,y):
    """Wrap drink-type entities of ``y`` found in ``x`` with ``<DT> ... </DT>``."""
    return _tag_entities(x, y, drink_type_regex, "DT")


def parse_volume(x,y):
    """Wrap volume entities of ``y`` found in ``x`` with ``<V> ... </V>``."""
    return _tag_entities(x, y, volume_regex, "V")


def parse_container_type(x,y):
    """Wrap container-type entities of ``y`` found in ``x`` with ``<CT> ... </CT>``."""
    return _tag_entities(x, y, container_type_regex, "CT")


def parse_all(x,y):
    """Apply every entity tagger to ``x`` in a fixed order.

    Args:
        x: The preprocessed input text.
        y: The parse string the entities are extracted from.

    Returns:
        ``x`` annotated with all entity tags.
    """
    taggers = (
        parse_toppings,
        parse_number,
        parse_size,
        parse_quantity,
        parse_style,
        parse_drink_type,
        parse_volume,
        parse_container_type,
    )
    for tagger in taggers:
        x = tagger(x, y)
    return x


In [59]:
# NOTE(review): going by the execution counts, this cell ran after the cell
# below that assigns `x`; on a top-to-bottom run `x` is not assigned yet here.
x

'4 pizzas balsamic glaze and 5 pies little bit bbq pulled pork'

In [58]:
# Spot-check the tagging on a single example.
# NOTE(review): 919073 is a hard-coded index label; it has to be present in
# the current random sample `df_train`, otherwise the lookup fails.
x = df_train['train.IN'].loc[919073]
y = df_train['train.TOP'].loc[919073]

parse_all(x,y)

'<N> 4 </N> pizzas <T> balsamic glaze </T> and <N> 5 </N> pies little bit <T> bbq pulled pork </T>'

In [None]:
#  for sublist in df_train['train.TOP'].apply(lambda x: size_regex.findall(x))


In [1]:
import keras

In [2]:
import tensorflow as tf

In [3]:
# Report the TensorFlow and Keras versions, one per line.
print(tf.__version__, keras.__version__, sep="\n")

2.16.1
3.7.0


In [4]:
# Load the saved model from disk.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact location exists.
model = keras.models.load_model("E:/Collage/NLP/Project/pointer_generator.keras")

In [5]:
# Display the summary of the loaded model.
model.summary()