In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from enum import Enum

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier



pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
def expnad_abb2(text):

    pattern = re.compile(
        r"(" + "|".join(re.escape(key) for key in CONTRACTIONS.keys()) + r")"
    )
    expanded_text = pattern.sub(lambda x: " " + CONTRACTIONS[x.group()], text)
    return expanded_text


In [3]:
text = '(ORDER (PIZZAORDER (SIZE party size ) pie with (TOPPING american cheese ) and (COMPLEX_TOPPING (QUANTITY a little bit of ) (TOPPING peppperoni ) ) ) )'
def get_none_match(text):
    order_regex = re.compile(r'(?<=ORDER\s)[^(]*(?=\s\()')
    pizzaorder_regex = re.compile(r'(?<=PIZZAORDER\s)[^(]*(?=\s\()')
    drinkorder_regex = re.compile(r'(?<=DRINKORDER\s)[^(]*(?=\s\()')
    between_parentheses_regex = re.compile(r'(?<=\)\s)[^()]+(?=\s\()')
    uncleaned_none_match = re.findall(order_regex, text)
    uncleaned_none_match.extend(re.findall(pizzaorder_regex, text))
    uncleaned_none_match.extend(re.findall(drinkorder_regex, text))
    uncleaned_none_match.extend(re.findall(between_parentheses_regex, text))
    none_match=[]
    for sentence in uncleaned_none_match:
        sen = expnad_abb2(sentence).upper()
        none_match.extend(sen.split())
    return none_match
print(get_none_match(text))

['PIE', 'WITH', 'AND']


In [9]:
dataset_dir = "E:/Collage/NLP/Project/NLP-Project/dataset/"

class Label(Enum):
    TOPPING = 0
    NUMBER = 1
    SIZE = 2
    QUANTITY = 3
    STYLE = 4
    DRINKTYPE = 5
    CONTAINERTYPE = 6
    NONE = 7

In [10]:
df_train = pd.read_json(dataset_dir + "PIZZA_train.json", lines=True)
df_dev = pd.read_json(dataset_dir + "PIZZA_dev.json", lines=True)

In [11]:
train_top = df_train["train.TOP"]
train_exr = df_train["train.EXR"]

dev_top = df_dev["dev.TOP"]
dev_exr = df_dev["dev.EXR"]

In [12]:
#Why dict? 1- To keep the order of the data 2- To ensure unique values
train_data = {}
dev_data = {}

In [13]:
toppings_regex = re.compile(r'(?<=\(TOPPING\s)[^)]*(?=\s)')
number_regex = re.compile(r'(?<=\(NUMBER\s)[^)]*(?=\s)')
size_regex = re.compile(r'(?<=\(SIZE\s)[^)]*(?=\s)')
quantity_regex = re.compile(r'(?<=\(QUANTITY\s)[^)]*(?=\s)')
style_regex = re.compile(r'(?<=\(STYLE\s)[^)]*(?=\s)')
drink_type_regex = re.compile(r'(?<=\(DRINKTYPE\s)[^)]*(?=\s)')
container_type_regex = re.compile(r'(?<=\(CONTAINERTYPE\s)[^)]*(?=\s)')   

In [14]:
def apply_regex(regex, text):
    match = re.findall(regex, text)
    return match

In [15]:
for i in range(len(train_top)):
    l = get_none_match( train_top.loc[i])
    train_data.update({e : Label.NONE.value for e in l})
    
for i in range(len(dev_top)):
    l = get_none_match( dev_top.loc[i])
    dev_data.update({e : Label.NONE.value for e in l})


In [16]:
for i in range(len(train_top)):
    l = apply_regex(toppings_regex, train_top.loc[i])
    train_data.update({e.upper() : Label.TOPPING.value for e in l})
    l = apply_regex(toppings_regex, train_exr.loc[i])
    train_data.update({e.upper() : Label.TOPPING.value for e in l})
    
for i in range(len(dev_top)):
    l = apply_regex(toppings_regex, dev_top.loc[i])
    dev_data.update({e.upper() : Label.TOPPING.value for e in l})
    l = apply_regex(toppings_regex, dev_exr.loc[i])
    dev_data.update({e.upper() : Label.TOPPING.value for e in l})

In [17]:
for i in range(len(train_top)):
    l = apply_regex(number_regex, train_top.loc[i])
    train_data.update({e.upper() : Label.NUMBER.value for e in l})
    l = apply_regex(number_regex, train_exr.loc[i])
    train_data.update({e.upper() : Label.NUMBER.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(number_regex, dev_top.loc[i])
    dev_data.update({e.upper() : Label.NUMBER.value for e in l})
    l = apply_regex(number_regex, dev_exr.loc[i])
    dev_data.update({e.upper() : Label.NUMBER.value for e in l})


In [18]:
for i in range(len(train_top)):
    l = apply_regex(quantity_regex, train_top.loc[i])
    train_data.update({e.upper() : Label.QUANTITY.value for e in l})
    l = apply_regex(quantity_regex, train_exr.loc[i])
    train_data.update({e.upper() : Label.QUANTITY.value for e in l})


for i in range(len(dev_top)):
    l = apply_regex(quantity_regex, dev_top.loc[i])
    dev_data.update({e.upper() : Label.QUANTITY.value for e in l})
    l = apply_regex(quantity_regex, dev_exr.loc[i])
    dev_data.update({e.upper() : Label.QUANTITY.value for e in l})


In [19]:
for i in range(len(train_top)):
    l = apply_regex(size_regex, train_top.loc[i])
    train_data.update({e.upper() : Label.SIZE.value for e in l})
    l = apply_regex(size_regex, train_exr.loc[i])
    train_data.update({e.upper() : Label.SIZE.value for e in l})


for i in range(len(dev_top)):
    l = apply_regex(size_regex, dev_top.loc[i])
    dev_data.update({e.upper() : Label.SIZE.value for e in l})
    l = apply_regex(size_regex, dev_exr.loc[i])
    dev_data.update({e.upper() : Label.SIZE.value for e in l})

In [20]:
for i in range(len(train_top)):
    l = apply_regex(drink_type_regex, train_top.loc[i])
    train_data.update({e.upper() : Label.DRINKTYPE.value for e in l})
    l = apply_regex(drink_type_regex, train_exr.loc[i])
    train_data.update({e.upper() : Label.DRINKTYPE.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(drink_type_regex, dev_top.loc[i])
    dev_data.update({e.upper() : Label.DRINKTYPE.value for e in l})
    l = apply_regex(drink_type_regex, dev_exr.loc[i])
    dev_data.update({e.upper() : Label.DRINKTYPE.value for e in l})

In [21]:
for i in range(len(train_top)):
    l = apply_regex(style_regex, train_top.loc[i])
    train_data.update({e.upper() : Label.STYLE.value for e in l})
    l = apply_regex(style_regex, train_exr.loc[i])
    train_data.update({e.upper() : Label.STYLE.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(style_regex, dev_top.loc[i])
    dev_data.update({e.upper() : Label.STYLE.value for e in l})
    l = apply_regex(style_regex, dev_exr.loc[i])
    dev_data.update({e.upper() : Label.STYLE.value for e in l})

In [22]:
for i in range(len(train_top)):
    l = apply_regex(container_type_regex, train_top.loc[i])
    train_data.update({e.upper() : Label.CONTAINERTYPE.value for e in l})
    l = apply_regex(container_type_regex, train_exr.loc[i])
    train_data.update({e.upper() : Label.CONTAINERTYPE.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(container_type_regex, dev_top.loc[i])
    dev_data.update({e.upper() : Label.CONTAINERTYPE.value for e in l})
    l = apply_regex(container_type_regex, dev_exr.loc[i])
    dev_data.update({e.upper() : Label.CONTAINERTYPE.value for e in l})

In [23]:
#Actually it's small :))

len(train_data), len(dev_data)

(453, 265)

In [24]:
train_data_df = pd.DataFrame(train_data.items(), columns=["word", "label"])
dev_data_df = pd.DataFrame(dev_data.items(), columns=["word", "label"])

X_train, X_test, y_train, y_test = train_data_df["word"], dev_data_df["word"], train_data_df["label"], dev_data_df["label"]


In [25]:
import math
from collections import Counter
from nltk.util import bigrams
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

def pmi_find(corpus):
    # Step 2: Calculate Frequencies
    word_freq = Counter(corpus)
    bigram_freq = Counter(bigrams(corpus))

    # Step 3: Calculate Probabilities
    total_words = len(corpus)
    p_word = {word: freq / total_words for word, freq in word_freq.items()}

    total_bigrams = total_words - 1
    p_bigram = {bigram: (freq+1) / (word_freq[bigram[0]] * word_freq[bigram[1]]+len(train_data.values())) for bigram, freq in bigram_freq.items()}

    # Step 4: Calculate PMI for all Bigrams
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    all_pmi_scores = {bigram: bigram_finder.score_ngram(BigramAssocMeasures.pmi, bigram[0], bigram[1]) for bigram in p_bigram}
    return all_pmi_scores


In [26]:
new_train_data = [e for e in train_data.keys()]

pmi = pmi_find(new_train_data)
print(pmi)

{('CAN', 'I'): 8.823367240046235, ('I', 'HAVE'): 8.823367240046235, ('HAVE', 'PIE'): 8.823367240046235, ('PIE', 'WITH'): 8.823367240046235, ('WITH', 'AND'): 8.823367240046235, ('AND', 'WOULD'): 8.823367240046235, ('WOULD', 'LIKE'): 8.823367240046235, ('LIKE', 'WANT'): 8.823367240046235, ('WANT', 'PIZZA'): 8.823367240046235, ('PIZZA', 'WITHOUT'): 8.823367240046235, ('WITHOUT', 'ANY'): 8.823367240046235, ('ANY', 'NEED'): 8.823367240046235, ('NEED', 'NO'): 8.823367240046235, ('NO', 'PIZZAS'): 8.823367240046235, ('PIZZAS', 'AVOID'): 8.823367240046235, ('AVOID', 'HOLD'): 8.823367240046235, ('HOLD', 'THE'): 8.823367240046235, ('THE', 'PIES'): 8.823367240046235, ('PIES', 'HATE'): 8.823367240046235, ('HATE', 'OF'): 8.823367240046235, ('OF', 'ALSO'): 8.823367240046235, ('ALSO', 'BBQ PULLED PORK'): 8.823367240046235, ('BBQ PULLED PORK', 'BBQ_PULLED_PORK'): 8.823367240046235, ('BBQ_PULLED_PORK', 'GREEN PEPPER'): 8.823367240046235, ('GREEN PEPPER', 'PEPERONNI'): 8.823367240046235, ('PEPERONNI', 'G

In [27]:
train_data_df = pd.DataFrame(train_data.items(), columns=["word", "label"])
dev_data_df = pd.DataFrame(dev_data.items(), columns=["word", "label"])

X_train, X_test, y_train, y_test = train_data_df["word"], dev_data_df["word"], train_data_df["label"], dev_data_df["label"]

In [None]:
pmi_train = pmi_find()

In [28]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [29]:
# We need to balance the classes as with the current dataset the model will be biased towards the majority class --> TOPPING
# When I tried to train the model without balancing the classes, the model was predicting only TOPPING :)
 
classifier_lr = LogisticRegression(class_weight='balanced', random_state=42)

classifier_lr.fit(X_train_tfidf, y_train)
predictions = classifier_lr.predict(X_test_tfidf)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.38      0.98      0.55        63
           1       0.92      0.55      0.69        20
           2       0.80      0.89      0.84         9
           3       0.50      0.60      0.55         5
           4       0.78      1.00      0.88        18
           5       0.97      0.97      0.97        30
           6       0.75      1.00      0.86         3
           7       1.00      0.15      0.27       117

    accuracy                           0.57       265
   macro avg       0.76      0.77      0.70       265
weighted avg       0.81      0.57      0.52       265



In [37]:
classifier_svm = SVC(class_weight='balanced', random_state=42)
classifier_svm.fit(X_train_tfidf, y_train)
predictions = classifier_svm.predict(X_test_tfidf)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.95      0.98      0.97        63
           1       0.18      1.00      0.30        20
           2       0.80      0.89      0.84         9
           3       0.50      0.40      0.44         5
           4       0.86      1.00      0.92        18
           5       1.00      0.97      0.98        30
           6       0.75      1.00      0.86         3
           7       0.95      0.16      0.28       117

    accuracy                           0.61       265
   macro avg       0.75      0.80      0.70       265
weighted avg       0.88      0.61      0.60       265



In [38]:
classifier_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300,random_state=42)
classifier_mlp.fit(X_train_tfidf, y_train)
predictions = classifier_mlp.predict(X_test_tfidf)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       1.00      0.98      0.99        63
           1       0.18      1.00      0.30        20
           2       0.80      0.89      0.84         9
           3       0.50      0.40      0.44         5
           4       0.78      1.00      0.88        18
           5       1.00      0.97      0.98        30
           6       0.75      1.00      0.86         3
           7       0.95      0.16      0.28       117

    accuracy                           0.61       265
   macro avg       0.74      0.80      0.70       265
weighted avg       0.88      0.61      0.60       265



In [39]:
classifier_rf = RandomForestClassifier(class_weight='balanced',random_state=42)
classifier_rf.fit(X_train_tfidf, y_train)
predictions = classifier_rf.predict(X_test_tfidf)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        63
           1       0.17      1.00      0.29        20
           2       0.80      0.89      0.84         9
           3       1.00      0.40      0.57         5
           4       0.90      1.00      0.95        18
           5       1.00      0.97      0.98        30
           6       0.75      1.00      0.86         3
           7       0.95      0.16      0.28       117

    accuracy                           0.61       265
   macro avg       0.82      0.80      0.72       265
weighted avg       0.90      0.61      0.60       265



    TOPPING = 0
    NUMBER = 1
    SIZE = 2
    QUANTITY = 3
    STYLE = 4
    DRINKTYPE = 5
    CONTAINERTYPE = 6
    NONE = 7

In [None]:
text = 'want to eat one party size balsamic glaze'.upper()
text  =  text.split()
vectorizer.transform(text)
print(classifier_svm.predict( vectorizer.transform(text)))

[7 1 1 1 2 2 0]
