In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from enum import Enum

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier



pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
dataset_dir = "dataset/"

class Label(Enum):
    TOPPING = 0
    NUMBER = 1
    SIZE = 2
    QUANTITY = 3
    STYLE = 4
    DRINKTYPE = 5
    CONTAINERTYPE = 6

In [3]:
df_train = pd.read_json(dataset_dir + "PIZZA_train.json", lines=True)
df_dev = pd.read_json(dataset_dir + "PIZZA_dev.json", lines=True)

In [4]:
train_top = df_train["train.TOP"]
train_exr = df_train["train.EXR"]

dev_top = df_dev["dev.TOP"]
dev_exr = df_dev["dev.EXR"]

In [None]:
#Why dict? 1- To keep the order of the data 2- To ensure unique values
train_data = {}
dev_data = {}

In [6]:
toppings_regex = re.compile(r'(?<=\(TOPPING\s)[^)]*(?=\s)')
number_regex = re.compile(r'(?<=\(NUMBER\s)[^)]*(?=\s)')
size_regex = re.compile(r'(?<=\(SIZE\s)[^)]*(?=\s)')
quantity_regex = re.compile(r'(?<=\(QUANTITY\s)[^)]*(?=\s)')
style_regex = re.compile(r'(?<=\(STYLE\s)[^)]*(?=\s)')
drink_type_regex = re.compile(r'(?<=\(DRINKTYPE\s)[^)]*(?=\s)')
container_type_regex = re.compile(r'(?<=\(CONTAINERTYPE\s)[^)]*(?=\s)')   

In [7]:
def apply_regex(regex, text):
    match = re.findall(regex, text)
    return match

In [8]:
for i in range(len(train_top)):
    l = apply_regex(toppings_regex, train_top.loc[i])
    train_data.update({e : Label.TOPPING.value for e in l})
    l = apply_regex(toppings_regex, train_exr.loc[i])
    train_data.update({e : Label.TOPPING.value for e in l})
    
for i in range(len(dev_top)):
    l = apply_regex(toppings_regex, dev_top.loc[i])
    dev_data.update({e : Label.TOPPING.value for e in l})
    l = apply_regex(toppings_regex, dev_exr.loc[i])
    dev_data.update({e : Label.TOPPING.value for e in l})

In [9]:
for i in range(len(train_top)):
    l = apply_regex(number_regex, train_top.loc[i])
    train_data.update({e : Label.NUMBER.value for e in l})
    l = apply_regex(number_regex, train_exr.loc[i])
    train_data.update({e : Label.NUMBER.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(number_regex, dev_top.loc[i])
    dev_data.update({e : Label.NUMBER.value for e in l})
    l = apply_regex(number_regex, dev_exr.loc[i])
    dev_data.update({e : Label.NUMBER.value for e in l})


In [10]:
for i in range(len(train_top)):
    l = apply_regex(quantity_regex, train_top.loc[i])
    train_data.update({e : Label.QUANTITY.value for e in l})
    l = apply_regex(quantity_regex, train_exr.loc[i])
    train_data.update({e : Label.QUANTITY.value for e in l})


for i in range(len(dev_top)):
    l = apply_regex(quantity_regex, dev_top.loc[i])
    dev_data.update({e : Label.QUANTITY.value for e in l})
    l = apply_regex(quantity_regex, dev_exr.loc[i])
    dev_data.update({e : Label.QUANTITY.value for e in l})


In [11]:
for i in range(len(train_top)):
    l = apply_regex(size_regex, train_top.loc[i])
    train_data.update({e : Label.SIZE.value for e in l})
    l = apply_regex(size_regex, train_exr.loc[i])
    train_data.update({e : Label.SIZE.value for e in l})


for i in range(len(dev_top)):
    l = apply_regex(size_regex, dev_top.loc[i])
    dev_data.update({e : Label.SIZE.value for e in l})
    l = apply_regex(size_regex, dev_exr.loc[i])
    dev_data.update({e : Label.SIZE.value for e in l})

In [12]:
for i in range(len(train_top)):
    l = apply_regex(drink_type_regex, train_top.loc[i])
    train_data.update({e : Label.DRINKTYPE.value for e in l})
    l = apply_regex(drink_type_regex, train_exr.loc[i])
    train_data.update({e : Label.DRINKTYPE.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(drink_type_regex, dev_top.loc[i])
    dev_data.update({e : Label.DRINKTYPE.value for e in l})
    l = apply_regex(drink_type_regex, dev_exr.loc[i])
    dev_data.update({e : Label.DRINKTYPE.value for e in l})

In [13]:
for i in range(len(train_top)):
    l = apply_regex(style_regex, train_top.loc[i])
    train_data.update({e : Label.STYLE.value for e in l})
    l = apply_regex(style_regex, train_exr.loc[i])
    train_data.update({e : Label.STYLE.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(style_regex, dev_top.loc[i])
    dev_data.update({e : Label.STYLE.value for e in l})
    l = apply_regex(style_regex, dev_exr.loc[i])
    dev_data.update({e : Label.STYLE.value for e in l})

In [14]:
for i in range(len(train_top)):
    l = apply_regex(container_type_regex, train_top.loc[i])
    train_data.update({e : Label.CONTAINERTYPE.value for e in l})
    l = apply_regex(container_type_regex, train_exr.loc[i])
    train_data.update({e : Label.CONTAINERTYPE.value for e in l})

for i in range(len(dev_top)):
    l = apply_regex(container_type_regex, dev_top.loc[i])
    dev_data.update({e : Label.CONTAINERTYPE.value for e in l})
    l = apply_regex(container_type_regex, dev_exr.loc[i])
    dev_data.update({e : Label.CONTAINERTYPE.value for e in l})

In [None]:
#Actually it's small :))

len(train_data), len(dev_data)

(489, 188)

In [16]:
train_data_df = pd.DataFrame(train_data.items(), columns=["word", "label"])
dev_data_df = pd.DataFrame(dev_data.items(), columns=["word", "label"])

X_train, X_test, y_train, y_test = train_data_df["word"], dev_data_df["word"], train_data_df["label"], dev_data_df["label"]


In [None]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [26]:
# We need to balance the classes as with the current dataset the model will be biased towards the majority class --> TOPPING
# When I tried to train the model without balancing the classes, the model was predicting only TOPPING :)
 
classifier_lr = LogisticRegression(class_weight='balanced', random_state=42)

classifier_lr.fit(X_train_tfidf, y_train)
predictions = classifier_lr.predict(X_test_tfidf)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.88      1.00      0.94        89
           1       1.00      0.55      0.71        20
           2       1.00      0.92      0.96        12
           3       1.00      0.67      0.80         6
           4       0.96      1.00      0.98        23
           5       1.00      0.97      0.99        34
           6       1.00      1.00      1.00         4

    accuracy                           0.93       188
   macro avg       0.98      0.87      0.91       188
weighted avg       0.94      0.93      0.92       188



In [37]:
classifier_svm = SVC(class_weight='balanced', random_state=42)
classifier_svm.fit(X_train_tfidf, y_train)
predictions = classifier_svm.predict(X_test_tfidf)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99        89
           1       0.83      1.00      0.91        20
           2       1.00      0.92      0.96        12
           3       1.00      0.67      0.80         6
           4       0.96      1.00      0.98        23
           5       1.00      0.97      0.99        34
           6       1.00      1.00      1.00         4

    accuracy                           0.97       188
   macro avg       0.97      0.93      0.95       188
weighted avg       0.98      0.97      0.97       188



In [38]:
classifier_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300,random_state=42)
classifier_mlp.fit(X_train_tfidf, y_train)
predictions = classifier_mlp.predict(X_test_tfidf)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99        89
           1       0.83      1.00      0.91        20
           2       1.00      0.92      0.96        12
           3       1.00      0.67      0.80         6
           4       0.96      1.00      0.98        23
           5       1.00      0.97      0.99        34
           6       1.00      1.00      1.00         4

    accuracy                           0.97       188
   macro avg       0.97      0.93      0.95       188
weighted avg       0.98      0.97      0.97       188



In [41]:
classifier_rf = RandomForestClassifier(class_weight='balanced',random_state=42)
classifier_rf.fit(X_train_tfidf, y_train)
predictions = classifier_rf.predict(X_test_tfidf)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        89
           1       0.83      1.00      0.91        20
           2       1.00      0.92      0.96        12
           3       1.00      0.67      0.80         6
           4       0.96      1.00      0.98        23
           5       1.00      0.97      0.99        34
           6       1.00      1.00      1.00         4

    accuracy                           0.97       188
   macro avg       0.97      0.93      0.95       188
weighted avg       0.98      0.97      0.97       188



In [None]:
#NEXT STEPS ?? 
# Relationships model (What will be the input of it ?)

# Use BOW instead of TFIDF in classification model? 
# Data Augmentation (I guess it's important as the dataset is small)