In [2]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
from transformers import BertTokenizer, BertModel
import torch

### Drinks: 1
### Appetizers: 2
### Salads: 3
### Soups: 4
### Main: 5
### Desserts: 6

In [4]:
# Base food list: drop rows with any missing value, renumber the index,
# and store the category label 'Type' as int32.
food_list_df = (
    pd.read_csv('food_list_backup.csv')
    .dropna()
    .reset_index(drop=True)
    .astype({'Type': 'int32'})
)

# Extra examples use lower-case headers; rename them to match the base list.
extra_column_names = {"text": "Text", "category": "Type"}
food_list_extra_df = pd.read_csv('food_list_extra.csv').rename(columns=extra_column_names)

# Stack both sources row-wise under a fresh 0..n-1 index and save the result.
food_list_tot_df = pd.concat(
    [food_list_df, food_list_extra_df],
    axis=0,
).reset_index(drop=True)
food_list_tot_df.to_csv('food_list_tot.csv')

In [7]:
# Preview only the first rows: display.max_rows is set to None in the import
# cell, so a bare frame would render every row of the table.
food_list_tot_df.head(10)

Unnamed: 0,Text,Type
0,Soft Tofu Soup,4
1,Noodle Soups,4
2,Rice Noodle Soup,4
3,Bamee Wonton,4
4,Mussels di Napeli,5
5,Captese Flatbread,5
6,Grilled Chiken Flatbread,5
7,Bruschetta,5
8,Ultimate Feast,5
9,Mixed Grill,5


## BERT Conversion

In [8]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
# Both are used as module-level globals by texts_to_vectors below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def texts_to_vectors(texts):
    """Embed each text as the attention-masked mean of BERT's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``. Texts are encoded one at
    a time, and progress is printed every 400 items.

    Args:
        texts: Sized sequence (e.g. a list) of strings to embed.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), hidden_size)``. Row ``i`` always
        corresponds to ``texts[i]``. A text that fails to encode is reported on
        stdout and gets a row of NaN, so the output stays aligned with the
        input. An empty input yields an array of shape ``(0, hidden_size)``.
    """
    print("Starting texts_to_vectors function...")

    hidden_size = model.config.hidden_size
    total = len(texts)
    vectors = []
    for cnt, text in enumerate(texts):
        try:
            inputs = tokenizer([text], return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state
            # Cast the integer attention mask to the embedding dtype so the
            # multiply, sum and clamp below all run in floating point.
            mask = inputs.attention_mask.unsqueeze(-1).to(embeddings.dtype)
            summed = torch.sum(embeddings * mask, 1)
            # Clamp guards against division by zero if no token is attended.
            token_counts = torch.clamp(mask.sum(1), min=1e-9)
            mean_pooled = summed / token_counts
            vectors.append(mean_pooled[0].numpy())
        except Exception as e:
            print(f"Error encountered while processing text {cnt}: {e}")
            # Keep a placeholder row instead of skipping the text: callers
            # concatenate the result with the source frame by position, so a
            # dropped row would shift every later vector onto the wrong text.
            vectors.append(np.full(hidden_size, np.nan, dtype=np.float32))
        # Print progress update every 400 samples
        if cnt % 400 == 0:
            percentage = (cnt / total) * 100
            print(f"Processing text {cnt} of {total} ({percentage:.2f}% complete)")

    print("Finished converting texts to vectors.")
    if not vectors:
        # Keep the result 2-D so callers can still read vectors.shape[1].
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.array(vectors)

# Embed every food name, then attach the embedding columns to the labelled frame.
vectors = texts_to_vectors(food_list_tot_df['Text'].tolist())
# The concat below pairs rows by position, so a missing vector would silently
# attach embeddings to the wrong text. Fail loudly instead.
assert len(vectors) == len(food_list_tot_df), (
    f"expected {len(food_list_tot_df)} vectors, got {len(vectors)}"
)
df_vectors = pd.DataFrame(vectors, columns=[f'vector_{i}' for i in range(vectors.shape[1])])
df_bert_test = pd.concat([food_list_tot_df, df_vectors], axis=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Starting texts_to_vectors function...
Processing text 0 of 737 (0.00% complete)
Processing text 400 of 737 (54.27% complete)
Finished converting texts to vectors.


In [12]:
# (rows, columns) of the combined food list.
food_list_tot_df.shape

(737, 2)

In [11]:
# Add one 0/1 indicator column per keyword: 1 when 'Text' contains the keyword
# as a case-insensitive substring, 0 otherwise.
# NOTE(review): 'lattte' and 'esspresso' look like misspellings of 'latte' and
# 'espresso', so they will not match those words. They are left unchanged here
# because the keyword strings double as feature column names for the trained
# model - confirm before renaming.
words = ['salad','soup','chowder', 'appetizer', 'fries','strip','bowl', 'chips', 'steak', 'platter', 'pudding',
         'chocolate','malt', 'shake','cream','creme','vanilla','brownie', 'pie', 'rings', 'wrap',
         'juice', 'coffee', 'milk', 'tea', 'bites','drink','orange','water','burger','meat','nacho','sandwich',
         'patty','tater','burrito','skillet','lattte','esspresso', 'cafe','sausage', 'ice cream','beer','wine']

for word in words:
    # na=False: a missing Text counts as "no match" rather than producing NaN,
    # which astype(int) cannot convert.
    df_bert_test[word] = df_bert_test['Text'].str.contains(word, case=False, na=False).astype(int)

# Show the keyword flags next to the text and label. Preview only, because
# display.max_rows is None and the full frame would render every row.
words2 = words + ['Text','Type']
df_bert_test[words2].head(10)

Unnamed: 0,salad,soup,chowder,appetizer,fries,strip,bowl,chips,steak,platter,pudding,chocolate,malt,shake,cream,creme,vanilla,brownie,pie,rings,wrap,juice,coffee,milk,tea,bites,drink,orange,water,burger,meat,nacho,sandwich,patty,tater,burrito,skillet,lattte,esspresso,cafe,sausage,ice cream,beer,wine,Text,Type
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Soft Tofu Soup,4
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Noodle Soups,4
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Rice Noodle Soup,4
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Bamee Wonton,4
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Mussels di Napeli,5
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Captese Flatbread,5
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Grilled Chiken Flatbread,5
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Bruschetta,5
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Ultimate Feast,5
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Mixed Grill,5


## Train Test Split & XGBoost

In [13]:
# Features: every column except the raw text and the label.
X = df_bert_test.drop(['Text','Type'],axis = 1)
# Labels as a 1-D Series rather than a one-column DataFrame, which is the
# shape scikit-learn estimators expect for y.
# NOTE(review): the labels are 1-6; some xgboost releases require class labels
# starting at 0 - confirm against the installed version.
y = df_bert_test['Type']

# Hold out 30% of the rows for evaluation; fixed seed for a repeatable split.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=0)

xgb = XGBClassifier(n_estimators= 150,learning_rate = 0.3,random_state=0)
xgb.fit(X_train, y_train)
# Mean accuracy on the held-out rows.
score = xgb.score(X_test, y_test)
y_hat_tree = xgb.predict(X_test)

print('XGBoost Score: ', score)

XGBoost Score:  0.7297297297297297


In [14]:
# Persist the fitted XGBoost model. The destination defaults to the original
# location, but can be redirected with the MENU_APP_MODEL_PATH environment
# variable so the notebook also runs on machines without that directory.
model_path = os.environ.get(
    'MENU_APP_MODEL_PATH',
    '/Users/alex/Data_Science/Menu_App/Web_App/xgb2.pkl',
)
with open(model_path, 'wb') as f:
    pickle.dump(xgb, f)

In [24]:
# Hyper-parameter search for XGBoost. The search uses its own estimator name so
# the fitted `xgb` model from the training cell is not replaced by an unfitted
# one (a later cell calls xgb.predict).
xgb_search_base = XGBClassifier(random_state=0)

# 4 x 4 x 4 = 64 parameter combinations, each evaluated with 3-fold CV.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [2, 4, 6, 8],
    'learning_rate': [0.01,0.05, 0.1, 0.2]
}

grid_search = GridSearchCV(xgb_search_base, param_grid, cv=3, scoring='accuracy')

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Report the best parameters and their mean cross-validated accuracy
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


KeyboardInterrupt: 

In [88]:
# Random-forest baseline on the same train/test split.
rfc = RandomForestClassifier(bootstrap=True, n_estimators=600,random_state=0)
# np.ravel flattens the labels to 1-D, the shape scikit-learn expects for y;
# it works whether y_train is a one-column DataFrame or a Series.
rfc.fit(X_train, np.ravel(y_train))
# Mean accuracy on the held-out rows.
score = rfc.score(X_test, y_test)
y_hat_tree = rfc.predict(X_test)

print('Score: ', score)

  rfc.fit(X_train, y_train)


Score:  0.6024844720496895


In [10]:
# Hand-collected menu item names used to spot-check the classifier.
items = [
    'COFFEE',
    'CAFE LATTE',
    'BLOOD ORANGE',
    'MIMOSA',
    'ESPRESSO',
    'MOCHA',
    'BUTTER FRENCH TOAST',
    'BRAISED OXTAIL TAQUITOS HUEVOS',
    'PORCHETTA EGGS BENEDICT',
    'BREAKFAST SANDWICH',
    'EGG WHITE OMELETTE',
    'RIBEYE',
    'HASH SKILLET',
    'HAND ROLLED CINNAMON ROLL',
    'BAGEL',
    'BREAKFAST POTATOES',
    'APPLEWOOD SMOKED BACON',
    'FRUIT',
    'BUTCHER LARDER',
    'YOGURT PARFAIT',
    'SAUSAGE',
]
items4 = [
    'STARTER',
    'GOLDEN BEETS',
    'CHICKEN FRIED CHICKEN',
    'MUSHROOM ENCHILADAS',
    'CAJETA ICE CREAM',
]
items2 = [
    'Greek Salad S',
    'Aubrees Wings SO',
    'Caesar Salad',
    'Bacon Brussels Sprouts SO',
    'Garden Salad S',
    'AUBREES SIGNATURE PIZZAS',
    'BBQ Chicken',
    'The Ultimate Feta Bread',
    'Mac  Cheese',
    'Cheese Bread S',
    'Pasta Gone Bayou',
    'Aubrees Bread',
    'MARGHERITA',
    'Spaghetti  Meatball',
    'Lasagna Rolls',
    'Aubrees Chili quart SOGF',
    'Tomato Bisque quart',
]

In [30]:
# Classify unseen menu items with the fitted model. 'Type' starts as a 0
# placeholder and is overwritten with the predictions below.
test = pd.DataFrame({'Text':items2, 'Type': np.zeros(len(items2),dtype = int).tolist()})

vectors = texts_to_vectors(test['Text'].tolist())
# The concat below pairs rows by position, so a missing vector would silently
# attach embeddings to the wrong text. Fail loudly instead.
assert len(vectors) == len(test), f"expected {len(test)} vectors, got {len(vectors)}"
df_vectors = pd.DataFrame(vectors, columns=[f'vector_{i}' for i in range(vectors.shape[1])])
df_bert_test2 = pd.concat([test, df_vectors], axis=1)

# Same keyword indicator columns, in the same order, as the training frame.
for word in words:
    # na=False: a missing Text counts as "no match" rather than producing NaN,
    # which astype(int) cannot convert.
    df_bert_test2[word] = test['Text'].str.contains(word, case=False, na=False).astype(int)

# Predict from the feature columns only, then show each item with its label.
df_predict = df_bert_test2.drop(['Text','Type'],axis = 1)
Type = xgb.predict(df_predict)
df_bert_test2['Type'] = Type
df_bert_test2[['Text','Type']]

Starting texts_to_vectors function...
Processing text 0 of 17 (0.00% complete)
Finished converting texts to vectors.


Unnamed: 0,Text,Type
0,Greek Salad S,3
1,Aubrees Wings SO,5
2,Caesar Salad,3
3,Bacon Brussels Sprouts SO,5
4,Garden Salad S,3
5,AUBREES SIGNATURE PIZZAS,5
6,BBQ Chicken,5
7,The Ultimate Feta Bread,5
8,Mac Cheese,5
9,Cheese Bread S,5


In [242]:
# Columns of the training frame that are absent from the prediction features.
temp3 = [column for column in df_bert_test if column not in df_predict]

print(temp3)

['Text', 'Type']
