In [23]:
#import necessary packages
import pandas as pd
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec

# Read your CSV file into a pandas DataFrame
df = pd.read_csv('Arriba Mexican Grill.csv')

In [24]:
# Download NLTK resources for preprocessing
nltk.download('punkt')
nltk.download('stopwords')

# Define a tokenizer
tokenizer = WordPunctTokenizer()

# Define a function to clean and tokenize text
def preprocess_text(text):
    tokens = tokenizer.tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word.lower() for word in tokens]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Clean and tokenize the review text
df['tokenized_review'] = df['review'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Build a Word2Vec model
w2v_model = Word2Vec(df['tokenized_review'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Find word similarities to quesadilla
print(w2v_model.wv.most_similar('quesadilla', topn=5))

[('ask', 0.9771250486373901), ('always', 0.9764689207077026), ('meal', 0.9759775996208191), ('big', 0.975333034992218), ('arriba', 0.9752405881881714)]


Given the lack of context "'ask','always', and 'arriba'" don't have a clear connection. However "big" and "meal" are informative.  This tells me that despite being listed as an appetizer on the menu, the quesadilla is big and can serve as a meal.

In [26]:
# Find word similarities to taco
print(w2v_model.wv.most_similar('taco', topn=5))

[('food', 0.9956725835800171), ('would', 0.9956137537956238), ('like', 0.9954841136932373), ('got', 0.9953912496566772), ('us', 0.9953407049179077)]


Considering Taco is a food that makes sense. However, "'would','like','got','us'" don't have a clear connection

In [27]:
# Find word similarities to burrito
print(w2v_model.wv.most_similar('burrito', topn=5))

[('time', 0.9871467351913452), ('us', 0.9870790839195251), ('plate', 0.9870587587356567), ('make', 0.9868842363357544), ('take', 0.9868794083595276)]


The only word that has a direct connection to burrito is plate as that is an official menu item. I'm curious in what context time is associated, whether that means burritos take a while or come quickly.

In [28]:
# Save the model
w2v_model.save('Arriba-w2v.bin')

# Load the saved model
saved_model = gensim.models.Word2Vec.load('Arriba-w2v.bin')