In [1]:
import os
from argparse import Namespace
import copy
import gensim
from gensim.models import Word2Vec
import json
import nltk; nltk.download('punkt')
import numpy as np
import pandas as pd
import re
import urllib

[nltk_data] Downloading package punkt to /Users/jiachenx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Run configuration: every tunable value used by the cells below lives here.
args = Namespace(**{
    "seed": 1234,                                  # random seed for the run
    "data_file": "restautant_review_corpus.csv",   # CSV holding the review corpus
    "embedding_dim": 100,                          # passed to Word2Vec as `size`
    "window": 3,                                   # passed to Word2Vec as `window`
    "min_count": 2,                                # passed to Word2Vec as `min_count`
    "skip_gram": 0,                                # passed to Word2Vec as `sg` (gensim: 0 = CBOW, 1 = skip-gram)
    "negative_sampling": 20,                       # passed to Word2Vec as `negative`
})

In [3]:
# Load the review corpus and discard every row that has a missing value in
# any column.
# dropna() replaces the earlier drop(index=np.where(pd.isnull(...))[0]) form:
# np.where yields row *positions* while drop(index=...) matches index *labels*,
# so that form is only correct while the frame carries a default RangeIndex.
# dropna() gives the same rows here and also avoids in-place mutation.
corpus_df = pd.read_csv(args.data_file).dropna()

# Plain Python list of review texts; this is what the preprocessing cell consumes.
train_data_list = corpus_df['reviewContent'].to_list()
print(len(train_data_list))
print(train_data_list[11])
print(type(train_data_list))

26956
Today marks one year since my dinner at Alinea. It was the finest dining experience of my life and it lived up to all of my lofty expectations. In hindsight, it is not the culinary party tricks or the use of technology that still lingers in the forefront of my mind (although each innovative technique employed delighted me and served to advance each dish). My memories fall back on the bright, harmonious flavors and the impeccable (at at times, playful) service. To borrow from Michelin's own rubric, Alinea provided me "exceptional cuisine, worth a special journey." So much so that I I cannot imagine another trip to Chicago without partaking of what ever joys that Achatz may have in store.
<class 'list'>


In [4]:
# Preprocessing
def preprocess_text(text):
    """Normalise one raw review into a lowercase, space-separated string.

    Steps, in order:
      1. Lowercase the text.
      2. Surround each of ``. , ! ?`` with spaces so it becomes its own token.
      3. Replace every run of characters other than ASCII letters and those
         four punctuation marks (digits, apostrophes, whitespace, ...) with a
         single space.
      4. Trim leading and trailing whitespace.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string; it may be empty when the input holds no letters
        or kept punctuation.
    """
    lowered = text.lower()
    # Isolate sentence punctuation first; the extra spaces this introduces are
    # collapsed by the next substitution.
    padded = re.sub(r"([.,!?])", r" \1 ", lowered)
    collapsed = re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)
    return collapsed.strip()

In [5]:
# Clean sentences: run every raw review through preprocess_text.
sentences = list(map(preprocess_text, train_data_list))
print(sentences[11])

today marks one year since my dinner at alinea . it was the finest dining experience of my life and it lived up to all of my lofty expectations . in hindsight , it is not the culinary party tricks or the use of technology that still lingers in the forefront of my mind although each innovative technique employed delighted me and served to advance each dish . my memories fall back on the bright , harmonious flavors and the impeccable at at times , playful service . to borrow from michelin s own rubric , alinea provided me exceptional cuisine , worth a special journey . so much so that i i cannot imagine another trip to chicago without partaking of what ever joys that achatz may have in store .


In [6]:
# Process sentences for gensim: turn each cleaned string into a list of tokens.
# split() with no argument is used instead of split(" "): preprocess_text can
# return an empty string (e.g. a review with no letters or kept punctuation),
# and "".split(" ") would give [''] -- a bogus empty-string token -- whereas
# "".split() gives []. For the single-space-separated strings produced by
# preprocess_text the two calls are otherwise identical.
sentences = [sentence.split() for sentence in sentences]
print(sentences[11])

['today', 'marks', 'one', 'year', 'since', 'my', 'dinner', 'at', 'alinea', '.', 'it', 'was', 'the', 'finest', 'dining', 'experience', 'of', 'my', 'life', 'and', 'it', 'lived', 'up', 'to', 'all', 'of', 'my', 'lofty', 'expectations', '.', 'in', 'hindsight', ',', 'it', 'is', 'not', 'the', 'culinary', 'party', 'tricks', 'or', 'the', 'use', 'of', 'technology', 'that', 'still', 'lingers', 'in', 'the', 'forefront', 'of', 'my', 'mind', 'although', 'each', 'innovative', 'technique', 'employed', 'delighted', 'me', 'and', 'served', 'to', 'advance', 'each', 'dish', '.', 'my', 'memories', 'fall', 'back', 'on', 'the', 'bright', ',', 'harmonious', 'flavors', 'and', 'the', 'impeccable', 'at', 'at', 'times', ',', 'playful', 'service', '.', 'to', 'borrow', 'from', 'michelin', 's', 'own', 'rubric', ',', 'alinea', 'provided', 'me', 'exceptional', 'cuisine', ',', 'worth', 'a', 'special', 'journey', '.', 'so', 'much', 'so', 'that', 'i', 'i', 'cannot', 'imagine', 'another', 'trip', 'to', 'chicago', 'without'

In [7]:
# Spot-check one tokenised review.
sentences[9000]

['it',
 'doesnt',
 'get',
 'much',
 'nicer',
 'then',
 'this',
 ',',
 'i',
 'hope',
 'i',
 'can',
 'actually',
 'afford',
 'to',
 'shop',
 'there',
 'at',
 'some',
 'point',
 'in',
 'my',
 'life',
 ',',
 'they',
 'have',
 'an',
 'amazing',
 'selection',
 '.',
 'the',
 'prepared',
 'food',
 'from',
 'the',
 'counter',
 'is',
 'of',
 'the',
 'highest',
 'quality',
 ',',
 'and',
 'the',
 'cafe',
 'is',
 'damn',
 'good',
 'too',
 '.',
 '.',
 '.',
 'they',
 'even',
 'carry',
 'kobe',
 'beef',
 ',',
 'ive',
 'never',
 'even',
 'seen',
 'it',
 'in',
 'person',
 'before',
 'and',
 'i',
 'am',
 'in',
 'culinary',
 'school',
 '!',
 '!',
 '!']

In [8]:
# Train Word2Vec model with sentences
# seed=args.seed applies the seed declared in the run configuration; it was
# previously defined but never passed, so gensim silently used its own default.
# NOTE: per the gensim docs, a fully deterministic run additionally needs
# workers=1 and a fixed PYTHONHASHSEED; neither is forced here, to keep
# training multi-threaded.
# NOTE(review): `size` is the gensim 3.x keyword (matches the recorded output);
# gensim 4 renamed it to `vector_size` -- adjust if the environment is upgraded.
model = Word2Vec(sentences=sentences, size=args.embedding_dim,
                 window=args.window, min_count=args.min_count,
                 sg=args.skip_gram, negative=args.negative_sampling,
                 seed=args.seed)
print(model)

Word2Vec(vocab=22523, size=100, alpha=0.025)


In [9]:
# Sanity check: the five words whose vectors are closest to "have".
model.wv.most_similar(positive=["have"], topn=5)

[('ve', 0.7855691909790039),
 ('ive', 0.6333054304122925),
 ('havent', 0.5213067531585693),
 ('haven', 0.5129066705703735),
 ('having', 0.4656580686569214)]

In [10]:
# Persist the trained model to disk so it can be reloaded without retraining.
model_path = 'plp.w2v.model'
model.save(model_path)