In [46]:
from gensim.parsing.preprocessing import preprocess_documents, remove_stopwords, remove_short_tokens
from gensim.parsing.preprocessing import strip_punctuation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
from datetime import datetime
import pandas as pd

In [47]:
df = pd.read_csv('./data/wine_reviews.csv')

In [48]:
df['description'][10]

'Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.'

In [49]:
df['description_clean'] = df['description'].apply(lambda x: x.lower())
df['description_clean'] = df['description_clean'].apply(lambda x: remove_stopwords(x))
df['description_clean'] = df['description_clean'].apply(lambda x: strip_punctuation(x))
df['description_clean'] = df['description_clean'].apply(lambda x: word_tokenize(x))
df['description_clean'] = df['description_clean'].apply(lambda x: [wnl.lemmatize(i) for i in x])
df['description_clean'] = df['description_clean'].apply(lambda x: remove_short_tokens(x))

In [51]:
df['description_clean'][10]

['soft',
 'supple',
 'plum',
 'envelope',
 'oaky',
 'structure',
 'cabernet',
 'supported',
 'merlot',
 'coffee',
 'chocolate',
 'complete',
 'picture',
 'finishing',
 'strong',
 'end',
 'resulting',
 'value',
 'priced',
 'wine',
 'attractive',
 'flavor',
 'immediate',
 'accessibility']

In [52]:
splitted_texts = df['description_clean'].tolist()
#splitted_texts = text
# preprocess_documents(text))

In [53]:
idx = [str(i) for i in range(len(splitted_texts))]

tagged_documents = []
for i in range(len(splitted_texts)):
    tagged_documents.append(TaggedDocument(splitted_texts[i], [idx[i]]))

In [54]:
tagged_documents[10]

TaggedDocument(words=['soft', 'supple', 'plum', 'envelope', 'oaky', 'structure', 'cabernet', 'supported', 'merlot', 'coffee', 'chocolate', 'complete', 'picture', 'finishing', 'strong', 'end', 'resulting', 'value', 'priced', 'wine', 'attractive', 'flavor', 'immediate', 'accessibility'], tags=['10'])

In [55]:
#model = Doc2Vec(tagged_documents, vector_size=100, window=5, min_count=20, workers=-1)
#model.build_vocab(tagged_documents)

def build_model(max_epochs, vec_size, alpha, tagged_documents):
    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)

    model.build_vocab(tagged_documents)

    for epoch in range(max_epochs):
        print(f"Iteration {epoch}")
        model.train(tagged_documents,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)

        model.alpha -= 0.0002
        model.min_alpha = model.alpha
        
    
    return model

In [56]:
model = build_model(max_epochs=1, vec_size=50, alpha=0.025, tagged_documents=tagged_documents)

Iteration 0


In [57]:
model.save('./models/' + (datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S')) + '_doc2vec.model')

In [58]:
# Create new sentence and vectorize it. 
new_sentence = "Attractively ripe, this has fruity red-berry flavors along with acidity and soft tannins. This is ready to drink now."
original = new_sentence
#new_sentence_vectorized = model.infer_vector(new_sentence)

In [59]:
new_sentence = new_sentence.lower()
new_sentence = remove_stopwords(new_sentence)
new_sentence = strip_punctuation(new_sentence)
new_sentence = word_tokenize(new_sentence)
new_sentence = [wnl.lemmatize(i) for i in new_sentence]
new_sentence = remove_short_tokens(new_sentence)
new_sentence_vectorized = model.infer_vector(new_sentence)
new_sentence_vectorized

array([ 0.17593654,  0.06928904, -0.0407874 , -0.02585632,  0.09299903,
        0.08303221, -0.02629865, -0.06440509,  0.22314161, -0.04408715,
       -0.06187092,  0.16877143, -0.01137235,  0.06033728,  0.24866566,
       -0.11874454, -0.14759423,  0.21945474, -0.09404378, -0.09669308,
       -0.03852752, -0.03735104, -0.3406797 ,  0.23288187, -0.0580943 ,
       -0.24986836,  0.10389578, -0.4059953 , -0.04941511, -0.14741573,
        0.22339599,  0.03376621, -0.01145129,  0.07112709, -0.13794322,
       -0.25795802, -0.04731727,  0.25738806,  0.01081435, -0.04122727,
        0.21968582, -0.0420948 , -0.07491214, -0.04132482, -0.17815545,
       -0.28923523, -0.03412611,  0.00053623,  0.13627923,  0.00887265],
      dtype=float32)

In [60]:
# Calculate cosine similarity. 
similar_sentences = model.dv.most_similar(positive=[new_sentence_vectorized])

In [61]:
similar_sentences

[('69488', 0.8108602166175842),
 ('19548', 0.7896836996078491),
 ('12068', 0.7812333703041077),
 ('27404', 0.7793558239936829),
 ('109314', 0.7768636345863342),
 ('68494', 0.7756578326225281),
 ('26807', 0.7743197679519653),
 ('32230', 0.7661463618278503),
 ('45901', 0.7648536562919617),
 ('109057', 0.7637451887130737)]

In [62]:
print('Original: ', original)
print('Simular:', df['description'][24997])

Original:  Attractively ripe, this has fruity red-berry flavors along with acidity and soft tannins. This is ready to drink now.
Simular: This wine is all bright cherry fruit with juicy acidity and a jammy character. It's delicious now, with its light tannins and bright raspberry aftertaste. Drink now.


In [63]:
model.wv.similar_by_word('berry')

[('blackberry', 0.803281843662262),
 ('raspberry', 0.7911970615386963),
 ('strawberry', 0.7140820622444153),
 ('cherry', 0.6910226941108704),
 ('blueberry', 0.6895474791526794),
 ('licorice', 0.6871267557144165),
 ('black', 0.6633455753326416),
 ('plum', 0.6577023863792419),
 ('chewy', 0.6408059597015381),
 ('cranberry', 0.6339317560195923)]

In [64]:
model.wv.similar_by_word('fruity')

[('attractive', 0.7684898376464844),
 ('fruitiness', 0.7665814757347107),
 ('ripe', 0.7274746894836426),
 ('rounded', 0.7236625552177429),
 ('juicy', 0.7044587135314941),
 ('perfumed', 0.6998727321624756),
 ('flavored', 0.6990644931793213),
 ('fruit', 0.6893908977508545),
 ('generous', 0.6699721217155457),
 ('crisp', 0.6698552966117859)]

In [65]:
model.wv.similar_by_word('acidity')

[('acid', 0.7812641859054565),
 ('flavor', 0.6918242573738098),
 ('fruitiness', 0.6862507462501526),
 ('richness', 0.6733599305152893),
 ('tangy', 0.6718690991401672),
 ('character', 0.6660929322242737),
 ('juicy', 0.6458977460861206),
 ('aftertaste', 0.6450532078742981),
 ('minerality', 0.6412492394447327),
 ('ripe', 0.6358282566070557)]

In [66]:
model.wv.similar_by_word('delicious')

[('rich', 0.7972066402435303),
 ('ripe', 0.7590669393539429),
 ('great', 0.7228766083717346),
 ('attractive', 0.7223926782608032),
 ('ready', 0.7017973065376282),
 ('fine', 0.6962223052978516),
 ('complex', 0.69059157371521),
 ('luscious', 0.6774658560752869),
 ('juicy', 0.6754449009895325),
 ('generous', 0.6717314124107361)]