In [19]:
!pip install gensim nltk pandas



In [20]:
# Import gensim modules.
#from gensim.test.utils import common_texts
from gensim.parsing.preprocessing import preprocess_documents, remove_stopwords, remove_short_tokens
from gensim.parsing.preprocessing import strip_punctuation
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
from datetime import datetime
import pandas as pd

In [21]:
df = pd.read_csv('./data/wine_reviews.csv')

In [22]:
df['description'][10]

"Elegance, complexity and structure come together in this drop-dead gorgeous winethat ranks among Italy's greatest whites. It opens with sublime yellow spring flower, aromatic herb and orchard fruit scents. The creamy, delicious palate seamlessly combines juicy white peach, ripe pear and citrus flavors while white almond and savory mineral notes grace the lingering finish."

In [23]:
df['description_clean'] = df['description'].apply(lambda x: x.lower())
df['description_clean'] = df['description_clean'].apply(lambda x: remove_stopwords(x))
df['description_clean'] = df['description_clean'].apply(lambda x: strip_punctuation(x))
df['description_clean'] = df['description_clean'].apply(lambda x: word_tokenize(x))
df['description_clean'] = df['description_clean'].apply(lambda x: [wnl.lemmatize(i) for i in x])
df['description_clean'] = df['description_clean'].apply(lambda x: remove_short_tokens(x))

In [24]:
wnl = WordNetLemmatizer()
a = 'flavors in my gardens'
a = word_tokenize(a)
[wnl.lemmatize(i) for i in a]

['flavor', 'in', 'my', 'garden']

In [25]:
df['description_clean'][10]

['elegance',
 'complexity',
 'structure',
 'come',
 'drop',
 'dead',
 'gorgeous',
 'winethat',
 'rank',
 'italy',
 'greatest',
 'white',
 'open',
 'sublime',
 'yellow',
 'spring',
 'flower',
 'aromatic',
 'herb',
 'orchard',
 'fruit',
 'scent',
 'creamy',
 'delicious',
 'palate',
 'seamlessly',
 'combine',
 'juicy',
 'white',
 'peach',
 'ripe',
 'pear',
 'citrus',
 'flavor',
 'white',
 'almond',
 'savory',
 'mineral',
 'note',
 'grace',
 'lingering',
 'finish']

In [26]:
splitted_texts = df['description_clean'].tolist()
#splitted_texts = text
# preprocess_documents(text))

In [27]:
splitted_texts

[['tremendous',
  '100',
  'varietal',
  'wine',
  'hail',
  'oakville',
  'aged',
  'year',
  'oak',
  'juicy',
  'red',
  'cherry',
  'fruit',
  'compelling',
  'hint',
  'caramel',
  'greet',
  'palate',
  'framed',
  'elegant',
  'fine',
  'tannin',
  'subtle',
  'minty',
  'tone',
  'background',
  'balanced',
  'rewarding',
  'start',
  'finish',
  'year',
  'ahead',
  'develop',
  'nuance',
  'enjoy',
  '2022–2030'],
 ['ripe',
  'aroma',
  'fig',
  'blackberry',
  'cassis',
  'softened',
  'sweetened',
  'slathering',
  'oaky',
  'chocolate',
  'vanilla',
  'full',
  'layered',
  'intense',
  'cushioned',
  'palate',
  'rich',
  'flavor',
  'chocolaty',
  'black',
  'fruit',
  'baking',
  'spice',
  'toasty',
  'everlasting',
  'finish',
  'heady',
  'ideally',
  'balanced',
  'drink',
  '2023'],
 ['mac',
  'watson',
  'honor',
  'memory',
  'wine',
  'mother',
  'tremendously',
  'delicious',
  'balanced',
  'complex',
  'botrytised',
  'white',
  'dark',
  'gold',
  'color',
 

In [28]:
idx = [str(i) for i in range(len(splitted_texts))]

tagged_documents = []
for i in range(len(splitted_texts)):
    tagged_documents.append(TaggedDocument(splitted_texts[i], [idx[i]]))

In [29]:
tagged_documents

[TaggedDocument(words=['tremendous', '100', 'varietal', 'wine', 'hail', 'oakville', 'aged', 'year', 'oak', 'juicy', 'red', 'cherry', 'fruit', 'compelling', 'hint', 'caramel', 'greet', 'palate', 'framed', 'elegant', 'fine', 'tannin', 'subtle', 'minty', 'tone', 'background', 'balanced', 'rewarding', 'start', 'finish', 'year', 'ahead', 'develop', 'nuance', 'enjoy', '2022–2030'], tags=['0']),
 TaggedDocument(words=['ripe', 'aroma', 'fig', 'blackberry', 'cassis', 'softened', 'sweetened', 'slathering', 'oaky', 'chocolate', 'vanilla', 'full', 'layered', 'intense', 'cushioned', 'palate', 'rich', 'flavor', 'chocolaty', 'black', 'fruit', 'baking', 'spice', 'toasty', 'everlasting', 'finish', 'heady', 'ideally', 'balanced', 'drink', '2023'], tags=['1']),
 TaggedDocument(words=['mac', 'watson', 'honor', 'memory', 'wine', 'mother', 'tremendously', 'delicious', 'balanced', 'complex', 'botrytised', 'white', 'dark', 'gold', 'color', 'layer', 'toasted', 'hazelnut', 'pear', 'compote', 'orange', 'peel', '

In [30]:
#model = Doc2Vec(tagged_documents, vector_size=100, window=5, min_count=20, workers=-1)
#model.build_vocab(tagged_documents)

def build_model(max_epochs, vec_size, alpha, tagged_documents):
    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)

    model.build_vocab(tagged_documents)

    for epoch in range(max_epochs):
        print(f"Iteration {epoch}")
        model.train(tagged_documents,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)

        model.alpha -= 0.0002
        model.min_alpha = model.alpha
        
    if epoch % 5 == 0:
        model.save('./models/' + (datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S')) + '_doc2vec.model')
    # Now simply save the model to avoid training again.
        print("Model Saved")
    return model

In [39]:
model.save('./models/' + (datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S')) + '_doc2vec.model')

In [31]:
model = build_model(max_epochs=100, vec_size=50, alpha=0.025, tagged_documents=tagged_documents)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration

In [1]:
# Create new sentence and vectorize it. 
new_sentence = "Attractively ripe, this has fruity red-berry flavors along with acidity and soft tannins. This is ready to drink now."
original = new_sentence
#new_sentence_vectorized = model.infer_vector(new_sentence)

In [76]:
new_sentence = new_sentence.lower()
new_sentence = remove_stopwords(new_sentence)
new_sentence = strip_punctuation(new_sentence)
new_sentence = word_tokenize(new_sentence)
new_sentence = [wnl.lemmatize(i) for i in new_sentence]
new_sentence = remove_short_tokens(new_sentence)
new_sentence_vectorized = model.infer_vector(new_sentence)
new_sentence_vectorized

array([ 0.42708707,  0.13602328, -0.00842261, -0.2507622 , -0.19110666,
        0.26104787,  0.20333648, -0.5840064 ,  0.5691965 , -0.0132357 ,
       -0.24658507,  0.27887246, -0.72025955,  0.06414805,  0.36528012,
       -0.01352325, -0.25845343,  0.08424725,  0.2627285 ,  0.39538583,
       -0.2484346 , -0.1523786 , -0.51817626,  0.07394347,  0.09403346,
        0.14922005,  0.20015298,  0.02082809,  0.23296195, -0.43294355,
        0.14511514, -0.12593505, -0.14489311, -0.01729424,  0.2973799 ,
       -0.23094133, -0.11227418,  0.26154986,  0.20294203,  0.14796463,
        0.09815811, -0.44417167,  0.23393953,  0.41998214, -0.74942917,
        0.12676084,  0.08275799,  0.628936  , -0.0729911 , -0.06741878],
      dtype=float32)

In [77]:

# Calculate cosine similarity. 
similar_sentences = model.dv.most_similar(positive=[new_sentence_vectorized])

In [78]:
similar_sentences

[('143335', 0.7215730547904968),
 ('71005', 0.7109041213989258),
 ('85698', 0.687388002872467),
 ('13683', 0.6609319448471069),
 ('33682', 0.6599031090736389),
 ('43473', 0.6563486456871033),
 ('87363', 0.6527281403541565),
 ('19437', 0.6328625082969666),
 ('48442', 0.6312682628631592),
 ('78267', 0.630492091178894)]

In [82]:
print('Original: ', original)
print('Simular:', df['description'][13683])

Original:  Attractively ripe, this has fruity red-berry flavors along with acidity and soft tannins. This is ready to drink now.
Simular: A soft, gentle medium-weight wine with attractive acidity, fresh red berry fruits and light tannins. Ready to drink.


In [64]:
model.wv.similar_by_word('wine')

[('flavor', 0.7637519240379333),
 ('palate', 0.7583109736442566),
 ('aroma', 0.7338285446166992),
 ('acidity', 0.7063391804695129),
 ('finish', 0.699916422367096),
 ('fruit', 0.696194589138031),
 ('nose', 0.675162672996521),
 ('producer', 0.6573511362075806),
 ('cherry', 0.6126402616500854),
 ('viognier', 0.610636830329895)]

In [38]:
# Output
output = []
for i, v in enumerate(similar_sentences):
    index = v[0]
    output.append([i + 1, df['description'][index], df['designation'][index], v[1]])

KeyError: '118692'

In [None]:
pd.DataFrame(output, columns=["rank", "description", "designation", "cosine_similarity"])

In [None]:
df['description'][118274]

In [None]:
model.wv.similar_by_word('price')