#  <b> Doc2Vec - Hotel Reviews
   

### <b> Setup 

In [49]:
import pandas as pd

import gensim 
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### <b> Data Loading


In [19]:
data = pd.read_csv('../datasets/hotel-reviews/hotel_reviews.csv')

### <b> Exploratory Data Analysis

In [20]:
data.sample(2)

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,primaryCategories,city,country,keys,latitude,...,reviews.dateSeen,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sourceURLs,websites
8927,AVwdmGkmIN2L1WUfxXj-,2017-04-18T11:03:16Z,2018-12-28T06:33:37Z,330 Magazine St,"Hotels,Lodging,Hotel",Accommodation & Food Services,New Orleans,US,us/la/neworleans/330magazinest/856161073,29.94983,...,2018-08-26T00:00:00Z,3,https://www.tripadvisor.com/Hotel_Review-g6086...,Booked with a third party service-checking in ...,Average service great location,Dallas,Texas,strucks,https://www.tripadvisor.com/Hotel_Review-g6086...,https://www.choicehotels.com/louisiana/new-orl...
9191,AVz1iS0Q-gnIPe8DUYOb,2017-06-29T20:27:06Z,2018-12-21T05:55:11Z,800 Fairview Ave N,"Hotels,Lodging,Hotel",Accommodation & Food Services,Seattle,US,us/wa/seattle/800fairviewaven/530809159,47.626358,...,2018-12-14T00:00:00Z,3,https://www.tripadvisor.com/Hotel_Review-g6087...,"Hotel is ok, rooms are nice and big. They have...",Beware Priceline People,Vancouver,Canada,Rich984,https://www.tripadvisor.com/Hotel_Review-g6087...,http://www.marriott.com/hotels/travel/sealu-re...


In [21]:
data.shape

(10000, 26)

In [22]:
data['reviews.text'].head()

0    This hotel was nice and quiet. Did not know, t...
1    We stayed in the king suite with the separatio...
2    Parking was horrible, somebody ran into my ren...
3    Not cheap but excellent location. Price is som...
4    If you get the room that they advertised on th...
Name: reviews.text, dtype: object

### <b> Data Processing

### <b> Doc2Vec

In [36]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sentences)]

In [37]:
tagged_data[0]

TaggedDocument(words=['currently', 'in', 'bed', 'writing', 'this', 'for', 'the', 'past', 'hr', '1/2', 'there', 'have', 'been', 'dogs', 'barking', 'and', 'squealing', 'call', 'the', 'front', 'desk', 'to', 'advise', 'basically', 'to', 'be', 'told', 'there', "'s", 'nothing', 'they', 'can', 'do', '.', '315.00', 'and', 'i', 'ca', "n't", 'sleep', '.'], tags=['0'])

In [40]:
max_epochs = 10
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(tagged_data)

In [42]:
for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

  epochs=model.iter)


Model Saved


In [43]:
model = Doc2Vec.load("d2v.model")

In [76]:
test_data = word_tokenize("currently bed writing past hr dogs barking squealing call front desk advise basically told nothing ca sleep ".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

V1_infer [ 0.02836782 -0.55253285  0.39682478 -0.4584763  -0.05888359  0.07548706
  0.22736174 -0.13753387  0.38355336 -0.37164566 -0.12517379  0.3828148
 -0.24724478  0.25117028 -0.03419457 -0.29544836 -0.22575805  0.03547256
 -0.21079966  0.10363379]


In [77]:
# find most similar doc 
test_doc = word_tokenize("currently bed writing past hr dogs barking squealing call front desk advise basically told nothing ca sleep".lower())
model.docvecs.most_similar(positive=[model.infer_vector(test_doc)], topn = 5)

[('751', 0.8144057393074036),
 ('1818', 0.8105208277702332),
 ('886', 0.7998476028442383),
 ('1901', 0.795726478099823),
 ('624', 0.7893334031105042)]

In [87]:
' '.join(test_doc)

'currently bed writing past hr dogs barking squealing call front desk advise basically told nothing ca sleep'

In [90]:
' '.join(tagged_data[624].words)



### <b> Results 

#### <b> Visualizing Embeddings

### <b> Post Processing 

Words frequency by classification, TSNE etc...

### <b> Conclusion