# Word2Vec - własny model do klasyfikacji opinii

In [1]:
import gzip
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec, LdaMulticore

from gensim.corpora import Dictionary
import numpy as np
import pyLDAvis
from pyLDAvis import gensim

  return f(*args, **kwds)


In [2]:
ls -lh data

total 83M
-rwxr--r-- 1 cherit users 83M Sep  3 10:30 [0m[01;32mreviews_data.txt.gz[0m*


In [10]:
def preprocess_sentence(sentence):
    return [word for word in simple_preprocess(sentence) if word not in STOPWORDS]

def read_sentences(filename):
    with gzip.open(filename, 'rb') as f:
        for line in f:
            yield preprocess_sentence(line)
sentences = read_sentences('data/reviews_data.txt.gz')

In [11]:
%time sentences = list(read_sentences('data/reviews_data.txt.gz'))

CPU times: user 44.2 s, sys: 655 ms, total: 44.9 s
Wall time: 45.1 s


In [12]:
len(sentences)

255404

# Model

In [14]:
%time model = Word2Vec(sentences, size=100, window=5, min_count=2)

CPU times: user 3min 9s, sys: 616 ms, total: 3min 9s
Wall time: 1min 6s


In [None]:
model = Word2Vec.load('models/hotel_model_size100_window10_mincount2.model')

# Reprezentacja wektorowa

In [16]:
model.wv.most_similar('good')

[('decent', 0.8306679129600525),
 ('great', 0.8200665712356567),
 ('excellent', 0.8017496466636658),
 ('ok', 0.6599850058555603),
 ('fair', 0.6276437044143677),
 ('nice', 0.626711368560791),
 ('average', 0.620361328125),
 ('reasonable', 0.6160991191864014),
 ('terrific', 0.6122583150863647),
 ('okay', 0.6103115677833557)]

In [17]:
model.wv.most_similar('bad')

[('terrible', 0.7221000790596008),
 ('horrible', 0.6743139028549194),
 ('awful', 0.6672260165214539),
 ('negative', 0.6333588361740112),
 ('disliked', 0.5975297689437866),
 ('poor', 0.5917552709579468),
 ('positive', 0.5701280236244202),
 ('ok', 0.5510519742965698),
 ('okay', 0.5496043562889099),
 ('lousy', 0.549447774887085)]

In [18]:
model.wv.most_similar(positive=['bad','terrible'], negative=['good'])

[('horrible', 0.8446500897407532),
 ('awful', 0.8094870448112488),
 ('horrific', 0.7092013359069824),
 ('dreadful', 0.7034502625465393),
 ('shocking', 0.6870557069778442),
 ('horrendous', 0.6752603054046631),
 ('appalling', 0.6659291386604309),
 ('worst', 0.6443156003952026),
 ('horrid', 0.6431555151939392),
 ('miserable', 0.6393977999687195)]

# Topic modeling - LDA

In [21]:
sentences_light = np.random.permutation(sentences)

In [22]:
sentences_light = sentences_light[:1000]

In [27]:
%time dictionary = Dictionary(sentences_light)

CPU times: user 84.8 ms, sys: 0 ns, total: 84.8 ms
Wall time: 83.8 ms


In [29]:
list(dictionary.values())

['adequatet',
 'available',
 'bart',
 'bathroom',
 'beautiful',
 'best',
 'breakfast',
 'bus',
 'business',
 'car',
 'channels',
 'choice',
 'choose',
 'clean',
 'conduct',
 'continental',
 'couse',
 'day',
 'decided',
 'definitely',
 'distance',
 'drug',
 'famous',
 'floor',
 'francisco',
 'free',
 'good',
 'great',
 'grocery',
 'helpful',
 'history',
 'hotel',
 'inn',
 'juices',
 'larger',
 'limited',
 'list',
 'lobby',
 'local',
 'location',
 'major',
 'mall',
 'monticello',
 'near',
 'needed',
 'nice',
 'overall',
 'people',
 'provided',
 'quiet',
 'refrigerator',
 'research',
 'restaurants',
 'room',
 'san',
 'shops',
 'short',
 'small',
 'sparse',
 'staff',
 'store',
 'stores',
 'th',
 'think',
 'time',
 'travel',
 'trolley',
 'tv',
 'updated',
 'variety',
 'walking',
 'wireless',
 'wish',
 'wrong',
 'klasse',
 'mar',
 'preis',
 'bon',
 'emplacement',
 'htel',
 'idal',
 'impecables',
 'pour',
 'prestations',
 'russir',
 'sjour',
 'son',
 'trs',
 'apologize',
 'apologizing',
 'arr

In [30]:
len(dictionary)

8969

In [31]:
dictionary.doc2bow(['car'])

[(9, 1)]

In [32]:
dictionary.token2id['car']

9

In [34]:
%time bow_corpus = [dictionary.doc2bow(sent) for sent in sentences_light]

CPU times: user 61.5 ms, sys: 0 ns, total: 61.5 ms
Wall time: 61.1 ms


In [35]:
%time lda_model = LdaMulticore(bow_corpus, id2word=dictionary, num_topics=20, passes=20, workers=8)

CPU times: user 29.8 s, sys: 1.62 s, total: 31.4 s
Wall time: 18.2 s


In [36]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {}, \nWords: {}'.format(idx, topic))

Topic: 0, 
Words: 0.031*"hotel" + 0.015*"room" + 0.009*"breakfast" + 0.008*"rooms" + 0.007*"stay" + 0.007*"staff" + 0.006*"night" + 0.006*"great" + 0.006*"stayed" + 0.005*"walk"
Topic: 1, 
Words: 0.017*"hotel" + 0.012*"room" + 0.008*"quot" + 0.007*"stayed" + 0.007*"rooms" + 0.006*"good" + 0.006*"new" + 0.006*"staff" + 0.005*"second" + 0.005*"comfortable"
Topic: 2, 
Words: 0.031*"hotel" + 0.014*"stay" + 0.012*"room" + 0.009*"staff" + 0.009*"rooms" + 0.009*"nice" + 0.008*"service" + 0.008*"great" + 0.007*"night" + 0.006*"amp"
Topic: 3, 
Words: 0.028*"hotel" + 0.019*"room" + 0.017*"location" + 0.016*"great" + 0.013*"stay" + 0.012*"staff" + 0.011*"rooms" + 0.009*"stayed" + 0.008*"nice" + 0.007*"good"
Topic: 4, 
Words: 0.034*"hotel" + 0.018*"room" + 0.015*"location" + 0.011*"great" + 0.007*"london" + 0.006*"staff" + 0.006*"stayed" + 0.006*"service" + 0.006*"stay" + 0.006*"bed"
Topic: 5, 
Words: 0.018*"hotel" + 0.012*"room" + 0.011*"service" + 0.008*"stay" + 0.008*"time" + 0.006*"staff" + 0.

# Wizualizacja tematów

In [37]:
lda_vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(lda_vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
