<a href="https://colab.research.google.com/github/vifirsanova/compling/blob/main/word2vec/gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

In [7]:
file_path = 'rainbowvalley_pars.txt'

In [8]:
with open("rainbowvalley_pars.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [9]:
sentences = [simple_preprocess(sentence) for sentence in text.splitlines() if sentence]

In [10]:
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

In [12]:
model.wv.most_similar("anne", topn=5)

[('susan', 0.9997088313102722),
 ('my', 0.9996336102485657),
 ('jerry', 0.9996272325515747),
 ('or', 0.9996233582496643),
 ('such', 0.9995993375778198)]

In [13]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
sentences = [
    [word for word in simple_preprocess(sentence) if word not in stop_words]
    for sentence in text.splitlines() if sentence
]

In [None]:
model = Word2Vec(sentences, vector_size=150, window=10, min_count=5, workers=4)

In [None]:
model.wv.most_similar("cat", topn=20)

[('said', 0.9718322157859802),
 ('little', 0.9703713655471802),
 ('alice', 0.9700632691383362),
 ('would', 0.9621873497962952),
 ('like', 0.962085485458374),
 ('one', 0.9614766836166382),
 ('began', 0.9606907367706299),
 ('say', 0.9589781761169434),
 ('moment', 0.95771723985672),
 ('voice', 0.9570156335830688),
 ('back', 0.9562947750091553),
 ('went', 0.9559786915779114),
 ('know', 0.9552865624427795),
 ('great', 0.9550570249557495),
 ('could', 0.9549130201339722),
 ('another', 0.954680860042572),
 ('eyes', 0.954626202583313),
 ('next', 0.9545128345489502),
 ('quite', 0.9543935656547546),
 ('looked', 0.9543210864067078)]

In [15]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

In [16]:
dictionary = Dictionary(sentences)
corpus = [dictionary.doc2bow(sentence) for sentence in sentences]

In [17]:
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

In [18]:
corpus

[[(0, 1)],
 [(1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1)],
 [(13, 1), (14, 1)],
 [(15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 2),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1)],
 [],
 [(48, 2), (49, 2), (50, 1)],
 [(51, 1)],
 [],
 [],
 [(10, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)],
 [(59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)],
 [(67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1)],
 [(57, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1)],
 [(84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1)],
 [(35, 1),
  (79, 1)

In [19]:
topics = lda_model.print_topics(num_words=5)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.028*"miss" + 0.027*"said" + 0.015*"cornelia" + 0.013*"think" + 0.011*"us"
Topic 1: 0.029*"rosemary" + 0.020*"ellen" + 0.019*"meredith" + 0.016*"west" + 0.015*"said"
Topic 2: 0.016*"felt" + 0.011*"never" + 0.011*"marry" + 0.010*"see" + 0.010*"little"
Topic 3: 0.020*"mrs" + 0.018*"like" + 0.016*"valley" + 0.016*"mary" + 0.013*"manse"
Topic 4: 0.023*"would" + 0.022*"father" + 0.015*"said" + 0.014*"una" + 0.012*"faith"


In [20]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [21]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)

pyLDAvis.display(lda_display)

In [22]:
pyLDAvis.save_html(lda_display, 'lda_visualization.html')

  and should_run_async(code)
