In [22]:
import gensim

assert gensim.models.doc2vec.FAST_VERSION > -1

In [23]:
import gensim
import gensim.test.utils

# Set file names for train and test data
lee_train_file = gensim.test.utils.datapath('lee_background.cor')
lee_test_file = gensim.test.utils.datapath('lee.cor')

In [24]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

print(train_corpus[2])
print(test_corpus[2])

TaggedDocument<['the', 'national', 'road', 'toll', 'for', 'the', 'christmas', 'new', 'year', 'holiday', 'period', 'stands', 'at', 'eight', 'fewer', 'than', 'for', 'the', 'same', 'time', 'last', 'year', 'people', 'have', 'died', 'on', 'new', 'south', 'wales', 'roads', 'with', 'eight', 'fatalities', 'in', 'both', 'queensland', 'and', 'victoria', 'western', 'australia', 'the', 'northern', 'territory', 'and', 'south', 'australia', 'have', 'each', 'recorded', 'three', 'deaths', 'while', 'the', 'act', 'and', 'tasmania', 'remain', 'fatality', 'free'], [2]>
['the', 'united', 'states', 'government', 'has', 'said', 'it', 'wants', 'to', 'see', 'president', 'robert', 'mugabe', 'removed', 'from', 'power', 'and', 'that', 'it', 'is', 'working', 'with', 'the', 'zimbabwean', 'opposition', 'to', 'bring', 'about', 'change', 'of', 'administration', 'as', 'scores', 'of', 'white', 'farmers', 'went', 'into', 'hiding', 'to', 'escape', 'round', 'up', 'by', 'zimbabwean', 'police', 'senior', 'bush', 'administrat

In [25]:
import gensim.models

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model

<gensim.models.doc2vec.Doc2Vec at 0x163a59a50>

In [26]:
model.build_vocab(train_corpus)

In [27]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [28]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x12ec01950>

In [29]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-2.3635106e-01 -3.2055363e-01 -7.6372109e-02  1.1140015e-01
  1.1086052e-01 -1.2266620e-01 -5.4258130e-02  1.6664676e-04
 -3.0656746e-01 -1.5667590e-01  1.7392519e-01 -9.2228957e-02
  1.6458532e-02 -4.1063368e-02 -1.6505621e-01 -1.9231008e-01
  8.3026715e-02  1.7636187e-01  5.5203233e-02 -5.2253019e-02
  5.9847429e-02  1.3428758e-02  1.2728877e-01 -6.4704165e-02
  9.8073110e-03 -9.7042799e-02 -1.9156255e-01 -5.4943562e-02
 -1.0790176e-01 -7.5596102e-02  3.2569042e-01 -5.4641094e-02
  9.8457046e-02  1.0474536e-01  1.5373200e-01  1.0982167e-01
 -1.2166955e-03 -1.6161810e-01 -1.1089631e-01  6.2733620e-02
 -7.4537322e-02 -8.6963400e-02  3.3904400e-02  2.1911137e-02
 -3.7665091e-02  2.6002578e-02 -1.2024457e-01 -3.0789163e-02
  1.4448050e-01 -6.8067797e-02]


In [30]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 291, 1: 9})


In [31]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (37): «robert mugabe strengthened his hold on the zimbabwean government yesterday by retaining the most combative hardliner ministers in cabinet shuffle which offered little hope of moderation of the land seizures and other policies that have kept zimbabwe in crisis and brought international condemnation»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (94, 0.7746660709381104): «foreign minister alexander downer says the commonwealth democracy watchdog should put zimbabwe formally on its agenda in the first step to possible suspension from the organisation mr downer says ministers from the commonwealth ministerial action group cmag should review whether the reported violence and intimidation in zimbabwe means it has violated the commonwealth code of good governance cmag ministers from australia bangladesh barbados botswana britain canada malaysia and nigeria will meet in london tomorrow for talks on zimbabwe in recent meetings they have sus

## Exercises

In [32]:
!pip install fastparquet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


#### Task 0. Train your own doc2vec model on a test dataset. Most of the example files use Parquet file format. A short guide below.

In [47]:
import pandas as pd

file = "test-00000-of-00001 (1).parquet"
data = pd.read_parquet(file)
data

Unnamed: 0,text,label
0,Fears for T N pension after talks Unions repre...,2
1,The Race is On: Second Private Team Sets Launc...,3
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3
...,...,...
7595,Around the world Ukrainian presidential candid...,0
7596,Void is filled with Clement With the supply of...,1
7597,Martinez leaves bitter Like Roger Clemens did ...,1
7598,5 of arthritis patients in Singapore take Bext...,2


In [50]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

In [51]:
# Перетворення текстів у TaggedDocument (токенізовані слова + унікальний ідентифікатор)
tagged_data = [
    TaggedDocument(
        words=simple_preprocess(text),  # Розбиває текст на слова (токени)
        tags=[str(i)]                  # Унікальний тег (тут — індекс рядка)
    )
    for i, text in enumerate(data['text'])
]

In [53]:
# Створюємо та навчаємо модель

# Ініціалізація моделі
model = Doc2Vec(
    vector_size=100,    # Розмір вектора документа
    min_count=2,        # Ігноруємо слова, що зустрічаються < 2 разів
    epochs=30,          # Кількість епох навчання
    dm=1,               # Режим Distributed Memory (DM)
    workers=4           # Кількість потоків
)

# будуємо словник
model.build_vocab(tagged_data)

# навчаємо модель на даних
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=model.epochs
)

In [54]:
# зберігаємо та завантажуємо модель

model.save("doc2vec_model.model")
loaded_model = Doc2Vec.load("doc2vec_model.model")

In [57]:
# шукаємо схожі документи (5)

similar_docs = model.dv.most_similar('0', topn=5)
for doc_id, similarity in similar_docs:
    print(f"Doc {doc_id}: Similarity = {similarity:.2f}")
    print(data['text'][int(doc_id)][:100] + "...")  # Вивід уривка тексту

Doc 7312: Similarity = 0.69
Boycott Rethink After Ahern Apology The DUP was last night reconsidering its boycott of talks with t...
Doc 6292: Similarity = 0.69
SKorea's Hyundai Motor to join strike against labor reform bill (AFP) AFP - Workers at South Korea's...
Doc 867: Similarity = 0.68
Federal-Mogul May Sell Turner  amp; Newall Assets, Independent Says Federal-Mogul Corp., the bankrup...
Doc 7309: Similarity = 0.68
Vodafone Drops on Report It Supports Bid for Sprint (Update2) Shares in Vodafone Group Plc, the worl...
Doc 2906: Similarity = 0.67
Bush, Kerry Differ on Approach to North Korea  SEOUL (Reuters) - The determination of North Korea to...


In [58]:
# Визначаємо вектор для нового тексту

new_text = "New example text about wildfires and climate change"
new_vector = model.infer_vector(simple_preprocess(new_text))
print(new_vector)

[-0.38286534 -0.01745132  0.36554155 -0.08339477  0.18552297 -0.21808296
  0.02624688  0.276265   -0.2699174   0.04848212  0.04570699 -0.07405192
 -0.02545081  0.09969936 -0.17319246 -0.36024502  0.19452754  0.02028139
 -0.3541433  -0.2425142   0.42800122  0.4070015  -0.01218341 -0.06962993
  0.18593952  0.17270583 -0.05057585  0.05607817 -0.02047254 -0.1771728
  0.3346613   0.13670565  0.08814178  0.11999455 -0.19903794  0.20383206
  0.20657356 -0.11939052 -0.29126722 -0.21660599 -0.13425359 -0.08689398
 -0.03980897  0.04431986  0.12863876 -0.32491133 -0.1935405  -0.34181622
 -0.01381759  0.21859188 -0.0831366  -0.088429   -0.10675119 -0.01345019
 -0.19389752  0.14209256  0.05664982  0.05360454 -0.12095398  0.01051721
  0.18500876  0.33576483 -0.26584503 -0.28744158  0.05200316  0.06103697
  0.03993298  0.00262939 -0.18682721  0.06208413 -0.30973023  0.46853995
  0.02071415 -0.05862995  0.31129003  0.09843455  0.21396382 -0.19515827
 -0.18033326 -0.11583228  0.04850967  0.27286905 -0.

In [None]:
file = "test-00000-of-00001 (1).parquet"
data = pd.read_parquet(file)
data

#### Task 1. Practice finding similar documents/articles/posts. Assess validity of the model.

Практика пошуку схожих документів/статей/постів. Оцінка точності моделі.

In [73]:
# завантажуємо навчальну модель

from gensim.models import Doc2Vec

model = Doc2Vec.load("doc2vec_model.model")

In [60]:
# пошук схожих документів до першого тексту в датасеті 

similar_docs = model.dv.most_similar("0", topn=5)  

print(f"Оригінальний документ (ID 0):\n{data['text'][0][:200]}...\n")  # Вивід уривка
print("Схожі документи:")
for doc_id, similarity in similar_docs:
    print(f"\nID: {doc_id}, Similarity: {similarity:.3f}")
    print(data['text'][int(doc_id)][:200] + "...")  # Вивід уривка схожого тексту

Оригінальний документ (ID 0):
Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul....

Схожі документи:

ID: 7312, Similarity: 0.695
Boycott Rethink After Ahern Apology The DUP was last night reconsidering its boycott of talks with the Irish government after Taoiseach Bertie Ahern apologised to party leader Ian Paisley....

ID: 6292, Similarity: 0.689
SKorea's Hyundai Motor to join strike against labor reform bill (AFP) AFP - Workers at South Korea's largest automaker Hyundai Motor will go on strike Friday to oppose proposed government labor reform...

ID: 867, Similarity: 0.680
Federal-Mogul May Sell Turner  amp; Newall Assets, Independent Says Federal-Mogul Corp., the bankrupt US engineering company, may sell its UK-based Turner  amp; Newall Plc after the UK division #39;s ...

ID: 7309, Similarity: 0.679
Vodafone Drops on Report It Supports Bid for Sprint (Update2) Shares in 

In [66]:
# Створюємо вектор для довільного тексту та знаходимо документи з датасету, близькі до нього:

new_text = "Climate change increases wildfires risks around the world"
inferred_vector = model.infer_vector(simple_preprocess(new_text))
similar_docs = model.dv.most_similar([inferred_vector], topn=3)

print(f"\nТекст: '{new_text}'\n")
for doc_id, similarity in similar_docs:
    print(f"ID: {doc_id}, Similarity: {similarity:.3f}")
    print(data['text'][int(doc_id)][:200] + "...")


Текст: 'Climate change increases wildfires risks around the world'

ID: 93, Similarity: 0.821
Fund pessimism grows NEW YORK (CNN/Money) - Money managers are growing more pessimistic about the economy, corporate profits and US stock market returns, according to a monthly survey by Merrill Lynch...
ID: 5836, Similarity: 0.802
Stronger Sales Boost JC Penney Earnings (Reuters) Reuters - Department store operator J.C. Penney\Co. Inc.  on Tuesday said third-quarter profit rose 86.3\percent, helped by stronger sales and fewer m...
ID: 5842, Similarity: 0.799
Stocks Open Lower; Inflation Data Weighs  NEW YORK (Reuters) - U.S. stocks opened lower on Tuesday  after a government report showing a much larger-than-expected  rise in U.S. producer prices in Octob...


In [67]:
# Якщо датасет містить мітки (наприклад, label=3), перевіримо, чи схожі документи належать до тієї ж категорії:

original_label = data['label'][0]  
print(f"\nМітка оригінального документу (ID 0): {original_label}")

for doc_id, similarity in similar_docs:
    doc_label = data['label'][int(doc_id)]
    print(f"ID {doc_id}: Мітка {doc_label}, Схожість {similarity:.3f}")


Мітка оригінального документу (ID 0): 2
ID 93: Мітка 2, Схожість 0.821
ID 5836: Мітка 2, Схожість 0.802
ID 5842: Мітка 2, Схожість 0.799


In [68]:
# Рахуємо, який відсоток схожих документів має ту саму мітку:
matches = 0
total = len(similar_docs)

for doc_id, _ in similar_docs:
    if data['label'][int(doc_id)] == original_label:
        matches += 1

print(f"\nВідсоток документів з тією ж міткою: {matches/total:.1%}")


Відсоток документів з тією ж міткою: 100.0%


In [75]:
from sklearn.ensemble import RandomForestClassifier

# Вектори документів
X = [model.dv[str(i)] for i in range(len(data))]
y = data['label']

# Навчання класифікатора
clf = RandomForestClassifier()
clf.fit(X, y)
print(clf) 

RandomForestClassifier()
