<a href="https://colab.research.google.com/github/AUT-Student/NLP-HW2/blob/main/NLP_HW2_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
!pip install gensim --upgrade

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 77.0 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


In [66]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec, TfidfModel, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim import corpora

from sklearn.metrics.pairwise import cosine_similarity

# Dataset

In [3]:
# Download the training split from Google Drive; gdown saves it as /content/train.csv.
!gdown 1-86CqCHek-U1iH5nW30RfnFU0PYmdhKB

Downloading...
From: https://drive.google.com/uc?id=1-86CqCHek-U1iH5nW30RfnFU0PYmdhKB
To: /content/train.csv
100% 20.1M/20.1M [00:00<00:00, 73.9MB/s]


In [4]:
# Download the test split from Google Drive; gdown saves it as /content/test.csv.
!gdown 1YzRlYyye_KoEw7_q9NARiCwl3Cn-EH3J

Downloading...
From: https://drive.google.com/uc?id=1YzRlYyye_KoEw7_q9NARiCwl3Cn-EH3J
To: /content/test.csv
  0% 0.00/2.03M [00:00<?, ?B/s]100% 2.03M/2.03M [00:00<00:00, 127MB/s]


In [5]:
# Load the two downloaded CSV splits into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

# Word Representation

In [6]:
# Raw article texts of the training split, one entry per row of train_data.
train_sentences = train_data["article"].values

In [7]:
# Whitespace-tokenise every training article.
# Read straight from the DataFrame instead of re-splitting `train_sentences` in
# place: the in-place version breaks when the cell is run a second time, because
# the items are then already lists and have no `.split()`.
train_sentences = [article.split() for article in train_data["article"].values]

In [65]:
# Train 300-dimensional skip-gram (sg=1) word vectors on the tokenised articles.
# min_count=1 keeps every token, so each word seen in training gets an embedding.
word2vec_model = Word2Vec(
    sentences=train_sentences,
    vector_size=300,
    sg=1,
    min_count=1,
    epochs=5,
)

# Document Representation

## TF-IDF

In [68]:
# Start from an empty gensim Dictionary; the token -> id mapping is filled in by
# the doc2bow(..., allow_update=True) calls in the following cell.
dictionary = corpora.Dictionary()

In [69]:
# Bag-of-words form of every training article; allow_update=True registers
# previously unseen tokens in `dictionary` while converting.
train_sentences_bow = [
    dictionary.doc2bow(tokens, allow_update=True)
    for tokens in train_sentences
]

In [70]:
# Fit TF-IDF weights on the bag-of-words training corpus.
tfidf_model = TfidfModel(corpus=train_sentences_bow)

## Word2Vec & TF-IDF

In [71]:
class Word2VecTfIdf():
  """Document embedding built as a TF-IDF-weighted average of word vectors.

  Args:
    word2vec_model: object exposing `vector_size` and a `wv` word -> vector mapping.
    tfidf_model: object that maps a bag-of-words list to `(token_id, weight)` pairs
      when indexed with it.
    dictionary: object providing `doc2bow(tokens)` and `dictionary[token_id] -> word`.
  """

  def __init__(self, word2vec_model, tfidf_model, dictionary):
    self.word2vec_model = word2vec_model
    self.tfidf_model = tfidf_model
    self.dictionary = dictionary

  def text_vector(self, text):
    """Return the TF-IDF-weighted mean word vector of `text`.

    The text is split on whitespace. Words that have no entry in the word-vector
    vocabulary are skipped. If no word contributes (empty text, or only unknown
    words), a zero vector of length `vector_size` is returned instead of dividing
    by zero.
    """
    text_words = text.split()
    text_bow = self.dictionary.doc2bow(text_words)
    # Use the model stored on the instance, not a notebook-level global.
    tfidf_output = self.tfidf_model[text_bow]

    word_vectors = self.word2vec_model.wv
    sum_weight = 0
    sum_vector = np.zeros(self.word2vec_model.vector_size)

    for token_id, weight in tfidf_output:
      word = self.dictionary[token_id]
      if word not in word_vectors:
        # No embedding for this word: leave it out of both sums.
        continue

      sum_vector += weight * word_vectors[word]
      sum_weight += weight

    if sum_weight == 0:
      # Nothing contributed; sum_vector is still all zeros.
      return sum_vector

    avg_vector = (1/sum_weight) * sum_vector
    return avg_vector

In [72]:
# Combine the trained word vectors, the TF-IDF weights and the shared dictionary
# into a single document embedder.
word2vec_tfidf_model = Word2VecTfIdf(word2vec_model, tfidf_model, dictionary)

## Doc2Vec

In [73]:
# Tag each tokenised article with its position in train_sentences; that integer
# is the key later used to look the document vector up.
train_sentences_tagged = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(train_sentences)
]

In [74]:
# Train 300-dimensional document vectors on the tagged training articles.
doc2vec_model = Doc2Vec(
    documents=train_sentences_tagged,
    vector_size=300,
    epochs=5,
)

# Similar Documents

In [76]:
# Two independent lookup tables: document id -> embedding, one per representation.
tfidf_vectors, doc2vec_vectors = {}, {}

In [77]:
# Fill both lookup tables, keyed by the document id from the CSV.
# The Doc2Vec tags were assigned positionally (enumerate over train_sentences), so
# the row *position* is the right key into `dv`; enumerate/zip gives exactly that
# regardless of the DataFrame's index labels, and avoids building a Series per row
# as iterrows() does.
for i, (doc_id, text) in enumerate(zip(train_data["id"], train_data["article"])):
  tfidf_vectors[doc_id] = word2vec_tfidf_model.text_vector(text)
  doc2vec_vectors[doc_id] = doc2vec_model.dv[i]

In [111]:
# Stack the training embeddings once (row order = order of train_data["id"]) so that
# every query needs a single cosine_similarity call per representation instead of
# one call per training document.
train_doc_ids = train_data["id"].values
train_tfidf_matrix = np.vstack([tfidf_vectors[doc_id] for doc_id in train_doc_ids])
train_doc2vec_matrix = np.vstack([doc2vec_vectors[doc_id] for doc_id in train_doc_ids])

# For each selected test document, report the most similar training document under
# both representations.
for test_doc_id in ["Doc1", "Doc3", "Doc5", "Doc25", "Doc36"]:
  print(f"Doc ID = {test_doc_id}")

  test_text = test_data[test_data["id"]==test_doc_id]["article"].values[0]

  print(f"Original Text\n {test_text}")

  test_tfidf_vector = word2vec_tfidf_model.text_vector(test_text)
  test_doc2vec_vector = doc2vec_model.infer_vector(test_text.split())

  # Result shape is (n_train, 1); keep the single column as a 1-D score array.
  tfidf_scores = cosine_similarity(train_tfidf_matrix, [test_tfidf_vector])[:, 0]
  doc2vec_scores = cosine_similarity(train_doc2vec_matrix, [test_doc2vec_vector])[:, 0]

  # argmax always yields a valid row (first maximum on ties), even when every
  # similarity is <= 0; a running best initialised to 0 would leave the id unset.
  best_tfidf_index = int(np.argmax(tfidf_scores))
  best_doc2vec_index = int(np.argmax(doc2vec_scores))

  best_tfidf_id = train_doc_ids[best_tfidf_index]
  best_doc2vec_id = train_doc_ids[best_doc2vec_index]

  # Plain floats, so the scores print as numbers rather than 1-element arrays.
  best_tfidf_score = float(tfidf_scores[best_tfidf_index])
  best_doc2vec_score = float(doc2vec_scores[best_doc2vec_index])

  predicted_text_tfidf = train_data[train_data["id"]==best_tfidf_id]["article"].values[0]
  predicted_text_doc2vec = train_data[train_data["id"]==best_doc2vec_id]["article"].values[0]

  print("\nWord2Vec & TF-IDF:")
  print(f"Predicted Doc ID = {best_tfidf_id}")
  print(f"Score = {best_tfidf_score}")
  print(f"Predicted Text = \n {predicted_text_tfidf}")

  print("\nDoc2Vec:")
  print(f"Predicted Doc ID = {best_doc2vec_id}")
  print(f"Score = {best_doc2vec_score}")
  print(f"Predicted Text = \n {predicted_text_doc2vec}")
  print("\n##################################################")

Doc ID = Doc1
Original Text
 # روزي اينجا شايد كاروانسرايي بود. # # يادداشتي بر مجموعه تلويزيوني « هتل » صغري آقااحمدي # 1 - از كاروانسرا تا هتل در گذشته : در عصر ماقبل رسانه ها و عصر ماقبل ارتباطات، يكي از راههاي ارتباط و انتقال انديشه و فرهنگ و زبان و نقل اخبار اتراق كردن در كاروانسراها و قهوه خانه هايي بود كه در مسير كاروانها ساخته مي شدند. پيشه وران، بازرگانان، اديبان، سخنوران، قاصدان و هنرمندان از جمله كساني بودند كه زمان استراحت در اين مكانها اغلب گرد هم جمع مي شدند و از رموز فن و علم و هنر و زبان همديگر آگاهي پيدا مي كردند و هر يك ره توشه اي از اين كالاي گرانبها با خود مي بردند. حال بعد از گذشت سالياني دراز، وقتي كاروانسراها و قهوه خانه ها تبديل به هتل هاي شيك و مسافرخانه هاي امروزي شد، ديگر هيچ حرفي و حركتي از آن ارتباط و انتقال انديشه و فرهنگ و زبان در كار نيست. اين مكانها مجموعه هاي دربسته اي شدند كه تنها براي دمي آساييدن مهيا شده اند و بس، مكانهايي با خوش ساخت ترين شكل و نما و سرويس هاي خدماتي آنچناني. 2 - هتل و چهره ها : مرضيه برومند در سريال « هتل » شايد در تلاش بوده كه با

# Similar Words