<a href="https://colab.research.google.com/github/AUT-Student/NLP-HW2/blob/main/NLP_HW2_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
!pip install gensim --upgrade

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 77.0 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


In [41]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, TfidfModel, Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from gensim import corpora

# Dataset

In [3]:
# Fetch the training CSV from Google Drive by file ID; per the log below it is
# saved as /content/train.csv.
!gdown 1-86CqCHek-U1iH5nW30RfnFU0PYmdhKB

Downloading...
From: https://drive.google.com/uc?id=1-86CqCHek-U1iH5nW30RfnFU0PYmdhKB
To: /content/train.csv
100% 20.1M/20.1M [00:00<00:00, 73.9MB/s]


In [4]:
# Fetch the test CSV from Google Drive by file ID; per the log below it is
# saved as /content/test.csv.
!gdown 1YzRlYyye_KoEw7_q9NARiCwl3Cn-EH3J

Downloading...
From: https://drive.google.com/uc?id=1YzRlYyye_KoEw7_q9NARiCwl3Cn-EH3J
To: /content/test.csv
  0% 0.00/2.03M [00:00<?, ?B/s]100% 2.03M/2.03M [00:00<00:00, 127MB/s]


In [5]:
# Load the two downloaded CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv("/content/train.csv"),
    pd.read_csv("/content/test.csv"),
)

# Word Representation

In [6]:
# Pull the raw text of the "article" column out of the training frame as an array.
train_sentences = train_data.loc[:, "article"].values

In [7]:
# Tokenise every article on whitespace.
# The tokens are derived directly from `train_data` rather than from the previous value
# of `train_sentences`, so this cell is idempotent: re-running it no longer tries to call
# `.split()` on lists that were already tokenised by an earlier run.
train_sentences = [article.split() for article in train_data["article"].values]

In [8]:
# Train 300-dimensional word embeddings on the tokenised training articles.
word2vec_model = Word2Vec(
    sentences=train_sentences,
    vector_size=300,
    min_count=5,  # words seen fewer than 5 times get no vector
    sg=1,         # 1 selects skip-gram in gensim (0 would be CBOW)
    epochs=5,
)

In [20]:
# Sanity check of the embeddings: similarity score between two related words
# ("Afghanistan" and "Taliban").
word2vec_model.wv.similarity(w1="افغانستان", w2="طالبان")

0.6838624

# Document Representation

## TF-IDF

In [9]:
# Start from an empty token <-> id mapping; it is populated by the doc2bow pass
# over the training sentences in the next cell.
dictionary = corpora.Dictionary(documents=None)

In [10]:
# Convert every tokenised article to bag-of-words form. `allow_update=True` makes
# doc2bow register previously unseen tokens in `dictionary` as a side effect.
train_sentences_bow = [
    dictionary.doc2bow(tokens, allow_update=True)
    for tokens in train_sentences
]

In [11]:
# Fit the TF-IDF weighting on the bag-of-words training corpus.
tfidf_model = TfidfModel(corpus=train_sentences_bow)

## Word2Vec & TF-IDF

In [23]:
class Word2VecTfIdf():
  """Embed a text as the TF-IDF-weighted average of its Word2Vec word vectors.

  Args:
    word2vec_model: object exposing `vector_size` and a `wv` mapping that supports
      `word in wv` and `wv[word]` (e.g. a trained gensim `Word2Vec`).
    tfidf_model: object such that `tfidf_model[bow]` yields `(token_id, weight)` pairs.
    dictionary: object providing `doc2bow(tokens)` and `dictionary[token_id] -> word`.
  """

  def __init__(self, word2vec_model, tfidf_model, dictionary):
    self.word2vec_model = word2vec_model
    self.tfidf_model = tfidf_model
    self.dictionary = dictionary

  def text_vector(self, text):
    """Return the weighted-average embedding of `text`.

    The text is split on whitespace, converted to bag-of-words with the stored
    dictionary and weighted by the stored TF-IDF model. Each weighted word that has
    a Word2Vec vector contributes `weight * vector`; the sum is divided by the sum
    of the contributing weights.

    Args:
      text: whitespace-separated string.

    Returns:
      A numpy array of length `word2vec_model.vector_size`. If no word of `text`
      contributes (empty text, or every word unknown to the dictionary / Word2Vec
      vocabulary), the zero vector is returned.
    """
    text_words = text.split()
    text_bow = self.dictionary.doc2bow(text_words)
    # Use the model stored on this instance, not a global that happens to be
    # called `tfidf_model`.
    tfidf_output = self.tfidf_model[text_bow]

    word_vectors = self.word2vec_model.wv
    sum_weight = 0.0
    sum_vector = np.zeros(self.word2vec_model.vector_size)

    for token_id, weight in tfidf_output:
      word = self.dictionary[token_id]
      # The dictionary can contain words that Word2Vec has no vector for
      # (e.g. pruned by `min_count`); skip them instead of raising KeyError.
      if word not in word_vectors:
        continue

      sum_vector += weight * word_vectors[word]
      sum_weight += weight

    # Nothing contributed: avoid dividing by zero and return the zero vector.
    if sum_weight == 0:
      return sum_vector

    return sum_vector / sum_weight

In [24]:
# Combine the trained embeddings, the TF-IDF weights and the shared dictionary
# into one document-vector model.
word2vec_tfidf_model = Word2VecTfIdf(word2vec_model, tfidf_model, dictionary)

In [25]:
# Demo: embed a short sample sentence with the TF-IDF-weighted Word2Vec model.
word2vec_tfidf_model.text_vector(text="سلام بر تهران")

array([ 0.07165736, -0.16287823,  0.18894456, -0.08975858,  0.1851432 ,
       -0.06206486, -0.2238442 ,  0.15303521, -0.19987583,  0.03111687,
       -0.12693394, -0.36220636, -0.30669092, -0.00508878, -0.18143595,
       -0.10455818,  0.18004482, -0.16930764,  0.20204514, -0.01084684,
       -0.13017184,  0.27933639, -0.13327998,  0.10063555,  0.14042927,
        0.13684876, -0.10849461, -0.01686775,  0.05600927, -0.13302501,
        0.14098935,  0.06829945,  0.1010519 ,  0.20734795, -0.15859542,
       -0.06175493, -0.23780326, -0.1062687 , -0.20102134, -0.05054218,
       -0.27875928, -0.26399201, -0.01806605,  0.13169912,  0.00502291,
        0.25698807,  0.095621  ,  0.33086066, -0.12244385,  0.24503682,
       -0.00777197, -0.05105306, -0.05346265,  0.00223424, -0.07765244,
        0.19360891,  0.06265177, -0.03979613, -0.12131742, -0.1281465 ,
       -0.20557971, -0.03306543, -0.10176163,  0.15945475,  0.05925942,
       -0.04444083, -0.06707236,  0.19054741, -0.0592723 , -0.17

## Doc2Vec

In [42]:
# Wrap each tokenised article in a TaggedDocument, tagged with its position
# in `train_sentences`.
train_sentences_tagged = [
    TaggedDocument(words=tokens, tags=[doc_index])
    for doc_index, tokens in enumerate(train_sentences)
]

In [53]:
# Train 300-dimensional document embeddings on the tagged training articles.
doc2vec_model = Doc2Vec(
    documents=train_sentences_tagged,
    vector_size=300,
    epochs=5,
)

In [54]:
# Demo: infer a document vector for a short, whitespace-tokenised sample sentence.
doc2vec_model.infer_vector(doc_words="سلام بر تهران".split())

array([-2.27902783e-03, -1.10739460e-02, -1.37836309e-02,  1.03726555e-02,
       -9.29422397e-03,  1.22267578e-03, -3.13767381e-02, -1.26775140e-02,
       -5.06480131e-03, -1.01340720e-02,  9.76915285e-03,  1.38384094e-02,
        5.90620935e-03,  2.20404076e-03, -3.33695151e-02, -3.38838920e-02,
       -2.58058752e-03,  2.04575248e-02, -2.24750768e-02, -9.80483927e-03,
       -1.97115783e-02,  1.38844512e-02,  6.85863383e-03,  1.44319097e-03,
       -8.63018818e-03, -1.31951254e-02, -8.21294452e-05, -1.99835263e-02,
       -3.38190002e-03,  1.38525646e-02, -8.20886996e-03,  2.52831355e-02,
        1.31483078e-02, -2.89827934e-03, -2.25492567e-02,  2.54936572e-02,
       -6.71587419e-03, -4.43829112e-02,  3.61410901e-03, -1.06360379e-03,
       -1.50249656e-02, -4.56468575e-02, -5.71803888e-03, -3.24600399e-03,
       -8.10267311e-03, -5.57684712e-03, -1.24887107e-02, -1.16336476e-02,
       -2.61588003e-02, -3.15562040e-02,  4.41327738e-03,  2.17832979e-02,
       -2.41176784e-02, -