[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110XwsAvyjjGKVjsenR98r4AoFohbFSZH)


## Preparing Dataset

*NOTICE: fetching Dataset is not working colab (due to sanctions or what) so I provided the corpus alongside the notebook. You can either fetch dataset using colab via connecting it to a local machine or simply use the corpus.txt .*

### Method 1: Fetching Dataset

#### Importing Libraries

In [None]:
import requests
from bs4 import BeautifulSoup

#### Setting URL

In [None]:
URL = "https://ganjoor.net/moulavi/shams/ghazalsh/sh"
ghazalha = range(1501, 2001)
corpus = ""

##### Webscraping Data From Ganjoor

In [None]:
for ghazal in ghazalha:
    page = requests.get(URL+str(ghazal))

    soup = BeautifulSoup(page.content, "html.parser")

    results = soup.find(id="garticle")

    beitha = results.find_all("div", class_="b")

    print("غزل شماره " + str(ghazal))
    for beit in beitha:
        mesra1 = beit.find("div", class_="m1")
        mesra2 = beit.find("div", class_="m2")
        print(mesra1.text)
        corpus += mesra1.text + " "
        print(mesra2.text)
        corpus += mesra2.text + " "
        print()
    print()

#### Saving Data in a Text File (optional)

In [None]:
text_file = open("corpus.txt", "w", encoding="utf-8")
text_file.write(corpus)
text_file.close()

In [None]:
corpus

### Method 2: Use pre-made dataset

Downloading corpus and stopwords from github

In [1]:
!curl -o corpus.txt https://raw.githubusercontent.com/AmirHajimohamadi/nlp/master/corpus.txt

with open('corpus.txt', 'r') as file:
    corpus = file.read().rstrip()
file.close()


!curl -o stopwords.txt https://raw.githubusercontent.com/AmirHajimohamadi/nlp/master/stopwords.txt

with open('stopwords.txt', 'r') as file:
    stopwords = file.read().rstrip()
file.close()


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  585k  100  585k    0     0   930k      0 --:--:-- --:--:-- --:--:--  929k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  6131  100  6131    0     0  36064      0 --:--:-- --:--:-- --:--:-- 36064


### Preprocessing the corpus

In [2]:
import numpy as np
from tqdm import tqdm

In [3]:
stopwords = stopwords.replace("\n", " ")
stopwords = stopwords.split()

In [4]:
text = corpus.split()
text = [w.replace('\u200c', '') for w in text]

Removing stopwords from the text

In [5]:
text = [w for w in text if not w in stopwords]

### Declare some variables

In [6]:
# Defining the window for context
window = 2
# Creating a placeholder for the scanning of the word list
word_lists = []

getting context words and main words:

In [7]:
for i, word in enumerate(text):
  for w in range(window):
    # Getting the context that is ahead by *window* words
    if i + 1 + w < len(text): 
      word_lists.append([word] + [text[(i + 1 + w)]])
    # Getting the context that is behind by *window* words
    if i - w - 1 >= 0:
      word_lists.append([word] + [text[(i - w - 1)]])

A dictionary of unique words in corpus:

In [8]:
def create_unique_word_dict(text:list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })

    return unique_word_dict    

### Cooking the model food

In [9]:
unique_word_dict = create_unique_word_dict(text)

# Defining the number of features (unique words)
n_words = len(unique_word_dict)

# Getting all the unique words 
words = list(unique_word_dict.keys())

# Creating the X and Y matrices using one hot encoding
X = []
Y = []

Making X and Y matrices

In [10]:
for i, word_list in tqdm(enumerate(word_lists)):
    # Getting the indices
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])

    # Creating the placeholders   
    X_row = np.zeros(n_words, dtype=np.int8)
    Y_row = np.zeros(n_words, dtype=np.int8)

    # One hot encoding the main word
    X_row[main_word_index] = 1

    # One hot encoding the Y matrix words 
    Y_row[context_word_index] = 1

    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)


221942it [00:04, 47215.00it/s]


In [11]:
X = np.array(X, dtype=np.int8)
Y = np.array(Y, dtype=np.int8)

### Training the Model

In [12]:
from keras.models import Input, Model
from keras.layers import Dense

# Defining the size of the embedding (dimensions)
embed_size = 100

# Defining the neural network
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics=['categorical_accuracy'])

In [13]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10944)]           0         
                                                                 
 dense (Dense)               (None, 100)               1094500   
                                                                 
 dense_1 (Dense)             (None, 10944)             1105344   
                                                                 
Total params: 2,199,844
Trainable params: 2,199,844
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Optimizing the network weights
model.fit(
    x=X, 
    y=Y, 
    batch_size=512,
    epochs=50
    )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ff13a3d6350>

In [15]:
model.save("network.h5")

In [None]:
# from keras.models import load_model
# loaded_model = load_model("network.h5")
# loss, accuracy = loaded_model.evaluate(test_data, test_targets)

In [16]:
# Obtaining the weights from the neural network. 
# These are the so called word embeddings

# The input layer 
weights = model.get_weights()[0]

# Creating a dictionary to store the embeddings in. The key is a unique word and 
# the value is the numeric vector
embedding_dict = {}
for word in words: 
    embedding_dict.update({
        word: weights[unique_word_dict.get(word)]
        })

This function will create a sorted dictionary of most similar vectors(words) with the value of similaritis.

NOTICE: The spatial.cosine.distance() function from the scipy module calculates the distance instead of the cosine similarity, but to achieve that, we can subtract the value of the distance from 1.

In [17]:
from scipy import spatial

def near_words(word):
  word_dimensions = weights[unique_word_dict.get(word)]
  temp_dict = embedding_dict.copy()
  for word in words: 
      temp_dict.update({
          word: (1 - spatial.distance.cosine(word_dimensions , weights[unique_word_dict.get(word)]))
          })
  sorted_near_words = sorted(temp_dict.items(), key=lambda kv: kv[1], reverse = True)
  return sorted_near_words

In [32]:
near_words("دشمن")

[('دشمن', 1.0),
 ('دوستکام', 0.500657856464386),
 ('پرمهر', 0.4304129183292389),
 ('رهیم', 0.4125579297542572),
 ('دشمنم', 0.41054874658584595),
 ('سلامت', 0.40794193744659424),
 ('ضد', 0.4078054130077362),
 ('رقیب', 0.3997301161289215),
 ('چاشتم', 0.3961251676082611),
 ('کلی', 0.3872118592262268),
 ('داروم', 0.3695327341556549),
 ('لقمان', 0.36836540699005127),
 ('قلتبان', 0.36455094814300537),
 ('پیغامبران', 0.3610052168369293),
 ('ولیک', 0.35999372601509094),
 ('بمیرد', 0.35827508568763733),
 ('دلتنگ', 0.345121294260025),
 ('عجبی', 0.3450860381126404),
 ('ذوفنونم', 0.34433621168136597),
 ('زهری', 0.3406078517436981),
 ('تعب', 0.3402869999408722),
 ('ندانستم', 0.3402697741985321),
 ('خال', 0.33894842863082886),
 ('غمهایی', 0.33642899990081787),
 ('مبر', 0.33570724725723267),
 ('بامش', 0.3301372230052948),
 ('بردهست', 0.32786428928375244),
 ('بیدهان', 0.3245082199573517),
 ('فسوس', 0.3228190839290619),
 ('پربند', 0.3227880597114563),
 ('بازیهای', 0.3227103650569916),
 ('مهرهای', 0.321