In [70]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
from nltk.stem.isri import ISRIStemmer

# Select the compute device once; later cells place the model and tensors on `device`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Report which backend was selected.
print("Using CUDA" if device.type == "cuda" else "Using CPU")

Using CUDA


In [71]:
# Shared NLTK ISRI stemmer instance; later cells use it to stem the corpus and query words.
stemmer = ISRIStemmer()
# Quick sanity check: stem one sample word and show the result.
w = "انتزاعا"
stemmed_w = stemmer.stem(w)
print(stemmed_w)

نزع


In [72]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Embeds every context index, sums the embeddings over dimension 1,
    projects the sum onto the vocabulary with a linear layer, and applies
    a log-softmax over the last dimension.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the embedding table, the output projection and the log-softmax.

        Args:
            vocab_size: number of rows in the embedding table and number of
                output classes.
            embedding_size: dimensionality of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.linear = nn.Linear(in_features=embedding_size, out_features=vocab_size)
        self.activation = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each row of ``inputs``.

        Args:
            inputs: integer index tensor; assumed to be (batch, context_size)
                since the embeddings are summed over dim 1 -- confirm with caller.
        """
        context_vectors = self.embeddings(inputs)
        # Collapse the context dimension into a single bag-of-words vector.
        pooled = context_vectors.sum(dim=1)
        return self.activation(self.linear(pooled))

In [73]:
class CBOWDataset(Dataset):
    """Map-style dataset yielding ``(context, center_word)`` pairs for CBOW.

    Sentences are kept separate (one index tensor per sentence), so a context
    window never crosses a sentence boundary. Positions that fall outside the
    sentence are filled with the ``'<S>'`` index on the left and the ``'</S>'``
    index on the right, so every context has ``2 * window_size`` entries.

    Args:
        text: iterable of sentences, each an iterable of word strings.
        word2idx: mapping from word to integer index. Must contain ``'<S>'``
            and ``'</S>'`` (looked up when padding is needed) and ``'<UNK>'``
            (looked up when a word is missing from the mapping).
        window_size: number of context words taken on each side of the center.
        lengths: optional cumulative sum of sentence lengths (``lengths[k]`` is
            the number of words in sentences ``0..k``). When omitted it is
            computed from ``text``, which keeps the two consistent.
        device: optional ``torch.device`` on which tensors are created. When
            omitted, CUDA is used if available, otherwise the CPU.
    """

    def __init__(self, text, word2idx, window_size, lengths=None, device=None):
        super().__init__()
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        # Keep our own reference so __getitem__ does not depend on a notebook global.
        self.word2idx = word2idx
        self.window_size = window_size
        # keep text_encoded 2d: one 1-D index tensor per sentence.
        # dtype is forced to long so that empty sentences do not become float tensors.
        self.text_encoded = [
            torch.tensor(
                [word2idx[word] if word in word2idx else word2idx['<UNK>'] for word in sentence],
                dtype=torch.long,
                device=device,
            )
            for sentence in text
        ]
        if lengths is None:
            lengths = np.cumsum([len(sentence) for sentence in self.text_encoded])
        self.lengths = np.asarray(lengths)

    def _padding(self, token, count):
        """Return ``count`` copies of ``word2idx[token]`` as an index tensor on ``self.device``."""
        return torch.tensor([self.word2idx[token]] * count, device=self.device)

    def __getitem__(self, idx):
        """Return ``(context, center_word)`` for the ``idx``-th word of the corpus.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        idx = int(idx)
        total = len(self)
        if idx < 0:
            idx += total
        if not 0 <= idx < total:
            raise IndexError("CBOWDataset index out of range")

        # side='right' skips over empty sentences, whose cumulative length
        # equals that of the preceding sentence.
        sentence_idx = int(np.searchsorted(self.lengths, idx, side='right'))
        if sentence_idx > 0:
            # Convert the corpus-wide position into a position inside the sentence.
            idx -= int(self.lengths[sentence_idx - 1])
        sentence = self.text_encoded[sentence_idx]

        center_word = sentence[idx]
        start_idx = max(idx - self.window_size, 0)
        before_context = sentence[start_idx:idx]
        after_context = sentence[idx + 1:idx + self.window_size + 1]

        missing_before = self.window_size - len(before_context)
        if missing_before > 0:
            before_context = torch.cat((self._padding('<S>', missing_before), before_context))
        missing_after = self.window_size - len(after_context)
        if missing_after > 0:
            after_context = torch.cat((after_context, self._padding('</S>', missing_after)))

        context = torch.cat((before_context, after_context))
        return context, center_word

    def __len__(self):
        """Total number of words in the corpus (last cumulative length), 0 when empty."""
        return int(self.lengths[-1]) if len(self.lengths) else 0

In [74]:
def get_max_len(text):
    """Return the 99th percentile of sentence lengths, measured in whitespace-separated words.

    Args:
        text: iterable of sentence strings.

    Returns:
        The value of ``np.percentile`` at q=99 over the per-sentence word counts,
        i.e. a length that covers 99% of the sentences.
    """
    word_counts = []
    for sentence in text:
        word_counts.append(len(sentence.split()))
    return np.percentile(word_counts, q=99)

In [75]:
# Read the cleaned corpus; it is split on newlines below, one sentence per line.
with open("../clean_out/merged_unsplited.txt", "r", encoding="utf8") as f:
    text = f.read()

# Strip Arabic commas and hyphens. Note they are removed outright (replaced by
# the empty string), not replaced by a space.
for punctuation in ("،", "-"):
    text = text.replace(punctuation, "")

# Optional (currently disabled): sentences longer than get_max_len(...) could be
# dropped at this point, before the lines are split into words.

# Split into sentences (one per line), then each sentence into words.
text = [line.split() for line in text.split("\n")]

# Prefix sum of sentence lengths: lengths[k] is the number of words in
# sentences 0..k. Stemming below maps each word to one stem, so the counts
# stay valid.
lengths = np.cumsum([len(sentence) for sentence in text])

# Stem every word, keeping the sentence structure.
text = [[stemmer.stem(word) for word in sentence] for sentence in text]

# Vocabulary: every stem seen in the corpus plus the three special tokens.
all_words = [word for sentence in text for word in sentence]
vocab = set(all_words + ["<S>", "</S>", "<UNK>"])
vocab_size = len(vocab)

# Hyperparameters
embedding_size = 256
window_size = 4
batch_size = 64
num_epochs = 5
print("Vocab size: ", vocab_size)

Vocab size:  15176


In [76]:
# Show the first five stemmed, tokenised sentences as a sanity check.
print(text[:5])

[['قول', 'او', 'قطع', 'اول', 'يده', 'الخ', 'قال', 'زركش'], ['ابن', 'عرف', 'قول', 'لفظ', 'قضي', 'كإنكار', 'غير', 'حدث', 'سلم', 'وجب', 'ما', 'علم', 'وجب', 'من', 'دين', 'ضرر', 'كإلقاء', 'صحف', 'قذر', 'وشد', 'زنر', 'ابن', 'عرف', 'قول', 'ابن', 'شاس', 'او', 'فعل', 'تضم', 'هو', 'كلبس', 'زنر', 'إلقاء', 'صحف', 'في', 'صرح', 'نجس', 'سجد', 'صنم', 'نحو', 'ذلك', 'سحر', 'حمد', 'قول', 'الك', 'أصحاب', 'ان', 'سحر', 'كفر', 'بلل', 'على', 'قال', 'الك', 'هو', 'زنديق', 'اذا', 'عمل', 'سحر', 'نفس', 'قتل', 'ولم', 'ستب'], ['قول', 'عدم', 'ما', 'علق', 'الخ', 'اي', 'وصة', 'قول', 'ما', 'مر', 'اي', 'قبل', 'قول', 'متن', 'لغت', 'ولو', 'قصر', 'على', 'اوص', 'له', 'بشة', 'او', 'عطو', 'شاة', 'ولا', 'غنم', 'له', 'عند', 'موت', 'هل', 'بطل', 'وصة', 'او', 'شرى', 'له', 'شاة', 'ؤخذ', 'من', 'قول', 'اتي', 'كما', 'لو', 'لم', 'يقل', 'من', 'الي', 'ولا', 'من', 'غنم', 'انه', 'لا', 'بطل', 'عبر', 'كنز', 'ولو', 'لم', 'يقل', 'من', 'الي', 'ولا', 'من', 'غنم', 'لم', 'يتع', 'غنم', 'ان', 'كانت', 'نهت', 'ا', 'ه', 'سم', 'قول', 'عطى', 'وحد', 'منها'

In [77]:
# Number the vocabulary in sorted order. Iterating the set directly would give
# an order that changes between interpreter runs (string hash randomisation),
# so embeddings saved to disk would stop matching word2idx after a restart.
word2idx = {word: i for i, word in enumerate(sorted(vocab))}
# Derive the inverse from word2idx so the two mappings cannot disagree.
idx2word = {i: word for word, i in word2idx.items()}

# One sample per corpus word; batches are drawn in random order.
dataset = CBOWDataset(text, word2idx, window_size, lengths)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [78]:
model = CBOW(vocab_size, embedding_size).to(device)
# CBOW.forward already ends in LogSoftmax, so its output is log-probabilities.
# NLLLoss is the criterion that consumes log-probabilities directly;
# CrossEntropyLoss expects raw logits and would apply log-softmax a second time.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [79]:
# Train for num_epochs passes over the dataloader, logging the loss every 100 batches.
batches_per_epoch = len(dataloader)
for epoch in range(num_epochs):
    torch.cuda.empty_cache()  # Clear CUDA cache
    for step, (context_batch, target_batch) in enumerate(dataloader):
        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()

        predictions = model(context_batch)
        loss = criterion(predictions, target_batch)

        loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue
        print("Epoch: %d, Iteration: %d, Loss: %0.4f out of %d" % (epoch, step, loss.item(), batches_per_epoch))


Epoch: 0, Iteration: 0, Loss: 11.2063 out of 34501
Epoch: 0, Iteration: 100, Loss: 10.4997 out of 34501
Epoch: 0, Iteration: 200, Loss: 10.1083 out of 34501
Epoch: 0, Iteration: 300, Loss: 9.8875 out of 34501
Epoch: 0, Iteration: 400, Loss: 8.9709 out of 34501
Epoch: 0, Iteration: 500, Loss: 8.7906 out of 34501
Epoch: 0, Iteration: 600, Loss: 8.8597 out of 34501
Epoch: 0, Iteration: 700, Loss: 8.1761 out of 34501
Epoch: 0, Iteration: 800, Loss: 8.5293 out of 34501
Epoch: 0, Iteration: 900, Loss: 8.2501 out of 34501
Epoch: 0, Iteration: 1000, Loss: 8.3717 out of 34501
Epoch: 0, Iteration: 1100, Loss: 8.1494 out of 34501
Epoch: 0, Iteration: 1200, Loss: 8.1091 out of 34501
Epoch: 0, Iteration: 1300, Loss: 7.9745 out of 34501
Epoch: 0, Iteration: 1400, Loss: 7.6085 out of 34501
Epoch: 0, Iteration: 1500, Loss: 8.1451 out of 34501
Epoch: 0, Iteration: 1600, Loss: 7.4591 out of 34501
Epoch: 0, Iteration: 1700, Loss: 7.4384 out of 34501
Epoch: 0, Iteration: 1800, Loss: 8.0958 out of 34501
Ep

In [80]:
# Copy the learned embedding table to host memory as a NumPy array and persist it.
embedding_weights = model.embeddings.weight.detach().cpu().numpy()
np.save("../embedding/embedding_weights.npy", embedding_weights)

In [81]:
# Reload the embedding matrix from disk; rows are indexed through word2idx in the cells below.
embedding_weights = np.load("../embedding/embedding_weights.npy")

In [82]:

def get_word(word):
    """Return the row of ``embedding_weights`` for ``word``.

    The row is looked up through the global ``word2idx``; a word missing from
    that mapping raises ``KeyError``.
    """
    row = word2idx[word]
    return embedding_weights[row]


def get_closest_word(word, n=5):
    """Return the ``n`` vocabulary words whose embeddings are nearest to ``word``.

    Distances are Euclidean, computed over the rows of the global
    ``embedding_weights``. A word missing from the global ``word2idx`` is
    replaced by ``"<UNK>"``.

    Args:
        word: query word.
        n: maximum number of neighbours to return.

    Returns:
        A list of ``(word, distance)`` tuples sorted by increasing distance.
        The query word itself is never included; ties keep index order.
    """
    if word not in word2idx:
        word = "<UNK>"
    query_idx = word2idx[word]
    # One vectorised norm over all rows instead of a Python-level loop.
    distances = np.linalg.norm(embedding_weights - embedding_weights[query_idx], axis=1)
    order = np.argsort(distances, kind="stable")
    # Drop the query by index rather than assuming it sorts first: another word
    # with an identical vector would tie at distance 0 and could precede it.
    order = order[order != query_idx][:max(n, 0)]
    return [(idx2word[int(i)], distances[i]) for i in order]


In [83]:
# Nearest neighbours of one stemmed sample word.
print(get_closest_word(stemmer.stem("محمد")))
# Raw embedding vector of another stemmed sample word, followed by its dimensionality.
sample_vector = embedding_weights[word2idx[stemmer.stem("قال")]]
print(sample_vector)
print(len(sample_vector))

[('<S>', 19.14136), ('وإبراهيمي', 19.229258), ('واستخلفوه', 19.294115), ('خلن', 19.361084), ('كخر', 19.361275)]
[-0.18959051 -2.1454513   0.7641336  -0.4598308  -0.46626946 -1.7408899
  1.412628   -0.1941928   1.0924621   0.7659512   0.03724959 -3.0771925
  0.9877765  -0.65143436 -1.3068407   0.42978346 -0.1187763  -0.07850742
  1.118411    0.6922342   0.32528445 -0.37620157  1.2121952   0.33371866
  0.30933225  1.9825925   0.22914082 -2.2104561  -0.9881058  -0.34485593
 -0.0082653   1.4057741  -0.04628037 -1.4537004  -0.18389115 -0.25951153
  1.8606279  -0.61686176 -0.6202417   1.0258499  -1.1742886   1.0307592
 -0.65336555  1.2712129  -0.59796566 -0.26553568  0.54542625 -0.11565356
  0.34985822 -0.03302048 -1.0535616   0.15973468 -0.5401474  -0.40274113
 -0.91158545 -1.3131809  -0.5996838   0.03202199  0.24246398 -1.4796752
  0.7466496   0.49126765 -0.5448028   0.9052195  -0.04733211  0.36092284
 -0.7268986  -0.86234206 -0.47963944  2.1093397   0.6541289  -0.7741962
  0.2521864   2.0