In [1]:
!pip install pyarabic



In [2]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge, Multiply
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence
import keras

import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf
from pyarabic import araby

Using TensorFlow backend.


In [3]:
!wget https://raw.githubusercontent.com/zaidalyafeai/ARBML/master/datasets/Wiki/wiki_ar.txt
wiki_path = 'wiki_ar.txt'

--2019-06-19 10:09:00--  https://raw.githubusercontent.com/zaidalyafeai/ARBML/master/datasets/Wiki/wiki_ar.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 59054425 (56M) [text/plain]
Saving to: ‘wiki_ar.txt.1’


2019-06-19 10:09:01 (214 MB/s) - ‘wiki_ar.txt.1’ saved [59054425/59054425]



## Stop Words

Remove stope words

In [4]:
import nltk
nltk.download('stopwords')
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))

remove_as_well = set(['ان', 'او', 'ا', 'لا', 'في', 'على', 'الى', 'اي', 'م', 'تكون', 'كان', 'من', 'اذا', 'مع'])
arb_stopwords |= remove_as_well

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Process the Data

In [0]:
# Read the data into a list of strings.
def read_data(filename):
    with open(filename, 'r') as f:
      corpus = f.read()
      corpus = corpus.replace('\n',' ')
      corpus = araby.strip_tashkeel(corpus)
    return corpus.split()


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        if word not in arb_stopwords:
          dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    vocabulary = read_data(wiki_path)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary


## Create the dictionaries

In [0]:
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])
print(len(data))
vocab_size = len(dictionary)

## Sample 3-Grams

In [7]:
#setup parameters
window_size = 3
vector_dim = 256

#sample data according to the window size 
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)

word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

[[829, 4160], [20, 587], [9363, 1378], [1145, 2296], [343, 4073], [8536, 7311], [7425, 195], [3932, 750], [5868, 13], [5203, 6601]] [1, 1, 1, 1, 0, 1, 1, 1, 1, 0]


## Model

In [8]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

# setup a cosine similarity operation which will be output in a secondary model
similarity = keras.layers.Dot(axes = 0, normalize = True)([target, context])

# now perform the dot product operation to get a similarity measure
dot_product = keras.layers.Dot(axes = 1)([target, context])

dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)

# create the primary training model
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')

# create a secondary validation model to run our similarity checks during training
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

W0619 10:12:38.149486 139925669316480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0619 10:12:38.164864 139925669316480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0619 10:12:38.169638 139925669316480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0619 10:12:38.222889 139925669316480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0619 10:12:38.242461 139925669316480 deprecation_wrappe

In [0]:
valid_examples = np.array([dictionary[text] for text in ['ابريل', 'المانيا', 'خمسة']])
valid_size = len(valid_examples)

class SimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
sim_cb = SimilarityCallback()

## Train

In [0]:
def get_batch(idx, word_target, word_context, labels, batch_size = 32):
  arr_1 = word_target[idx*batch_size: (idx + 1)* batch_size]
  arr_2 = word_context[idx*batch_size: (idx + 1)* batch_size]
  arr_3 = labels[idx*batch_size: (idx + 1)* batch_size]
  return arr_1, arr_2, arr_3 

In [11]:
batch_size = 256
avg_loss = 0.0
epochs = 1
for cnt in range(epochs):
    num_batches = len(labels)// batch_size
    
    for idx in range(0, num_batches):
      batch_idx = np.random.randint(0, num_batches)
      arr_1, arr_2, arr_3  = get_batch(batch_idx, word_target, word_context, labels, batch_size = batch_size)
      loss = model.train_on_batch([arr_1, arr_2], arr_3)
      avg_loss += loss
            
      if idx % 1000 == 0 and idx != 0:
        print("Iteration {}, loss={}".format(cnt, avg_loss/1000))
        avg_loss = 0.0
      if idx % 10000 == 0:
        sim_cb.run_sim()
        print(" ")

W0619 10:12:57.568955 139925669316480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Nearest to ابريل: قلت, اتفاقيات, الشعرية, القطري, عرضة, العناية, العاملين, تتوفر,
Nearest to المانيا: مبكرة, يملكون, مجزرة, المستويات, تصميم, الفقري, مروان, الماعز,
Nearest to خمسة: الثوري, البلاشفة, تمنح, التركيز, انها, التقليد, قابلية, عائشة,
 
Iteration 0, loss=0.6854368621706962
Iteration 0, loss=0.6308442072570324
Iteration 0, loss=0.5786757318675518
Iteration 0, loss=0.5433163019865751
Iteration 0, loss=0.5185977192521095
Iteration 0, loss=0.49843697002530096
Iteration 0, loss=0.48290011164546015
Iteration 0, loss=0.4693636109381914
Iteration 0, loss=0.45512205351144075
Iteration 0, loss=0.4394383922591805
Nearest to ابريل: عام, العام, مايو, خلال, فبراير, بروسيا, اكتوبر, الاولى,
Nearest to المانيا: البلاد, كانت, معظم, عام, شعوب, و, مسلمين, خسارة,
Nearest to خمسة: تسعة, المسافرين, حوالي, و, مدينة, اربعة, مسجدا, ثلاث,
 
Iteration 0, loss=0.4312197460308671
Iteration 0, loss=0.4162627774998546
Iteration 0, loss=0.4210812357440591
Iteration 0, loss=0.4116187410354614
Iteration 0, los

## Prediction

In [0]:
def find_top_k(word, top_k = 8):
    valid_word_idx = dictionary[word]
    sim = get_similar(valid_word_idx)
    nearest = (-sim).argsort()[1:top_k + 1]
    log_str = word + ':'
    for k in range(top_k):
        close_word = reverse_dictionary[nearest[k]]
        log_str = '%s  %s -' % (log_str, close_word)
    print(log_str)

def get_similar(valid_word_idx):
    sim = np.zeros((vocab_size,))
    in_arr1 = np.zeros((1,))
    in_arr2 = np.zeros((1,))
    in_arr1 = np.array([valid_word_idx]*vocab_size)
    in_arr2 = np.array(range(vocab_size))
    out = validation_model.predict_on_batch([in_arr1, in_arr2])
    out = out.reshape((vocab_size,))

    return out

In [62]:
find_top_k('سبعة')

سبعة:  ثمانية -  خمسة -  آلاف -  سنة -  اثني -  ثلاثة -  اربعة -  الفا -


## Save

In [0]:
validation_model.save('keras.h5')

In [0]:
#save the dictionary
import csv
def create_csv(file, dict):
    with open(file, 'w') as csvfile:
        writer = csv.writer(csvfile)
        for key in dict.keys():
            writer.writerow([key,dict[key]])


In [0]:
create_csv('idx2word.csv', reverse_dictionary)
create_csv('word2idx.csv', dictionary)

https://adventuresinmachinelearning.com/word2vec-keras-tutorial/