In [None]:
# imports
import re
import os
import numpy as np
import tensorflow as tf

In [None]:
from google.colab import drive

# Mount Google Drive through the public API (the original called the private
# `drive._mount`). The trained model is saved under /content/drive later in
# this notebook, so the mount has to succeed before that cell runs.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset (Google Colab Environment)
! wget "https://www.cs.cmu.edu/%7Eark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip"
! unzip "/content/AQMAR_Arabic_NER_corpus-1.0.zip" -d "/content/corpus"

--2022-12-09 16:44:10--  https://www.cs.cmu.edu/%7Eark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7815886 (7.5M) [application/zip]
Saving to: ‘AQMAR_Arabic_NER_corpus-1.0.zip.1’


2022-12-09 16:44:16 (1.38 MB/s) - ‘AQMAR_Arabic_NER_corpus-1.0.zip.1’ saved [7815886/7815886]

Archive:  /content/AQMAR_Arabic_NER_corpus-1.0.zip
  inflating: /content/corpus/Atom.txt  
  inflating: /content/corpus/Christiano_Ronaldo.txt  
  inflating: /content/corpus/Computer.txt  
  inflating: /content/corpus/Computer_Software.txt  
  inflating: /content/corpus/Crusades.txt  
  inflating: /content/corpus/Damascus.txt  
  inflating: /content/corpus/Enrico_Fermi.txt  
  inflating: /content/corpus/Football.txt  
  inflating: /content/corpus/Ibn_Tolun_Mosque.txt  
  inflating: /content/corpus/Imam_Hussein_Shrine.txt  
  inflat

In [None]:
# Entity Cleaner: unifies entity tags and fixes misspellings
_CANONICAL_TAGS = frozenset(
  ['B-LOC', 'B-MIS', 'B-ORG', 'B-PER', 'I-LOC', 'I-MIS', 'I-ORG', 'I-PER', 'O']
)

# Known misspelled / non-standard tags and the canonical tag each one stands for.
_TAG_CORRECTIONS = {}
_TAG_CORRECTIONS.update(dict.fromkeys(
  ['B-MIS0', 'B-MIS1', 'B-MIS2', 'B-MIS3', 'B-MIS-1', 'B-MIS-2', 'B-MIS1`', 'B-MISS1'],
  'B-MIS'))
_TAG_CORRECTIONS.update(dict.fromkeys(['I-MIS0', 'I-MIS1', 'I-MIS2', 'I-MIS3'], 'I-MIS'))
_TAG_CORRECTIONS.update(dict.fromkeys(['B-ENGLISH', 'B-SPANISH', 'OO', 'IO'], 'O'))
_TAG_CORRECTIONS['I--ORG'] = 'I-ORG'


def tags_cleaner(entity):
  """Map a raw tag read from the corpus onto one of the nine canonical tags.

  Args:
    entity: the tag field of a corpus line; may still carry the line ending
      or other surrounding whitespace.

  Returns:
    One of 'B-LOC', 'B-MIS', 'B-ORG', 'B-PER', 'I-LOC', 'I-MIS', 'I-ORG',
    'I-PER', 'O'. A tag that is neither canonical nor a known misspelling is
    reported on stdout and mapped to 'O'. Previously the function returned
    None in that case, which made the later tag-encoding lookup fail.
  """
  # strip() also removes '\r' and stray spaces, not only the trailing '\n'.
  entity = entity.strip()
  if entity in _CANONICAL_TAGS:
    return entity
  if entity in _TAG_CORRECTIONS:
    return _TAG_CORRECTIONS[entity]
  print('Error with entity:', entity)
  return 'O'

In [None]:
# Clean/Normalize Arabic Text
def clean_str(text):
  """Normalise an Arabic string before vocabulary / embedding lookup.

  Steps, in order:
    1. remove tashkeel (diacritics in U+0617-U+061A and U+064B-U+0652);
    2. squeeze any run of three or more identical characters down to two;
    3. collapse the doubled letters 'وو', 'يي', 'اا' to a single letter;
    4. apply the literal substitutions listed below, in listed order;
    5. strip leading / trailing whitespace.

  Args:
    text: the string to normalise.

  Returns:
    The normalised string (possibly empty).
  """
  # (search, replacement) pairs. Keeping each pair together makes the mapping
  # explicit; the original used two parallel lists of different lengths.
  # NOTE(review): the original `replace` list carried five extra trailing
  # entries (' ', ' ', ' ', ' ? ', ' ؟ ', ' ! ') with no matching search term,
  # so they were never applied. The effective behaviour is preserved here;
  # confirm whether some search terms were dropped by accident.
  substitutions = [
    ("أ", "ا"),
    ("إ", "ا"),
    ("آ", "ا"),
    ("ة", "ه"),
    ("_", " "),
    ("-", " "),
    ("/", ""),
    (".", ""),
    ("،", ""),
    (" و ", " و"),
    (" يا ", " يا"),
    ('"', ""),
    ("ـ", ""),
    ("'", ""),
    ("ى", "ي"),
    ("?", ""),
    ("؟", " "),
  ]

  # Remove tashkeel
  text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

  # Remove longation: a character repeated 2+ times is kept exactly twice
  text = re.sub(r'(.)\1+', r"\1\1", text)

  for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
    text = text.replace(doubled, single)

  for old, new in substitutions:
    text = text.replace(old, new)

  # Trim
  return text.strip()

In [None]:
# Remove empty strings or strings that contain only whitespace from sentences
def re_clean(old_sentence, old_tags):
  """Drop blank words from a sentence together with their tags.

  Args:
    old_sentence: list of word strings.
    old_tags: list of tags, aligned index-by-index with `old_sentence`.

  Returns:
    (new_sentence, new_tags): two new lists holding only the positions whose
    word contains at least one non-whitespace character.
  """
  new_sentence = []
  new_tags = []
  for word, tag in zip(old_sentence, old_tags):
    # str.strip() is empty exactly when the word is '' or whitespace only.
    # The original `match("\s+")` test also discarded words that merely
    # started with whitespace.
    if word.strip():
      new_sentence.append(word)
      new_tags.append(tag)

  return new_sentence, new_tags


In [None]:
# Read sentences
# Each corpus line is "<word> <tag>"; a line holding only a newline ends a sentence.
sentences = []
tags = []
vocab = set()

corpus_path = "/content/corpus/"


def _store_sentence(words, word_tags):
  """Drop blank words, then append the sentence and its tags to the corpus lists."""
  kept_words, kept_tags = re_clean(words, word_tags)
  sentences.append(kept_words)
  tags.append(kept_tags)


# sorted(): os.listdir returns entries in arbitrary order, which would make the
# sentence order (and therefore the later train/test split) differ between runs.
for file in sorted(os.listdir(corpus_path)):
  if not file.endswith('.txt'): # Get txt files only
    continue
  print('Processing:', file)
  sentence = []
  entity = []
  # `with` closes the handle; the encoding is explicit because the text is Arabic.
  with open(corpus_path + file, encoding='utf-8') as topic:
    for line in topic:
      if line == '\n': # Sentence end
        _store_sentence(sentence, entity)
        sentence = []
        entity = []
      else:
        fields = line.split(sep=' ')
        clean_word = clean_str(fields[0])      # Cleaning word
        vocab.add(clean_word)                  # Add word to the vocab
        sentence.append(clean_word)            # Add the word
        entity.append(tags_cleaner(fields[1])) # Clean and add entity
  # A file that does not end with a blank line still has a pending sentence.
  if sentence:
    _store_sentence(sentence, entity)


print('Done [Sentences:', len(sentences), ', Tags:', len(tags), ', Unique Words:', len(vocab))

Processing: Linux.txt
Processing: Solaris.txt
Processing: Crusades.txt
Processing: Enrico_Fermi.txt
Processing: Internet.txt
Processing: Richard_Stallman.txt
Processing: Raul_Gonzales.txt
Processing: Imam_Hussein_Shrine.txt
Processing: Christiano_Ronaldo.txt
Processing: Islamic_History.txt
Processing: Football.txt
Processing: Light.txt
Processing: Portugal_football_team.txt
Processing: Damascus.txt
Processing: Nuclear_Power.txt
Processing: Ummaya_Mosque.txt
Processing: Real_Madrid.txt
Processing: Islamic_Golden_Age.txt
Processing: Atom.txt
Processing: X_window_system.txt
Processing: Razi.txt
Processing: Summer_Olympics2004.txt
Processing: Computer_Software.txt
Processing: Ibn_Tolun_Mosque.txt
Processing: Physics.txt
Processing: Periodic_Table.txt
Processing: Soccer_Worldcup.txt
Processing: Computer.txt
Done [Sentences: 2687 , Tags: 2687 , Unique Words: 17478


In [None]:
# Make a mapping between words and their IDs.
# The vocabulary is sorted first: iteration order of a set of strings changes
# between interpreter sessions (string hash randomisation), so IDs taken
# straight from the set would not match a model saved in an earlier session.
# NOTE(review): ID 0 belongs to a real word here, yet 0 is also the value used
# for out-of-vocabulary words (encode_sentence) and for padding
# (pad_sequences). Reserving 0 would also require one extra row in the
# embedding matrix and Embedding layer, so it is not changed in this cell.
word2id = {word: idx for idx, word in enumerate(sorted(vocab))}
id2word = {idx: word for word, idx in word2id.items()}
     

In [None]:
from tensorflow.keras.utils import to_categorical

# Sentence encoder
def encode_sentence(old_sentence, vocab_index=None):
  """Translate a list of words into a list of integer word IDs.

  Args:
    old_sentence: iterable of (already cleaned) word strings.
    vocab_index: optional word -> ID mapping; defaults to the notebook-level
      `word2id`.

  Returns:
    A list with one ID per input word. Words missing from the mapping are
    encoded as 0 (dummy ID for out-of-vocabulary words).
  """
  if vocab_index is None:
    vocab_index = word2id
  # The per-word debug print was removed: it emitted one line per token of the
  # whole corpus and flooded the cell output.
  return [vocab_index.get(word, 0) for word in old_sentence]

# Encode Tags
# Tag -> class index; the index is the position of the 1 in the one-hot vector.
tags_encoding = dict(zip(
  ['B-LOC', 'B-MIS', 'B-ORG', 'B-PER', 'I-LOC', 'I-MIS', 'I-ORG', 'I-PER', 'O'],
  range(9),
))

def encode_tags(old_tags):
  """One-hot encode a sequence of tag strings.

  Args:
    old_tags: iterable of tags; every tag must be a key of `tags_encoding`.

  Returns:
    The result of `to_categorical` over the tag indices with 9 classes
    (one row per tag).
  """
  class_ids = []
  for tag in old_tags:
    class_ids.append(tags_encoding[tag])
  return to_categorical(y = class_ids, num_classes=9)

In [None]:
# Encoding
sentences_encoded = []
tags_encoded = []

for i in range(len(sentences)):
  sentences_encoded.append(encode_sentence(sentences[i]))
  tags_encoded.append(encode_tags(tags[i]))
     

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
628
4960
7437
3802
4035
13800
9776
6195
17184
9872
3801
16369
16656
3407
14129
9776
6195
13300
7311
3801
13300
511
3801
13300
4084
6389
7344
8237
1379
10904
12147
5991
1038
4084
14946
1038
3897
10301
1379
3384
13173
9113
5852
1745
6193
1038
4499
8450
12819
6866
14753
10430
8652
11124
7519
5852
15543
13387
1410
2505
10151
2241
6247
16310
6074
10721
3044
5604
14292
12856
14069
6627
13861
9107
10430
8652
3306
15906
5996
14085
10771
12794
11870
7576
3044
5604
16970
12856
17241
4664
15543
6627
13861
13627
3802
17231
1410
6791
3307
15968
7260
167
1288
8570
10771
11870
15777
628
7260
16183
10822
17225
6193
7135
15906
4960
525
9733
10047
4176
11956
9733
10047
6193
4213
4369
10506
4
478
1424
3343
746
7359
15206
6654
10799
11264
16183
10822
14683
6193
5786
6450
8652
9420
15172
13317
10220
16745
8840
514
3369
2521
2240
3502
6193
6241
13300
10500
5172
5369
5873
16183
10047
2301
12330
6193
5741
3886
4162
4838
8930
198
11782
5991
15305

In [None]:
from keras_preprocessing.sequence import pad_sequences

# Padding
MAX_SEQUENCE_LENGTH = 40

# Options common to both calls: pad / cut at the end, fixed length, int32 output.
_pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH,
                    dtype='int32',
                    padding='post',
                    truncating='post')

# Word-ID sequences are filled with 0.
sentences_padded = pad_sequences(sequences = sentences_encoded,
                                 value = 0,
                                 **_pad_options)

# Tag sequences are filled with the one-hot row whose last position (index 8,
# the 'O' entry of tags_encoding) is set.
_padding_tag = np.array([0., 0., 0., 0., 0., 0., 0., 0., 1.])
tags_padded = pad_sequences(sequences = tags_encoded,
                            value = _padding_tag,
                            **_pad_options)
     

In [None]:
from sklearn.model_selection import train_test_split

# Splitting data: 80% of the sentences for training, the rest for testing.
# The fixed random_state keeps the shuffle identical for identical input.
_split = train_test_split(sentences_padded, tags_padded, train_size=0.8, random_state=42)
train_sentences, test_sentences, train_labels, test_labels = _split
     

In [None]:
# Download AraVec (Word2Vec Model) by Abu Bakr Soliman, Kareem Eissa, and Samhaa R.El-Beltagy.
! wget "https://archive.org/download/aravec2.0/wiki_cbow_300.zip"
! unzip "/content/wiki_cbow_300.zip" -d "/content/word2vec_model"

--2022-12-09 16:44:50--  https://archive.org/download/aravec2.0/wiki_cbow_300.zip
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip [following]
--2022-12-09 16:44:50--  https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip
Resolving ia803107.us.archive.org (ia803107.us.archive.org)... 207.241.232.157
Connecting to ia803107.us.archive.org (ia803107.us.archive.org)|207.241.232.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 364888893 (348M) [application/zip]
Saving to: ‘wiki_cbow_300.zip’


2022-12-09 16:46:36 (3.32 MB/s) - ‘wiki_cbow_300.zip’ saved [364888893/364888893]

Archive:  /content/wiki_cbow_300.zip
  inflating: /content/word2vec_model/wikipedia_cbow_300  
  inflating: /content/word2vec_model/wikipedia_cbow_300.trainables.syn1neg.npy  

In [None]:
import gensim

# Load the pre-trained Word2Vec model extracted by the download cell
weights_path = "/content/word2vec_model/wikipedia_cbow_300"
araVec = gensim.models.Word2Vec.load(weights_path)

# Smoke test: list the nearest neighbours of a common name with their similarity
most_similar = araVec.wv.most_similar("محمد")
for neighbour, similarity in most_similar:
  print(neighbour, similarity)

لمحمد 0.726012110710144
احمد 0.7142194509506226
عبدالرحمن 0.6745274066925049
ابراهيم 0.6723851561546326
مهدي 0.6686975955963135
محمود 0.664846658706665
يحي 0.637116551399231
اسماعيل 0.6307213306427002
حموده 0.6287057995796204
عبدالحميد 0.6267551183700562


In [None]:
# Create an embedding matrix for the embedding layer.
# Row i holds the AraVec vector of the word whose ID is i; words AraVec does
# not know keep the all-zero row they were initialised with.
num_words = len(vocab)
# Vectors are read through `araVec.wv`: indexing the Word2Vec model object
# directly is deprecated (it triggered the warning shown in this cell's output).
embed_size = araVec.wv.vector_size
embedding_matrix = np.zeros(shape=(num_words, embed_size))

for word, idx in word2id.items():
  if word in araVec.wv:
    embedding_matrix[idx] = araVec.wv[word]

embedding_matrix.shape

  embed_size, = araVec['محمود'].shape
  embedding_matrix[id] = araVec[word]


(17478, 300)

In [None]:
from tensorflow.keras.layers import LSTM, Input, Dense, Embedding, TimeDistributed
from tensorflow.keras.models import Model, Sequential

tf.keras.backend.clear_session() # Makes sure old model was deleted if exists

# Architecture: frozen AraVec embeddings -> one LSTM that returns every time
# step -> a per-time-step softmax over the 9 tags.
lstm_model = Sequential()
# Adding Layers
lstm_model.add(Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'))
lstm_model.add(Embedding(input_dim = len(vocab),              # Vocabulary Size (number of unique words for training)
                        output_dim = embed_size,              # Length of the vector for each word (embedding dimension)
                        input_length = MAX_SEQUENCE_LENGTH,   # Maximum length of a sequence
                        weights = [embedding_matrix],         # Send the needed AraVec Weights
                        trainable = False))                   # Keep the pre-trained vectors fixed

lstm_model.add(LSTM(units = embed_size, 
                    return_sequences=True,                    # One output per word, not one per sentence
                    dropout=0.5, 
                    recurrent_dropout=0.5))
lstm_model.add(TimeDistributed(Dense(9, activation='softmax')))

# Compile the model
# `learning_rate` replaces the deprecated `lr` argument (the old spelling
# produced the warning visible in this cell's output).
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999), 
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
lstm_model.summary()
     

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 300)           5243400   
                                                                 
 lstm (LSTM)                 (None, 40, 300)           721200    
                                                                 
 time_distributed (TimeDistr  (None, 40, 9)            2709      
 ibuted)                                                         
                                                                 
Total params: 5,967,309
Trainable params: 723,909
Non-trainable params: 5,243,400
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Train on the training split; 15% of it is held back for validation.
_fit_options = dict(batch_size=10, epochs=10, validation_split=0.15)
lstm_model.fit(train_sentences, train_labels, **_fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fab36ff1700>

In [None]:
# Evaluate on the held-out split. The result is [loss, accuracy], matching the
# loss and metric passed to compile().
# NOTE(review): padded positions are labelled 'O' and the model applies no
# masking, so they count towards the accuracy; per-entity metrics on the real
# tokens would be more informative.
lstm_model.evaluate(test_sentences, test_labels)



[0.1594718098640442, 0.9503717422485352]

In [None]:
# Persist the trained model to Google Drive (needs the Drive mount from the
# top of the notebook). Only the network is stored; `word2id` is not part of
# the file.
lstm_model.save('/content/drive/MyDrive/Digified/my_model2.h5' )

In [None]:
import keras  # kept so that any other cell referring to `keras` still works

# Reload the saved model. It was built with `tf.keras`, so it is loaded through
# `tf.keras` as well rather than through the standalone `keras` package.
lstm_model = tf.keras.models.load_model('/content/drive/MyDrive/Digified/my_model2.h5')

In [None]:
def lstm_predict(sentence:str):
  """Tell whether `sentence` is a three-word person name according to the model.

  The sentence is split on whitespace, each word is normalised with
  `clean_str`, encoded with `encode_sentence`, padded to
  MAX_SEQUENCE_LENGTH and tagged by `lstm_model`.

  Args:
    sentence: raw input text.

  Returns:
    'T' - exactly three words, each predicted as 'B-PER' or 'I-PER';
    'F' - exactly three words, at least one predicted as another tag;
    'f' - any other number of words (the model is not run in this case).
    The lower-case 'f' is kept exactly as in the original so that existing
    callers comparing against it keep working.
  """
  # Class index -> tag name, in the same order as `tags_encoding`.
  tag_names = ('B-LOC', 'B-MIS', 'B-ORG', 'B-PER',
               'I-LOC', 'I-MIS', 'I-ORG', 'I-PER', 'O')
  person_tags = ('B-PER', 'I-PER')

  # split() without an argument ignores leading, trailing and repeated
  # spaces, so they no longer inflate the word count.
  words = sentence.split()
  word_count = len(words)
  if word_count != 3:
    # The verdict does not depend on the model here, so skip the inference.
    return 'f'

  # Clean sentence
  cleaned_words = [clean_str(word) for word in words]

  # Encode sentence
  encoded_words = encode_sentence(cleaned_words)

  # Padding sentence
  padded_input = pad_sequences(sequences = [encoded_words],
                               maxlen=MAX_SEQUENCE_LENGTH,
                               dtype='int32',
                               padding='post',
                               truncating='post',
                               value = 0)

  # Predict and keep the rows of the actual words only (not the padding)
  predictions = lstm_model.predict(padded_input)[0][0:word_count]

  # The one-hot vector closest to a prediction row is the one at the row's
  # largest entry, so argmax replaces the per-tag distance search.
  predicted_tags = [tag_names[class_id] for class_id in np.argmax(predictions, axis=-1)]

  if all(tag in person_tags for tag in predicted_tags):
    return 'T'
  return 'F'



In [None]:
# Demo: a three-word name
lstm_predict("محمد محمود أحمد")

3
['محمد', 'محمود', 'احمد']
9404
14236
11007
[9404, 14236, 11007]
[[ 9404 14236 11007     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]
[[0.03239435 0.01555185 0.02181163 0.3314773  0.0900664  0.03147415
  0.03544654 0.38805044 0.05372734]
 [0.01882921 0.00302625 0.00664241 0.23979114 0.037627   0.01009314
  0.00853139 0.6442991  0.03116038]
 [0.01427439 0.00079548 0.00267641 0.24229558 0.01739585 0.00359194
  0.00325093 0.6994104  0.01630902]]
['I-PER', 'I-PER', 'I-PER']


'T'