# IMPORTS

In [1]:
!pip install pyarabic

Collecting pyarabic
[?25l  Downloading https://files.pythonhosted.org/packages/7b/e2/46728ec2f6fe14970de5c782346609f0636262c0941228f363710903aaa1/PyArabic-0.6.10.tar.gz (108kB)
[K     |███                             | 10kB 19.4MB/s eta 0:00:01[K     |██████                          | 20kB 25.1MB/s eta 0:00:01[K     |█████████                       | 30kB 29.6MB/s eta 0:00:01[K     |████████████                    | 40kB 30.0MB/s eta 0:00:01[K     |███████████████                 | 51kB 31.6MB/s eta 0:00:01[K     |██████████████████              | 61kB 33.6MB/s eta 0:00:01[K     |█████████████████████           | 71kB 28.9MB/s eta 0:00:01[K     |████████████████████████        | 81kB 30.1MB/s eta 0:00:01[K     |███████████████████████████     | 92kB 31.2MB/s eta 0:00:01[K     |██████████████████████████████  | 102kB 32.3MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 32.3MB/s 
[?25hBuilding wheels for collected packages: pyarabic
  Building wheel

In [2]:
!wget 'https://raw.githubusercontent.com/zaidalyafeai/ARBML/master/datasets/Poem Meters/baits.zip'
!unzip baits.zip

--2021-05-31 05:50:53--  https://raw.githubusercontent.com/zaidalyafeai/ARBML/master/datasets/Poem%20Meters/baits.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2267882 (2.2M) [application/zip]
Saving to: ‘baits.zip’


2021-05-31 05:50:53 (44.6 MB/s) - ‘baits.zip’ saved [2267882/2267882]

Archive:  baits.zip
   creating: final_baits/
  inflating: final_baits/train.txt   
  inflating: final_baits/labels.txt  
  inflating: final_baits/test.txt    


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Assesting libraries
import numpy as np
import pandas as pd
import re
from pyarabic import araby
from keras.preprocessing.sequence import pad_sequences
from keras.layers import InputLayer
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional, BatchNormalization, Flatten, Reshape,LSTM
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [5]:
#Constants
TOKENIZER_WORD_NUM = 50000
#Files
DATA_TRAIN_PATH = '/content/drive/MyDrive/dataset/train.csv'
DATA_LABELS_PATH = '/content/drive/MyDrive/dataset/labels.txt'
#Values
MAX_VOCAB_SIZE = 50000
EMBEDDING_DIM = 300
DELTA = 1e-12
BETA = 1e-6
GAMMA = 1e-3

# Dataset Importing

In [65]:
data = pd.read_csv(DATA_TRAIN_PATH)
data.head()

Unnamed: 0,labels,data
0,8,أَنا الفقير وباللَه العظيم غني # لئن فقدتك في ...
1,10,وَلوعاً بِيُمنَى نَمْنَمَتْها حَدِيقَةٌ # نَزْ...
2,11,فيا منْ لم أزلْ أحظى لديه # بفضلٍ جامعٍ بابَ ا...
3,9,وَسَلامٌ عَلَى ضَرِيحِكَ مَا أَهْ # دَتْ شَذَا...
4,8,أمِنْتُ فقري لما قُلتُ عن ثِقَةٍ # أنْ لا جواد...


In [66]:
#some raw open source dataset with 1.5 million verses on github that i cleaned in an external notebook file
extra = pd.read_csv('/content/drive/MyDrive/dataset/extra_train.csv') 

data = pd.concat([data,extra],ignore_index=True)
data = data.drop_duplicates()
data = data.reset_index().drop(labels=["index"], axis=1)

In [67]:
data

Unnamed: 0,labels,data
0,8,أَنا الفقير وباللَه العظيم غني # لئن فقدتك في ...
1,10,وَلوعاً بِيُمنَى نَمْنَمَتْها حَدِيقَةٌ # نَزْ...
2,11,فيا منْ لم أزلْ أحظى لديه # بفضلٍ جامعٍ بابَ ا...
3,9,وَسَلامٌ عَلَى ضَرِيحِكَ مَا أَهْ # دَتْ شَذَا...
4,8,أمِنْتُ فقري لما قُلتُ عن ثِقَةٍ # أنْ لا جواد...
...,...,...
1469362,9,هي أغلى ما أنشأ اللَّه في الدنيا # وأحلى قصيدة...
1469363,9,هي أغرودة الأغاريد تنساب # كحلم يغشى الجفون ال...
1469364,9,هي شلال بهجة وبهاء # يتداعى وجداً ويخفق حسنا
1469365,9,هي حلم الهوى ومنطلقي الباقي # يدك الحدود سجناً...


In [68]:
labels =  pd.read_csv(DATA_LABELS_PATH,
                   sep="\n",
                   header=None,
                   engine='python').values

In [69]:
def get_label(index,labels):
  return labels[index][0]

In [70]:
for i in data.iloc[:10]['data']:
  print(i,end='\n---------------------------------------------------------\n')

أَنا الفقير وباللَه العظيم غني # لئن فقدتك في أَشياء طلتُ بها
---------------------------------------------------------
وَلوعاً بِيُمنَى نَمْنَمَتْها حَدِيقَةٌ # نَزْهَدُ أَحْدَاق الوَرَى فِي الحَدَائِقِ
---------------------------------------------------------
فيا منْ لم أزلْ أحظى لديه # بفضلٍ جامعٍ بابَ الزِّياده
---------------------------------------------------------
وَسَلامٌ عَلَى ضَرِيحِكَ مَا أَهْ # دَتْ شَذَاهَا حَدِيقَةٌ غَلْبَاءُ
---------------------------------------------------------
أمِنْتُ فقري لما قُلتُ عن ثِقَةٍ # أنْ لا جواد سوى السلطان مسعودِ
---------------------------------------------------------
كأنه بضمير الركضِ يضْربهُ # يدنو عليه بعيد الأرض مُرتكضاً
---------------------------------------------------------
عَزَّ مَن أَمدحُهُ في رَجبٍ # فَأَنا الأَخْرَسُ والشَّهْرُ الأَصَمْ
---------------------------------------------------------
إِذا ما عاجِزٌ رَثَّت قُواهُ # رَأى وَطءَ الفِراشِ لَهُ فَناما
---------------------------------------------------------
فإنما أنا 

In [71]:
# apply label
data['labels_text']=data['labels'].apply(lambda x: get_label(x,labels))

In [72]:
data

Unnamed: 0,labels,data,labels_text
0,8,أَنا الفقير وباللَه العظيم غني # لئن فقدتك في ...,baseet
1,10,وَلوعاً بِيُمنَى نَمْنَمَتْها حَدِيقَةٌ # نَزْ...,taweel
2,11,فيا منْ لم أزلْ أحظى لديه # بفضلٍ جامعٍ بابَ ا...,wafer
3,9,وَسَلامٌ عَلَى ضَرِيحِكَ مَا أَهْ # دَتْ شَذَا...,khafeef
4,8,أمِنْتُ فقري لما قُلتُ عن ثِقَةٍ # أنْ لا جواد...,baseet
...,...,...,...
1469362,9,هي أغلى ما أنشأ اللَّه في الدنيا # وأحلى قصيدة...,khafeef
1469363,9,هي أغرودة الأغاريد تنساب # كحلم يغشى الجفون ال...,khafeef
1469364,9,هي شلال بهجة وبهاء # يتداعى وجداً ويخفق حسنا,khafeef
1469365,9,هي حلم الهوى ومنطلقي الباقي # يدك الحدود سجناً...,khafeef


# Preprocessing

In [73]:
char2index = {' ': 1,'#': 2,'ء': 3,'آ': 4,'أ': 5,'ؤ': 6,'إ': 7,'ئ': 8,'ا': 9,'ب': 10,'ة': 11,'ت': 12,'ث': 13,'ج': 14,'ح': 15,'خ': 16,'د': 17,'ذ': 18,'ر': 19,'ز': 20,'س': 21,'ش': 22,'ص': 23,'ض': 24,'ط': 25,'ظ': 26,'ع': 27,'غ': 28,'ف': 29,'ق': 30,'ك': 31,'ل': 32,'م': 33,'ن': 34,'ه': 35,'و': 36,'ى': 37,'ي': 38}

In [74]:
def cleanSentence(string):

  string = araby.strip_tashkeel(string)
  string = re.sub(r'[…1423567890"–_!()*-.ـ:=o«»;\[\]؛,،~?؟\u200f\ufeffـ\u200d\u200c\uf020\uf03a\uf02d\uf02e]*','',string)
  string = re.sub(r'[abcdefghijklmnopqrstuvwx×yzABCDEFGHIJKLMNOPQRSTUVWXYZ]*','',string)
  string = re.sub(r"\s+$", '',string)
  string = re.sub(r"^\s+", '',string)

  return string

In [75]:
def tokenize(string):
  return [char2index[char] for char in string]

In [76]:
samples_data = data.sample(n=1000000, random_state=41)

In [77]:
tokenized_matrix = pad_sequences(samples_data['data'].apply(lambda x: tokenize(cleanSentence(x))).values,padding='post', value=0, maxlen = 100)
tokenized_matrix

array([[15, 21, 10, ...,  0,  0,  0],
       [36, 12, 23, ...,  0,  0,  0],
       [36, 21, 38, ...,  0,  0,  0],
       ...,
       [33, 15, 33, ...,  0,  0,  0],
       [ 5, 24, 15, ...,  0,  0,  0],
       [34, 35, 36, ...,  0,  0,  0]], dtype=int32)

# Modelling

In [78]:
X_train, X_valid , y_train, y_valid = train_test_split(tokenized_matrix, samples_data['labels'].values, test_size = 0.10, random_state = 41)

In [79]:
len(X_train)

900000

In [None]:
model = Sequential()
model.add(Input((100,)))
model.add(Embedding(len(char2index)+1, 512))
model.add(Bidirectional(GRU(units = 256, return_sequences=True)))
model.add(Bidirectional(GRU(units = 256, return_sequences=True)))
model.add(Bidirectional(GRU(units = 256)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(len(labels), activation = 'softmax'))

# batch_size = 64 # Batch size for training.
# epochs = 100  # Number of epochs to train for.
# latent_dim = 64  # Latent dimensionality of the encoding space.

# model = Sequential()
# model.add(Embedding(len(char2index)+1, 32, input_length=100, mask_zero=True))
# model.add(Bidirectional(LSTM(latent_dim, input_shape=(None,len(char2index)), return_sequences=True,
#             dropout=0.1, recurrent_dropout=0.3),
#             merge_mode='concat'))
# model.add(Bidirectional(LSTM(latent_dim, return_sequences=True,
#             dropout=0.1, recurrent_dropout=0.3),
#             merge_mode='concat'))
# model.add(Bidirectional(LSTM(latent_dim, return_sequences=True,
#             dropout=0.1, recurrent_dropout=0.3),
#             merge_mode='concat'))
# model.add(Bidirectional(LSTM(latent_dim,
#             dropout=0.1, recurrent_dropout=0.3),
#             merge_mode='concat'))
# model.add(Dense(len(labels), activation='softmax'))
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])



In [None]:
model(tf.zeros((10, 100))).shape

TensorShape([10, 14])

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 512)          19968     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 512)          1182720   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100, 512)          1182720   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 512)               1182720   
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 14)               

In [None]:
callbacks = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_delta=0.0001, min_lr=0.0001)]
callbacks += [tf.keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/dataset/final_model.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')]
callbacks += [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=5)]

# checkpoint_path = "cp.ckpt"

# callbacks_list = [
#     tf.keras.callbacks.EarlyStopping(
#         monitor='val_accuracy',
#         patience=5),
#     tf.keras.callbacks.ModelCheckpoint(
#         'full_verse.h5',
#         mode='max',
#         save_best_only=True,
#         monitor='val_accuracy'),
# ]

In [None]:
model.fit(X_train, y_train, validation_data= (X_valid, y_valid), epochs = 50, batch_size= 128, shuffle = True, callbacks=callbacks)

Epoch 1/50

Epoch 00001: val_accuracy improved from -inf to 0.95109, saving model to /content/drive/MyDrive/dataset/final_model.h5
Epoch 2/50

Epoch 00002: val_accuracy improved from 0.95109 to 0.95486, saving model to /content/drive/MyDrive/dataset/final_model.h5
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.95486
Epoch 4/50

Epoch 00004: val_accuracy did not improve from 0.95486
Epoch 5/50

# Testing

In [86]:
model = tf.keras.models.load_model('/content/drive/MyDrive/dataset/final_model.h5')

In [87]:
np.sum( np.argmax(model.predict(X_valid),axis=1) == y_valid) / len(y_valid)

0.95486

In [88]:
test = pd.read_csv('/content/drive/MyDrive/dataset/final_test.csv')
#sample = pd.read_csv('/content/drive/MyDrive/dataset/sample_submission.csv')

In [89]:
test_data = test['data']
test_data.values

array(['أَمّا الوِلايَةُ فَالمَعروفُ خُطبَتُها # وَلَستُ أَقبَلُها عَن بابِكُم بَدَلا',
       'إِذا مامَرَرتَ بِأَهلِ القُبورِ # تَيَقَّنتَ أَنَّكَ مِنهُم غَدا',
       'فعِشْ إنْ قدرتَ قليلَ الحديثِ # قليل الجَليس قليل الخِصام', ...,
       'إِنَّ الدِيارَ الَّتي تُبكى بِمُتَّقِدٍ # غَيرُ الدِيارِ الَّتي تَبكي بِهَطّالِ',
       'فاخترت هذا ولا العشار من كضض # تركت قلبي كريش النسر منتفضا',
       'وَتَرجِعُ لي روحُ الحَياةِ فَإِنَّني # بِنَفسِيَ لَو عايَنتَني لَأَجودُ'],
      dtype=object)

In [90]:
def getResults(data):
  x_test = pad_sequences(data.apply(lambda x: tokenize(cleanSentence(x))).values,padding='post', value=0, maxlen = 100)
  return np.argmax(model.predict(x_test),axis=1) 

In [91]:
test['labels'] = getResults(test_data)
file_submission = test[['id','labels']]
file_submission

Unnamed: 0,id,labels
0,0,8
1,1,2
2,2,2
3,3,10
4,4,11
...,...,...
1995,1995,10
1996,1996,9
1997,1997,8
1998,1998,8


In [92]:
file_submission.to_csv('final_model.csv', index=False)

# HELPERS

## Helper Preprocessing

In [None]:
def clean_text(string):
  """
  Sources 
  ----------
  https://stackabuse.com/using-regex-for-text-manipulation-in-python/
  https://lionbridge.ai/articles/using-natural-language-processing-for-spam-detection-in-emails/
  
  Description
  -----------
  clean text by handling unneeded words

  Parameters
  ----------
  string : str
      string to be processed

  Returns
  -------
  str
      cleaned string
  """
  string = str(string) if type(string) != type('aa') else string
  string = string.lower()
  string = re.sub(r"http\S+", "", string)
  string = re.sub(r"\W", " ", string, flags=re.I)
  string = re.sub(r"[^A-Za-z0-9]", " ", string)
  string = re.sub(r"\'s", " is ", string)
  string = re.sub(r"\'ve", " have ", string)
  string = re.sub(r"can't", "cannot ", string)
  string = re.sub(r"n't", " not ", string)
  string = re.sub(r"I'm", "I am", string)
  string = re.sub(r"\'re", " are ", string)
  string = re.sub(r"\'d", " would ", string)
  string = re.sub(r"\'ll", " will ", string)
  string = re.sub(r"e-mail", "email", string)
  string = re.sub(r" usa ", " america ", string)
  string = re.sub(r" uk ", " england ", string)
  string = re.sub(r"\s+"," ", string, flags = re.I)
  string = string[7:] if re.search(r"^subject", string) else string
  string = re.sub(r"^\s+", "", string)
  string = string[7:] if re.search(r"^re", string) else string
  string = re.sub(r"^\s+", "", string)
  string = re.sub(r"\s+$", "", string)
  string = re.sub(r"\s+[a-zA-Z]\s+", " ", string)

  return string


In [None]:
def tokenizer(text):
  """
  Sources 
  ----------
  
  Description
  -----------
  Converts text to set of encoded words

  Parameters
  ----------
  text : pd.Series
      data to be processed

  Returns
  -------
  pd.Series
      processed data
  """

  tk = Tokenizer(num_words=TOKENIZER_WORD_NUM)
  tk.fit_on_texts(text)
  return pd.Series(tk.texts_to_sequences(text))

In [None]:
def preprocess_text(text,tokenizer=word_tokenize,stops_remove=True,stemmer = SnowballStemmer('english'),stop_words=stopwords.words('english')):
  """
  Sources 
  ----------
  
  Description
  -----------
  Perform the whole preprocessing pipeline for a given text

  Parameters
  ----------
  text : str
      text to be processed

  tokenizer: function (optional)
      used tokenizer

  stemmer: object (optional)
      used stemmer

  stops_remove: list (optional)
      stop words to be removed

  

  Returns
  -------
  str
      processed string
  """
  
  # (1) Cleaning text
  text = clean_text(text)

  # (2) Tokenizing
  text = tokenizer(text)
  
  # (3) Removing stopwords
  text = [word for word in text if word not in stop_words]

  # (4) Stemming
  text = [stemmer.stem(word) for word in text]

  return text


NameError: ignored

In [None]:
def preprocess_df(df,X,y,tokenizer=word_tokenize,stops_remove=True,stemmer = SnowballStemmer('english'),stop_words=stopwords.words('english')):
  """
  Sources 
  ----------
  
  Description
  -----------
  Perform the whole preprocessing pipeline for a given dataframe

  Parameters
  ----------
  text : pd.DataFrame
      dataframe to be processed

  tokenizer: function (optional)
      used tokenizer

  stemmer: object (optional)
      used stemmer

  stops_remove: list (optional)
      stop words to be removed

  

  Returns
  -------
  pd.DataFrame
      processed dataframe
  """
  df_unique = df.drop_duplicates()
  df_ = pd.DataFrame(df_unique[X].apply(lambda x: preprocess_text(x,tokenizer=tokenizer,stops_remove=stops_remove,stemmer=stemmer,stop_words=stop_words)))  
  df_[y] = df_unique[y]
  return df_

NameError: ignored

In [None]:
def prepare_data(df,X,length_col,vocab):
  # (1) Encoding words
  df[X] = df[X].apply(lambda sentence: [vocab[word] for word in sentence])

  # (2) Padding/Truncating rows
  length_stats = df[length_col].describe()
  common_length = length_stats.loc['75%']
  max_length = length_stats.loc['max']
  picked_length = int( max_length if common_length / max_length >= 0.5 else common_length)

  df[X] = df[X].apply(lambda sentence: sentence + [0]*(picked_length-len(sentence)) if len(sentence) < common_length else sentence[:picked_length])

# Helper Feature Extraction

In [None]:
class Vocabulary:
  """
  Description
  -----------
  - Keeps the words in dataset with count
  - Gives a token for each word
  """

  def __init__(self,max_vocab_size=-1):
    """  
    Description
    -----------
      Initialize vocabulary 

    Parameters
    ----------
    max_vocab_size : int
      maximum vocabulary size
    """
    # Members
    self.word_to_index = dict()
    self.index_to_word = dict()
    self.word_count = pd.Series(dtype=np.int32)
    self.unique_word_count = pd.Series(dtype=np.int32)
    self.prev_sentence_index = -1
    self.vocab_size = 0
    
    self.max_vocab_size = max_vocab_size

    self.word_to_index[' '] = self.vocab_size
    self.index_to_word[self.vocab_size] = '<empty>'
    self.vocab_size += 1
    self.word_count.loc['<empty>'] = GAMMA

    self.word_to_index['<unkown>'] = self.vocab_size
    self.index_to_word[self.vocab_size] = '<unkown>'
    self.vocab_size += 1
    self.word_count.loc['<unkown>'] = GAMMA

    self.tf_dict = {}

  def __len__(self):
    """
    Description
    -----------
      Get the size of vocabulary
    """
    return self.vocab_size

  def __getitem__(self,key):
    """
    Description
    -----------
      Get the size of vocabulary

    Parameters
    ----------
    key : int/str
      Index/word to get its corresponding word/index

    Returns
    -------
    int/str
      Query
    """

    # If key is string
    if type(key) == type('ss'):
      query = 1
      try: 
        query = self.word_to_index[key]
      except:
        pass
      return query
    # If key is integer
    elif type(key) == type(50):
      query = 0
      try:
        query = self.index_to_word[key]
      except:
        raise KeyError('Index out of range')
      return query
    # If key is an unknown type
    else:
      raise KeyError("Invalid key type, key must be string or integer")


  def add_word(self,word,sentence_index=0,sentence_len=1,calculate_tf = False):
    """
    Description
    -----------
      Add word to the vocabulary

    Parameters
    ----------
    word : str
      Word to be added

    Returns
    -------
    bool
      The state of adding the word (success/fail)
    """   
    
    try:
      self.word_count.loc[word] += 1
      if self.prev_sentence_index != sentence_index:
        self.unique_word_count[word] += 1
        self.prev_sentence_index = sentence_index
      if calculate_tf:
        try:
          self.tf_dict[word][sentence_index] += 1/(sentence_len+GAMMA)
        except:
          self.tf_dict[word][sentence_index] = 1/(sentence_len+GAMMA)
    except:
      # If the vocab reached max size
      if self.vocab_size == self.max_vocab_size:
        return False
      # Adding new word
      self.word_count.loc[word] = 1
      self.unique_word_count.loc[word] = 1
      self.prev_sentence_index = sentence_index
      self.word_to_index[word] = self.vocab_size
      self.index_to_word[self.vocab_size] = word
      self.vocab_size += 1
      if calculate_tf:
        self.tf_dict[word] = {sentence_index:1/(sentence_len+GAMMA)}
    
    return True

  def create_vocab(self,df,X='Body',calculate_tfidf = False):
    for index,sentence in enumerate(df[X]):
      for word in sentence:
        my_vocab.add_word(word,index,len(sentence),calculate_tfidf)
    
    if calculate_tfidf:
      self.create_tfidf_matrix(df)

  def get_vocab_words(self):
    return self.index_to_word.values()

  def create_tfidf_matrix(self,df,X='Body'):
    self.tfidf_matrix = np.empty([len(df),self.vocab_size],dtype=np.float64)

    for word,dic in self.tf_dict.items():
      for index,word_tf in dic.items():
        idf = np.log(len(df) / self.unique_word_count.loc[word])
        self.tfidf_matrix[index,self.word_to_index[word]] = word_tf * idf

  
  def add_vectorizer(self,vectorizer):
    """
    Description
    -----------
      Add vectorized object

    Parameters
    ----------
    word : str
      Word to be added

    Returns
    -------
    bool
      The state of adding the word (success/fail)
    """
    self.vectorizer = vectorizer

  def word_to_vector(self,word):
    vector = np.zeros(EMBEDDING_DIM)
    try:
      vector = self.vectorizer[word]
    except:
      pass
    return vector

  def create_embedding_matrix(self):
    self.embedding_matrix = np.empty([self.vocab_size,EMBEDDING_DIM],dtype=np.float64)
    for index,word in self.index_to_word.items():
      self.embedding_matrix[index] = self.word_to_vector(word)


# Other Data

In [93]:
# Read, then decode for py2 compat.
def extract_data(path, thresh = 70, on_shatrs = False):
  global vocab
  
  text = ""
  
  X = []
  y = []
    
  t = open(path, 'r').read()
  #t = araby.strip_tashkeel(t)
  # remove some exteranous chars 
  execluded = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
  out = ""
  
  for char in t:
    #if char not in execluded:
    out += char
      
  text += out
  baits = out.split('\n')
  for line in baits:
    if len(line) <= 1:
      continue
    label, bait = line.split(' ', 1)
    label = int(label)

    bait  = bait.strip()
    if on_shatrs:
      shatrs = bait.split('#')
      for shatr in shatrs:
        X.append(shatr.strip())
        y.append(label)
    else:
      X.append(bait.strip())
      y.append(label)
  
  #create the vocab 
  vocab = sorted(set(' '.join(X)))  
  
  #shuffle the data 
  X, y = shuffle(X, y)
  return X, y

In [94]:
from sklearn.utils import shuffle


In [95]:
X, y = extract_data("final_baits/test.txt", on_shatrs=False)

In [96]:
long_data = pd.DataFrame(X).rename(columns={0:'data'})
long_data['labels'] = y
long_data

Unnamed: 0,data,labels
0,بِكُلِّ تَقديسَةٍ يُرَدِّدُها # فيكَ النَصاري ...,4
1,تعالَت ذاتُ مَولايَ # عَن الإِدراكِ بِاللَحظِ,12
2,بِوادي لَكَ بِالشَوق الَّذي # في فُؤادي لا تدَ...,7
3,ومن ألفناه كان الموت فرّقنا # لا يمنع الموت جي...,8
4,واسأل الرحمن لي في حاجتي # التي في النفس منها ...,7
...,...,...
8311,دِيَماً في كُلِّ يَومٍ وَوَبلاً # وَاِغتِباقاً...,5
8312,وَلقدْ أَنْذَرْتُهُ فَرَأَيْتُهُ # جَاهِلِيَّا...,9
8313,في كفّه نبعةٌ مُوَتَّرة # يهزج ابياضًها ويهتضِبُ,4
8314,كَأَنَّهُم لِلعُيون تَبصرة # كَأَنَّهُم في الق...,4


In [97]:
ver_long_data = pd.concat([data,long_data],ignore_index=True)
ver_long_data = ver_long_data.drop(labels=["labels_text"], axis=1)
ver_long_data = ver_long_data.drop_duplicates()
ver_long_data = ver_long_data.reset_index().drop(labels=["index"], axis=1)

In [98]:
ver_long_data

Unnamed: 0,labels,data
0,8,أَنا الفقير وباللَه العظيم غني # لئن فقدتك في ...
1,10,وَلوعاً بِيُمنَى نَمْنَمَتْها حَدِيقَةٌ # نَزْ...
2,11,فيا منْ لم أزلْ أحظى لديه # بفضلٍ جامعٍ بابَ ا...
3,9,وَسَلامٌ عَلَى ضَرِيحِكَ مَا أَهْ # دَتْ شَذَا...
4,8,أمِنْتُ فقري لما قُلتُ عن ثِقَةٍ # أنْ لا جواد...
...,...,...
1471171,0,أَثْقَلَنِي بِالْبِرِّ حَتَّى لَقَدْ # أَعْجِز...
1471172,8,إلا لحالَيْن فَقْدِ العقلِ والدينِ # لو رام ذل...
1471173,7,حارَبوا الجَهلَ وَكانوا قَبلَنا # في دُجى عَمي...
1471174,9,فَقَطَفْتُ الشَّقِيقَ منْ وجْهِهِ # وَاغْتَبَق...
