# Preprocessing our demo data

In [1]:
# -*- coding: utf-8 -*- #to be able to process arabic
import pandas as pd
import re
from camel_tools.utils.dediac import dediac_ar
from camel_tools.utils.charsets import AR_LETTERS_CHARSET
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.normalize import normalize_unicode
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.disambig.mle import MLEDisambiguator
mle_pretrained = MLEDisambiguator.pretrained()

In [2]:
#!pip install textblob

In [3]:
#!pip install spacy

In [4]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import numpy as np
import pandas as pd
import json
from textblob import TextBlob
import nltk
from scipy import spatial
import torch
import spacy

First, let's load our data and investigate it.

In [5]:
#!pip install jsonlines

In [6]:
#data = pd.read_csv("F:\Gp\Question Answering\QA.csv")
#import jsonlines
#text=[]
#with jsonlines.open('F:\Gp\Question Answering\qrcd_v1.1_train.jsonl') as f:
#    for line in f.iter():
#        text.append(line["passage"])
#train=pd.read_json(path_or_buf="F:\Gp\Question Answering\qrcd_v1.1_train.jsonl", lines=True)
#train.head()

    

In [7]:
stopwords = []
with open('F:\Gp/Question Answering/arabic-stop-words-master/list.txt', encoding='utf8') as file:
    for line in file:
        stopwords.append(line.strip())
print(stopwords)
print(type(stopwords))   

['،', 'ء', 'ءَ', 'آ', 'آب', 'آذار', 'آض', 'آل', 'آمينَ', 'آناء', 'آنفا', 'آه', 'آهاً', 'آهٍ', 'آهِ', 'أ', 'أبدا', 'أبريل', 'أبو', 'أبٌ', 'أجل', 'أجمع', 'أحد', 'أخبر', 'أخذ', 'أخو', 'أخٌ', 'أربع', 'أربعاء', 'أربعة', 'أربعمئة', 'أربعمائة', 'أرى', 'أسكن', 'أصبح', 'أصلا', 'أضحى', 'أطعم', 'أعطى', 'أعلم', 'أغسطس', 'أفريل', 'أفعل به', 'أفٍّ', 'أقبل', 'أكتوبر', 'أل', 'ألا', 'ألف', 'ألفى', 'أم', 'أما', 'أمام', 'أمامك', 'أمامكَ', 'أمد', 'أمس', 'أمسى', 'أمّا', 'أن', 'أنا', 'أنبأ', 'أنت', 'أنتم', 'أنتما', 'أنتن', 'أنتِ', 'أنشأ', 'أنه', 'أنًّ', 'أنّى', 'أهلا', 'أو', 'أوت', 'أوشك', 'أول', 'أولئك', 'أولاء', 'أولالك', 'أوّهْ', 'أى', 'أي', 'أيا', 'أيار', 'أيضا', 'أيلول', 'أين', 'أيّ', 'أيّان', 'أُفٍّ', 'ؤ', 'إحدى', 'إذ', 'إذا', 'إذاً', 'إذما', 'إذن', 'إزاء', 'إلى', 'إلي', 'إليكم', 'إليكما', 'إليكنّ', 'إليكَ', 'إلَيْكَ', 'إلّا', 'إمّا', 'إن', 'إنَّ', 'إى', 'إياك', 'إياكم', 'إياكما', 'إياكن', 'إيانا', 'إياه', 'إياها', 'إياهم', 'إياهما', 'إياهن', 'إياي', 'إيهٍ', 'ئ', 'ا', 'ا?', 'ا?ى', 'االا', 'االتى', 'اب

In [8]:
#data.iloc[0][1]

In [9]:
#data.shape

In [10]:
#data.info()

### 1) Cleaning our data: 
- Remove URLS
- Mentions
- Emotions
- Symbols
- Diacritical marks
- Shapes
- Unwanted Punctuation (basicly anything that is not an arabic word or a number).

In [11]:
def clean_text(text):
    sentence_ar_dediac = dediac_ar(text) #dediacritization using camel tools
    sentence_ar_dediac = re.sub(r"\S*https?:\S*|@\S+", "", sentence_ar_dediac) #remove URLs and mentions even if they are in paranthesis or brackets
    no_punc = ""
    for char in sentence_ar_dediac:
        if char in (list(AR_LETTERS_CHARSET)+[" "]): #removing any thing that is not an arabic letter
            no_punc = no_punc + char
    return no_punc

After Cleaning,
### 2) Normalizing our data
- Orthographic Normalization
- Unicode Normalization

In [12]:
def normalize_text(text):
    cleaned_text = normalize_alef_maksura_ar(text)
    cleaned_text = normalize_alef_ar(cleaned_text)
    cleaned_text = normalize_teh_marbuta_ar(cleaned_text)
    cleaned_text = normalize_unicode(cleaned_text)
    
    return cleaned_text

### 3) Data Enrichement
- Tokenization
- Removing stop words
- Morphology Analysis
- Lemmatization

In [13]:
def enrichement(text, stopwords=stopwords, mle_pretrained=mle_pretrained):
    tokens = simple_word_tokenize(text)
    tokenized = []
    for token in tokens:
        if token not in stopwords:
            tokenized.append(token)     
    #disambig = mle_pretrained.disambiguate(tokenized)
    #lemmas = [d.analyses[0].analysis['lex'] for d in disambig]
    return tokenized

In [14]:
def preprocess_text(text, clean_text=clean_text, normalize_text=normalize_text, enrichement=enrichement):
    '''
    Inputs:
    text: string in which it will preprocess.
    
    Outputs:
    cleaned_text: the string after it was processed.
    '''
    
    #let's first start with cleaning 
    cleaned = clean_text(text)
            
    # now, for the normalization part
    normalized = normalize_text(cleaned)
    
    #enrichement
    enriched = enrichement(normalized)
    
    return enriched

In [15]:
def preprocess_data(data, preprocess_text=preprocess_text):
    '''
    Inputs:
    data: is the dataframe we want to apply our function on
    
    Outputs:
    data: the dataframe after applying the cleaning and normalization on each cell.
    
    '''
    for row in range(data.shape[0]):
        for column in range(data.shape[1]):
            for_cleaning = str(data.iloc[row][column])
            data.iloc[row][column] = preprocess_text(for_cleaning)
    return data
    

In [16]:
#data = preprocess_data(data)
#data.head()
#text="واذكر عبدنا أيوب إذ نادى ربه أني مسني الشيطان بنصب وعذاب. اركض برجلك هذا مغتسل بارد وشراب. ووهبنا له أهله ومثلهم معهم رحمة منا وذكرى لأولي الألباب. وخذ بيدك ضغثا فاضرب به ولا تحنث إنا وجدناه صابرا نعم العبد إنه أواب."
#text = text.lower().replace('\n', ' ').replace('\t', ' ').replace('\xa0',' ') #get rid of problem chars
#text = ' '.join(text.split())#a quick way of removing excess whitespace
text="من هو الذي اول المسلمين و شكرا"
text=preprocess_text(text)
text=' '.join(text)
text

'المسلمين شكرا'

In [17]:
#data.iloc[0][1]

In [18]:
text = re.sub("؟", "?", text) # replace any ؟ with ?
sentences = []
for sen in nltk.sent_tokenize(text):
    sentences.append(sen)
print(len(sentences))
print(sentences)

1
['المسلمين شكرا']


# Elmo

In [19]:
#!pip install tensorflow_hub

In [20]:
#!pip install tensorflow==1.15
#!pip install "tensorflow_hub>=0.6.0"
#!pip3 install tensorflow_text==1.15
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
#import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
#import tensorflow_text


Instructions for updating:
non-resource variables are not supported in the long term


In [21]:
url = "https://tfhub.dev/google/elmo/3"
embed = hub.Module(url)

KeyboardInterrupt: 

In [None]:
# This tells the model to run through the ‘sentences’ list and return the default output (1024 dimension sentence vectors).
embeddings = embed(sentences,signature="default",
                   as_dict=True)["elmo"]
#Start a session and run ELMo to return the embeddings in variable x
with tf.Session() as sess:  
  sess.run(tf.global_variables_initializer())
  sess.run(tf.tables_initializer())
  context = sess.run(embeddings)
# Number of sentences in the context:
print(len(context))
# Embeddings for the context:
print(context)

# AraBERT

In [None]:
#!pip install sentence_transformers

In [None]:
""""
from torch.utils.data import DataLoader
from torch import nn
from sentence_transformers import LoggingHandler, SentenceTransformer, util, models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *

from sklearn.model_selection import train_test_split
import pandas as pd
import datetime
import os
import math
""""