# Arabic Language Pre-Processing

In this notebook I'll clean the data and split it to train,val,test for using with different models.

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import re
import string
import emoji

In [2]:
data = pd.read_csv("../input/arabic-dialect/dialect_data.csv",lineterminator='\n')
data.head()

Unnamed: 0,id,dialect,text
0,1175358310087892992,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,IQ,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458197 entries, 0 to 458196
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       458197 non-null  int64 
 1   dialect  458197 non-null  object
 2   text     458197 non-null  object
dtypes: int64(1), object(2)
memory usage: 10.5+ MB


## Helper Functions and variables for cleaning


In [4]:
# I don't need numbers and some tweets contained arabic numbers so I added them as punctuations.
arabic_punctuations = '''`÷×؛<>()*&^%][،/:"؟.,'{}~¦+|!”…“–•۱۲۳٤٥٦٧۸۹٠'''
translator = str.maketrans('', '', arabic_punctuations)

def give_emoji_free_text(text):
    # this function is better than encoding emojis pattern because of optimization.
    return emoji.get_emoji_regexp().sub(r'', text)

def normalizeArabic(text):
    text = text.strip()
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    text = ''.join(ch for ch in text)
    return text


In [5]:
def pre_processing(text):

    # removing longitaion
    text = re.sub(r'(.)\1+', r"\1\1", text)
    # remove hashtags
    text = re.sub(r'#', '', text)
    # removing user name
    text = re.sub('@[^\s]+', ' ', text)
    #Convert www.* or https?://* to " "
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',text)
    
    # removing \n because I found it in some tweets while insepecting the data
    text = re.sub(r"\n"," ",text)
    # removing english characters and numbers
    text = re.sub(r'[A-Za-z0-9]', '', text)
    # removing puntication at last because of # and @
    text = text.translate(translator)

    # this one to remove underscore from text because it was affecting hashtags when it was in puntications
    text = re.sub(r'_',' ',text)

    # removing emoji and tashikel
    text = give_emoji_free_text(text)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ     # Sukun
                         """, re.VERBOSE)
    text = re.sub(noise, '', text)

    
    # normalizing text
    text = normalizeArabic(text)
    
    # removing space more than 1
    text = re.sub(r'\s{2,}', ' ', text)
    
    return text

In [6]:
text = '• ۱۲۳٠ ﻋﻤﺮﻱ ﻣﺎ ﻓﻜﺮﺕ ﺃﺑﻴﻊ ﺍﻷﻳﺎﻡ ﺍﻟﺤﻠﻮﺓ ﺍﻟﻠﻲ ﻋﺸﺘﻬﺎ ﻣﻊ ﺃﻱ ﺷﺨﺺ . ﻣﻤﻜﻦ ﺃﻧﺴﺤﺐ، ﺃﺗﺮﺍﺟﻊ، ﺃﺯﻋﻞ ﻭﺃﻛﺮﻩ ﺑﺸﻜﻞ ﻣﺆﻗﺖ، ﻟﻜﻦ ﻋﻤﺮﻱ ﻣﺎ ﻓﻜﺮﺕ ﺃﺑﻴﻊ . ﺍﻟﻤﺴﺄﻟﺔ ﻣﺴﺄﻟﺔ ﻣﺒﺪﺃ 😸🥀'
pre_processing(text)

'ﻋﻤﺮﻱ ﻣﺎ ﻓﻜﺮﺕ ﺃﺑﻴﻊ ﺍﻷﻳﺎﻡ ﺍﻟﺤﻠﻮﺓ ﺍﻟﻠﻲ ﻋﺸﺘﻬﺎ ﻣﻊ ﺃﻱ ﺷﺨﺺ ﻣﻤﻜﻦ ﺃﻧﺴﺤﺐ ﺃﺗﺮﺍﺟﻊ ﺃﺯﻋﻞ ﻭﺃﻛﺮﻩ ﺑﺸﻜﻞ ﻣﺆﻗﺖ ﻟﻜﻦ ﻋﻤﺮﻱ ﻣﺎ ﻓﻜﺮﺕ ﺃﺑﻴﻊ ﺍﻟﻤﺴﺄﻟﺔ ﻣﺴﺄﻟﺔ ﻣﺒﺪﺃ'

## Applying to all text

In [7]:
data['clean_text'] = data['text'].apply(lambda x: pre_processing(str(x))) 

## Inspecting data more

In [8]:
# this cell to make sure that there is no length == 0 in any text!

data['length'] = data['clean_text'].apply(lambda x: len(x)) 

In [9]:
data[data['length']<13]

Unnamed: 0,id,dialect,text,clean_text,length
1404,446738898316959680,IQ,مو ؟... لو مو مو ؟. http://t.co/Xe7xDjh7lA,مو لو مو مو,11
3914,1165654350418649088,IQ,@dthvadk مو صح لو غلط,مو صح لو غلط,12
20396,1175015239508799488,LY,@adbav هههههه تي شن في 😂,هه تي شن في,11
21248,1140048130844635136,LY,@abdoosalama29 ههههههههههههههههههههههههههههههه...,هه تي اي بخت,12
21288,1137851453731680384,LY,ههههههههههههه تي شن في https://t.co/GSND4BokZ2,هه تي شن في,11
...,...,...,...,...,...
437244,1064234570021777408,BH,@DER3_ALJAZEERA @Toomaa_6 @aml_1_1 لآآآآآآآ آآ...,لاا اا اا اا,12
441174,1173995178048348160,BH,@hamadaoos212 @Toomaa_6 @umsaleh5_ صج صج ، بد بخت,صج صج بد بخت,12
442686,1141748042737041408,BH,@AliFadhel87 ما في اقل ل 🤔,ما في اقل ل,11
451908,991730844489437184,BH,@greeen_mojo هههههههه يهف كل شي ..,هه يهف كل شي,12


### tweets with length less than 12 seems unimportant so I'll drop them.

In [10]:
final_data = data[data['length']>11]

In [11]:
#final_data.info()
# dropping the text before cleaning column

final_data.drop("text", axis=1, inplace=True)
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458149 entries, 0 to 458196
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          458149 non-null  int64 
 1   dialect     458149 non-null  object
 2   clean_text  458149 non-null  object
 3   length      458149 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 17.5+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Spliting the data for train, val, test.

### First let's split train and (val + test) 60%-40% then val and test 20%-20%

In [12]:
train_df,test_val_df = train_test_split(final_data, test_size=0.2, random_state=42,stratify = final_data['dialect'])
test_df,val_df = train_test_split(test_val_df, test_size=0.5, random_state=42,stratify = test_val_df['dialect'])

In [13]:
print(len(train_df))
print(len(test_df))
print(len(val_df))

366519
45815
45815


In [14]:
# save them in csv files for next steps

train_df.to_csv("train.csv",index=False)
test_df.to_csv("test.csv",index=False)
val_df.to_csv("validation.csv",index=False)

### Pre-Processing techniques used:

- Removing all numbers, emojis and tashkill because we don't need them in predicting the dialect.
- Normalize the text.
- Removing punctautions.