### Import necessary packages

In [1]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import nlp_helpers as nh

### Load Data

In [2]:
# Directory that holds the dataset files used by this notebook.
root_path = '../input/ar-dialect-data'
# Derive both file paths from their file names in a single pass.
data_path, curr_path = (
    os.path.join(root_path, file_name)
    for file_name in ('data_with_text.csv', 'curr.txt')
)

In [3]:
# Load the tweets table. lineterminator='\n' is passed explicitly to the parser;
# presumably so that other line-break characters inside tweet text are not
# treated as row boundaries — TODO confirm.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index; consider index_col=0 if it is not needed downstream — confirm.
data = pd.read_csv(data_path, lineterminator='\n')
# Preview the first rows.
data.head()

Unnamed: 0,id,text,dialect
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,IQ
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ


### Steps:
1- **Clean hashtags**:  
 - hashtags may contain some useful words that may help with dialect identification
 - so, in this step, I will remove the "#" symbol, then split the hashtag on the "_" (underscore) character, so that all of its words are separated by spaces  

2- **Normalize text**:
- the Arabic alphabet contains letters like ('إأٱآ') that people use interchangeably, for example (أحمد) and (احمد); this increases the vocabulary size, and the model may treat them as two different words
- normalization will include: ('إأٱآ') ----> (ا), ('ؤئ') ----> (ء), (ة) ----> (ه), and (ى) ----> (ي)   

3- **Keep Arabic characters only**:
- after text normalization, we are sure that all Arabic letters are these 29 characters (ابتثجحخدذرزسشصضطظعغفقكلمنهويء)
- this step will remove tashkeel like (ُ ), English words, emojis, links, etc.


4- **Remove repeated letters**:
- this step will keep only one letter of any arabic letter that is repeated "n" or more times, here I've set n=3.  
- example: (جوووول) ----> (جول), while keeping 2 repeated letters like (ممكن)    

5- **Tokenization**:
- now we are ready to tokenize all tweets.
- each tweet will be a list of words  

6- **Remove stop words**  

7- **Stemming**:
- stemming will lead to:
    * it will conflate some words: (بلّش) with `shaddah`, a Lebanese word meaning "begin", and (بلاش), which means "free" and is widely used in Egypt, will both become (بلش)
    * so, this will make it difficult for the model to differentiate between the two dialects.
    * other words like (فطور) and (فطار) will both become (فطر), which may increase ambiguity between the two dialects
    * ***so, to test this effect on different models, and because stemming requires a different ordering of the previous steps, I will perform it in another notebook.***


### Clean hashtags

In [4]:
# a tweet that contains a hashtag, shown as it looks before cleaning
data.loc[420, 'text']

'قالوا قادمون .. چان أحنا نروحلهم\nاذا مو گد السالفة ليش تحچون؟\n #كلمه_لشهداء_العراق https://t.co/33s1ta2JmD'

In [5]:
# Run nh.clean_hashtags over every tweet and overwrite the column with the
# result. Because the column is replaced in place, re-running this cell feeds
# already-cleaned text back into the helper.
# (Helper body is not shown here; presumably it drops '#' and turns '_' into
# spaces, as the before/after example suggests — verify in nlp_helpers.)
data['text'] = data['text'].apply(nh.clean_hashtags)

In [6]:
# the same tweet once its hashtag has been cleaned
data.loc[420, 'text']

'قالوا قادمون .. چان أحنا نروحلهم\nاذا مو گد السالفة ليش تحچون؟\n كلمه لشهداء العراق https://t.co/33s1ta2JmD'

### Normalize text

In [7]:
# examples before normalization, printed one per line in this order:
#   18 -> alef, 458045 -> hamzah, 458175 -> taa marbotah, 24 -> alef layennah
print(*data['text'].loc[[18, 458045, 458175, 24]], sep='\n')

@sfer661 يأكلون بخيرنه ويهينون  موظفينه ..
@mimi1562  هنا السؤال اي واحد اقوى الكف لو لطراق 🙄
@Bh Elections @Anas Al Shaikh خيمة تجار شبعانين ما عليهم من الفقير
@ha   m   ed كل جماعة على قدر عقولها😂


In [8]:
# Run nh.normalize_text over every tweet and overwrite the column in place.
# (Helper body is not shown here; presumably it unifies the alef, hamzah,
# taa marbotah and alef layennah variants, as the before/after examples
# suggest — verify in nlp_helpers.)
data['text'] = data['text'].apply(nh.normalize_text)

In [9]:
# examples after normalization, printed one per line in this order:
#   18 -> alef, 458045 -> hamzah, 458175 -> taa marbotah, 24 -> alef layennah
print(*data['text'].loc[[18, 458045, 458175, 24]], sep='\n')

@sfer661 ياكلون بخيرنه ويهينون  موظفينه ..
@mimi1562  هنا السءال اي واحد اقوي الكف لو لطراق 🙄
@Bh Elections @Anas Al Shaikh خيمه تجار شبعانين ما عليهم من الفقير
@ha   m   ed كل جماعه علي قدر عقولها😂


### Remove any non arabic alphabet character

In [10]:
# first rows before removal: they still contain mentions, punctuation and emojis
data['text'].head()

0     @Nw8ieJUwaCAAreT لكن بالنهايه .. ينتفض .. يغير .
1    @7zNqXP0yrODdRjK يعني هذا محسوب علي البشر .. ح...
2                      @KanaanRema مبين من كلامه خليجي
3           @HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4                   @hmo2406 وين هل الغيبه  اخ محمد 🌸🌺
Name: text, dtype: object

In [11]:
# Run nh.clean_doc_arabic over every tweet and overwrite the column in place.
# (Helper body is not shown here; presumably it drops every character that is
# not an Arabic letter, as the before/after previews suggest — verify in
# nlp_helpers.)
data['text'] = data['text'].apply(nh.clean_doc_arabic)

In [12]:
# data after removing: english words, emojis, etc. are gone
data['text'].head()

0                           لكن بالنهايه  ينتفض  يغير 
1     يعني هذا محسوب علي البشر  حيونه ووحشيه  وتطلب...
2                                  مبين من كلامه خليجي
3                            يسلملي مرورك وروحك الحلوه
4                              وين هل الغيبه  اخ محمد 
Name: text, dtype: object

### Remove repeated letters

In [13]:
# a tweet with long runs of one letter, shown before the runs are collapsed
data.loc[443704, 'text']

'ااااااءءءءءء\n\nتاخروا شكلهم عندهم قرقاعون بمنطقه ثانيه\n '

In [14]:
# collapse every run of `n` or more identical letters down to a single letter
n = 3
data['text'] = data['text'].apply(lambda tweet: nh.remove_repeated(tweet, n))

In [15]:
# the same tweet once its repeated-letter runs have been collapsed
data.loc[443704, 'text']

'اء\n\nتاخروا شكلهم عندهم قرقاعون بمنطقه ثانيه\n '

### Tokenization

In [16]:
# Run nh.tokenize over every tweet and overwrite the column in place; from this
# point on each entry is displayed as a list of words rather than one string.
data['text'] = data['text'].apply(nh.tokenize)

In [17]:
# first rows after tokenization: each tweet is now a list of tokens
data['text'].head()

0                         [لكن, بالنهايه, ينتفض, يغير]
1    [يعني, هذا, محسوب, علي, البشر, حيونه, ووحشيه, ...
2                             [مبين, من, كلامه, خليجي]
3                       [يسلملي, مرورك, وروحك, الحلوه]
4                          [وين, هل, الغيبه, اخ, محمد]
Name: text, dtype: object

### Remove stop words

In [18]:
# Fetch the NLTK 'stopwords' corpus that the next cell reads; NLTK only reports
# that the package is up to date when it is already present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Arabic stop-word list from the NLTK 'stopwords' corpus.
arabic_stopwords = stopwords.words('arabic')
# Peek at the first ten entries; they are still in their raw, un-normalized form.
arabic_stopwords[:10]

['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي']

In [20]:
# The stop words must get the same treatment as the tweets before they can be
# matched: strip tashkeel first, then normalize the letters.
# A set is used for two reasons: entries that become identical after
# normalization collapse into one, and membership checks are faster than on a list.
print('Stop words length before processing:', len(arabic_stopwords))
arabic_stopwords = {
    nh.normalize_text(nh.remove_tashkeel(word)) for word in arabic_stopwords
}
print('Stop words length after processing:', len(arabic_stopwords))

Stop words length before processing: 754
Stop words length after processing: 622


In [21]:
# token list of the first tweet while it still includes stop words
data.loc[0, 'text']

['لكن', 'بالنهايه', 'ينتفض', 'يغير']

In [22]:
# Pass every token list through nh.remove_stopwords, handing the processed
# stop-word set to the helper as its second positional argument, and overwrite
# the column with the result.
data['text'] = data['text'].apply(nh.remove_stopwords, args=(arabic_stopwords,))

In [23]:
# token list of the first tweet once stop words have been filtered out
data.loc[0, 'text']

['بالنهايه', 'ينتفض', 'يغير']

### Save as pickle file
A pickle file is the better choice here because the `text` column now contains Python lists, which pickle stores as-is.

In [24]:

# Persist the processed DataFrame via the project helper.
# NOTE(review): the output name is relative, so the file presumably lands in the
# current working directory — confirm against nh.save_pickle_file.
nh.save_pickle_file(data, 'preprocessed_data_no_stem.obj')