# Import libraries

In [118]:
! pip install nltk

[0m

In [2]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993241 sha256=a2d6700095c305451a1b7e676be81ab76c1fc4b77d3a3c6632688c89bcb7fe1b
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
[0m

In [117]:
import numpy as np 
import pandas as pd
import re
from langdetect import detect_langs
import nltk
from nltk.corpus import stopwords

# Explore data

In [4]:
# Read the raw CSV into a DataFrame, then print its summary
# (column names, non-null counts, dtypes and memory usage).
data = pd.read_csv("/kaggle/input/tweeter-data/raw_data.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147725 entries, 0 to 147724
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       147725 non-null  int64 
 1   dialect  147725 non-null  object
 2   text     147725 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.4+ MB


In [5]:
# Count the rows that repeat an earlier row in every column.
data.duplicated().sum()

0

In [6]:
# Check class balance: show the dialect labels and, for each one,
# its share of all rows expressed as a percentage.
(data["dialect"].value_counts()).index, ((data["dialect"].value_counts()).values / len(data)) *100

(Index(['EG', 'LY', 'LB', 'SD', 'MA'], dtype='object'),
 array([39.0157387 , 24.7073955 , 18.69487223,  9.77085801,  7.81113556]))

### Data imbalance: EG has the most samples

In [7]:
# Raise pandas' display limits so long text cells and wide/long frames are not truncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 1900)
pd.set_option('display.max_colwidth', 1000)

In [8]:
# Display a sample of the text column: the slice from position 100 up to (not including) 200.
data["text"][100:200]

100                                                                                                                                                                                                                                     @kawtheraljahmi الله يسلمك، بالتأكيد مفيدة.
101                                                                                                                                                                                                                                    @AAlkoat عيدك مبروك وعقبال داير إن شاء الله.
102                                                                                                                                                                                                                                    @MoheBishte تسلم محي، عيدك مبارك إن شاء الله
103                                                                                                                                                                         

# For cleaning we need to
- remove English (Latin-script) text
- remove numbers
- remove punctuation
- remove underscores
- remove emoji and characters from other scripts
- remove stop words

In [9]:
def remove_english_language(text):
    """Replace every run of ASCII letters and/or whitespace in ``text`` with one space.

    Args:
        text: The string to clean.

    Returns:
        The string with each matched run collapsed to a single space.
    """
    latin_or_space = re.compile(r"[a-zA-Z\s]+")
    return latin_or_space.sub(" ", text)

In [10]:
def remove_numbers(text):
    """Replace every run of digits in ``text`` with a single space.

    Args:
        text: The string to clean.

    Returns:
        The string with each digit run swapped for one space.
    """
    return re.sub(r'\d+', " ", text)

In [11]:
def remove_puncituation(text):
    """Replace each character that is neither a word character nor whitespace with a space.

    Underscores count as word characters for the pattern, so they are left in place.

    Args:
        text: The string to clean.

    Returns:
        The string with every such character replaced one-for-one by a space.
    """
    non_word_non_space = re.compile(r"[^\w\s]")
    cleaned = non_word_non_space.sub(" ", text)
    return cleaned

In [12]:
def remove_uderScore(text):
    """Replace each underscore in ``text`` with a space.

    Args:
        text: The string to clean.

    Returns:
        The string with every ``_`` replaced one-for-one by a space.
    """
    return re.sub(r"_", " ", text)

In [13]:
def remove_Tifinagh_characters(text):
    """Replace every run of characters in the range U+2D30..U+2D7F with a single space.

    Args:
        text: The string to clean.

    Returns:
        The string with each run of characters from that range swapped for one space.
    """
    tifinagh_run = re.compile(r'[\u2D30-\u2D7F]+')
    return tifinagh_run.sub(" ", text)

In [91]:
def remove_additional_space(text):
    """Collapse every run of two or more whitespace characters into one space.

    A lone whitespace character (including a single tab or newline) is left as it is.

    Args:
        text: The string to clean.

    Returns:
        The string with each multi-character whitespace run replaced by a single space.
    """
    squeezed = re.sub(r"\s{2,}", " ", text)
    return squeezed

### Try removing stop words

In [134]:
# Download NLTK's 'stopwords' package and load its Arabic word list.
nltk.download('stopwords')
list_of_stop_words = stopwords.words('arabic')
# Show which container type the stop words were returned in.
type(list_of_stop_words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


list

In [144]:
# Normalise the stop words: an alef carrying a hamza (إ or أ) at the start of a word
# is replaced with a bare alef (ا).
# The pattern is compiled once, outside the loop, and its character class holds only the
# two hamza forms (a comma inside [...] is a literal comma, not a separator).
pattern = re.compile(r'\b[إأ]')
modified_list_stop_words = [pattern.sub('ا', word) for word in list_of_stop_words]
    

In [148]:
def remove_stop_words(text):
    """Delete every whole-word occurrence of a stop word from ``text``.

    The words come from the module-level ``modified_list_stop_words`` list. Each match is
    replaced with an empty string, so the whitespace that surrounded it stays in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the stop words removed, or ``text`` unchanged when the list holds
        no non-empty word.
    """
    # Escape each word so regex metacharacters are matched literally, and drop empty
    # entries: an empty alternative would match first and hide the words listed after it.
    escaped_words = [re.escape(word) for word in modified_list_stop_words if word]
    if not escaped_words:
        return text
    # \b on both sides restricts matches to whole words.
    pattern = re.compile(r'\b(?:' + '|'.join(escaped_words) + r')\b')
    return pattern.sub('', text)

In [152]:
def all_cleaning(text):
    """Run every cleaning step on ``text``, in a fixed order, and return the result.

    Args:
        text: The string to clean.

    Returns:
        The string after all steps have been applied.
    """
    # Order matters: each step receives the output of the previous one, and the
    # whitespace squeeze runs last to tidy the gaps left by the earlier replacements.
    cleaning_steps = (
        remove_english_language,
        remove_puncituation,
        remove_numbers,
        remove_uderScore,
        remove_Tifinagh_characters,
        remove_stop_words,
        remove_additional_space,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [153]:
# Work on a deep copy so the cleaning below can never leak back into the raw `data` frame
# (a shallow copy shares its underlying buffers with the original).
cleaned_data = data.copy()

In [154]:
# Run the full cleaning pipeline on every value of the text column and
# store the result back into that same column.
cleaned_data["text"] = cleaned_data["text"].apply(all_cleaning)

In [155]:
# Preview the first rows of the cleaned frame.
cleaned_data.head()

Unnamed: 0,id,dialect,text
0,1009754958479151232,LY,قليلين ادب ومنافقين اختهم قريبتهم تتعاكس تقولي عليهم نشاط حقوق المرأة ردة فعلهم
1,1009794751548313600,LY,الليبيين متقلبين بالنسبة ليا ميليشياوي زمان وتوة
2,1019989115490787200,LY,تانيه شاب ليبي بيرتاح لبنت مختلفة ويلاحظ انها البنات وبيحس كأنه يعرفها زمان بعدين يتزوج وحدة منهن وممكن ولاثلاثة وتنقلب الرومانسية لعياط وشياط وتهزيب اند
3,1035479791758135168,LY,رانيا عقليتك متخلفة اولا الانسان يلي يحتاج اهل يخاف منهم علشان يكون محترم انسان قليل الادب ثانيا شن ذنب يلي معندهش اب خوت خوات يعني اليتيمة متستحقش تتزوج وثالثا ليش البنت لازم ادير حساب للراجل متستحقش يندارلها حساب عبدة
4,1035481122921164800,LY,شكلك متعقدة علشان الراجل تحبيه ازوج بنت يتيمة بنت معندهش خوت هدي اعصابك وفكينا التخلف امتاعك
