## Import Python libraries

In [2]:
import pandas as pd
import numpy as np
import requests
import json
import re

## Extracting data

In [3]:
# Load the id -> dialect label table from the working directory and preview
# its first five rows.
dialect_dataset =  pd.read_csv("./dialect_dataset.csv")
dialect_dataset.head()

Unnamed: 0,id,dialect
0,1175358310087892992,IQ
1,1175416117793349632,IQ
2,1175450108898565888,IQ
3,1175471073770573824,IQ
4,1175496913145217024,IQ


## Transform

In [4]:
# Summarise the dialect table: row count, per-column non-null counts and dtypes.
dialect_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458197 entries, 0 to 458196
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       458197 non-null  int64 
 1   dialect  458197 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


### Check datatype

In [5]:
# Print the dtype of every column in the dialect table.
print(dialect_dataset.dtypes)

id          int64
dialect    object
dtype: object


### Convert Missing Value Codes to NaNs

In [9]:
# Treat the placeholder strings '', '?', 'unknown' and 'missing' as missing data
# by turning them into NaN in a single pass, then count NaNs per column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
dialect_dataset = dialect_dataset.replace(['', '?', 'unknown', 'missing'], np.nan)
dialect_dataset.isnull().sum()

id         0
dialect    0
dtype: int64

### Duplicated

In [10]:
# Count rows that repeat an earlier row in every column.
dialect_dataset.duplicated().sum()

0

### Check unique

In [11]:
# Number of distinct dialect labels.
dialect_dataset['dialect'].nunique()

18

In [12]:
# Row count per dialect label; the output shows the classes are imbalanced.
dialect_dataset['dialect'].value_counts()

EG    57636
PL    43742
KW    42109
LY    36499
QA    31069
JO    27921
LB    27617
SA    26832
AE    26296
BH    26292
OM    19116
SY    16242
DZ    16183
IQ    15497
SD    14434
MA    11539
YE     9927
TN     9246
Name: dialect, dtype: int64

### Categorical Variable
Machine learning models require all input and output variables to be numeric. This means that if your data contains categorical data, you must encode it as numbers before you can fit and evaluate a model.

Some algorithms can work with categorical data directly. For example, a decision tree can be learned directly from categorical data with no data transform required (this depends on the specific implementation).

Many machine learning algorithms cannot operate on label data directly. They require all input variables and output variables to be numeric.

### Nominal and Ordinal Variables
Some categories may have a natural relationship to each other, such as a natural ordering.

If the variable has a natural ordering of values, it is called an ordinal variable because the values can be ordered or ranked.

**Ordinal Variable.** Variable comprises a finite set of discrete values with a ranked ordering between values.

**Nominal Variable (Categorical).** Variable comprises a finite set of discrete values with no relationship between values.

### OneHotEncoder() vs pandas.get_dummies

#### pd.get_dummies()
#### pros
##### performs dummy encoding in a single line of code.
##### it returns a pandas data frame with clean column names.
#### cons
##### It cannot learn the characteristics from the training data and hence is unable to propagate its findings onto the test dataset.
##### If the set of unique values in a categorical column differs between the training set and the test set, the two encoded frames end up with different columns. In other words, the test data may or may not contain every category seen in the training data (and may contain new ones), which can cause column-mismatch errors while modelling.

#### OneHotEncoder().
does the same things as get dummies but in addition,

##### OneHotEncoder stores the learned categories in its object. This is extremely useful when you want to apply the same pre-processing to the test data: the encoder can transform the test dataframe using the categories it was fit on from the training set.

New feature categories can also be handled by passing `handle_unknown='ignore'` to OneHotEncoder.

In [13]:
# One-hot encode the 'dialect' column into one indicator column per label
# (named 'dialect_<LABEL>'); the 'id' column is left untouched.
# dtype is pinned to uint8 so the indicators stay 0/1 on every pandas version:
# since pandas 2.0 the default is bool, which would show and export True/False.
dialect_dataset = pd.get_dummies(data=dialect_dataset, columns=['dialect'], dtype=np.uint8)
dialect_dataset

Unnamed: 0,id,dialect_AE,dialect_BH,dialect_DZ,dialect_EG,dialect_IQ,dialect_JO,dialect_KW,dialect_LB,dialect_LY,dialect_MA,dialect_OM,dialect_PL,dialect_QA,dialect_SA,dialect_SD,dialect_SY,dialect_TN,dialect_YE
0,1175358310087892992,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1175416117793349632,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1175450108898565888,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1175471073770573824,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1175496913145217024,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458192,1019484980282580992,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
458193,1021083283709407232,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
458194,1017477537889431552,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
458195,1022430374696239232,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
# List the column labels after one-hot encoding.
dialect_dataset.columns

Index(['id', 'dialect_AE', 'dialect_BH', 'dialect_DZ', 'dialect_EG',
       'dialect_IQ', 'dialect_JO', 'dialect_KW', 'dialect_LB', 'dialect_LY',
       'dialect_MA', 'dialect_OM', 'dialect_PL', 'dialect_QA', 'dialect_SA',
       'dialect_SD', 'dialect_SY', 'dialect_TN', 'dialect_YE'],
      dtype='object')

### lowercase columns

In [15]:
# Normalise every column label in place: trim surrounding whitespace and
# lowercase it (e.g. 'dialect_AE' -> 'dialect_ae').
dialect_dataset.columns = [label.strip().lower() for label in dialect_dataset.columns]
dialect_dataset.columns

Index(['id', 'dialect_ae', 'dialect_bh', 'dialect_dz', 'dialect_eg',
       'dialect_iq', 'dialect_jo', 'dialect_kw', 'dialect_lb', 'dialect_ly',
       'dialect_ma', 'dialect_om', 'dialect_pl', 'dialect_qa', 'dialect_sa',
       'dialect_sd', 'dialect_sy', 'dialect_tn', 'dialect_ye'],
      dtype='object')

#### Load messages.csv
It's a big file, so we read it with `engine='python'` (slower than the default C parser, but more feature-complete).

In [17]:
# Column names to assign to messages.csv when it is read (passed as names=).
header_list = ["id", "messages"]

In [44]:
# Read the messages file with the Python parser engine, skipping the file's
# first row and using header_list as the column names instead.
# NOTE(review): info() on this frame reports 'id' as object dtype with nulls,
# so some rows do not parse into a clean (id, message) pair - presumably
# messages with embedded line breaks or delimiters; confirm against the raw file.
message_dataset=pd.read_csv('./messages.csv',names=header_list, skiprows=[0],engine='python')
message_dataset

Unnamed: 0,id,messages
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺
...,...,...
458751,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...
458752,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...
458753,,0
458754,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...


In [19]:
# Summarise the messages table: row count, per-column non-null counts and dtypes.
message_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458756 entries, 0 to 458755
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        458296 non-null  object
 1   messages  458661 non-null  object
dtypes: object(2)
memory usage: 7.0+ MB


### Convert Missing Value Codes to NaNs


In [45]:
# Treat the placeholder strings '', '?', 'unknown' and 'missing' as missing data
# by turning them into NaN in a single pass.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
message_dataset = message_dataset.replace(['', '?', 'unknown', 'missing'], np.nan)

In [46]:
# Drop every row with a NaN in any column (missing id or missing message).
message_dataset = message_dataset.dropna()

In [47]:
# Confirm that no NaNs remain after the drop.
message_dataset.isnull().sum()

id          0
messages    0
dtype: int64

### Duplicated

In [48]:
# Count rows that repeat an earlier row in both id and message.
message_dataset.duplicated().sum()

4

In [49]:
# Show the rows flagged as repeats of an earlier row.
message_dataset[message_dataset.duplicated()]

Unnamed: 0,id,messages
458751,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...
458752,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...
458754,1057418989293485952,@mycousinvinnyys @hanyamikhail1 متهيالي دي شكو...
458755,1055620304465215616,@MahmoudWaked7 @maganenoo في طريق مطروح مركز ب...


In [50]:
# Cast the ids (parsed as object dtype) to integers so they match the int64
# 'id' column of the dialect table used as the merge key.
# int64 is requested explicitly: plain `int` maps to a 32-bit integer on
# Windows with NumPy < 2, which cannot hold these 19-digit ids.
message_dataset['id'] = message_dataset['id'].astype('int64')
# Positional lookup: index label 0 is not guaranteed to survive dropna().
type(message_dataset['id'].iloc[0])

numpy.int64

### keep arabic letters

In [51]:
# First attempt: delete ASCII letters and '?' from two sample messages
# (index labels 10 and 0). Digits, '@' and emoji are left behind.
for row_label in (10, 0):
    print(re.sub(r'[a-zA-Z?]', '', message_dataset['messages'][row_label]).strip())

@1277 والله هذا الموضوع جداً حساس ويحير اتفق معك 😂😂😂بس انت لاتروح زايد عادي☺️
@8 لكن بالنهاية .. ينتفض .. يغير .


In [52]:
# Character count and text of the message stored at index label 10.
print(len(message_dataset['messages'][10]))
print(message_dataset['messages'][10])

85
@kamal1277New والله هذا الموضوع جداً حساس ويحير اتفق معك 😂😂😂بس انت لاتروح زايد عادي☺️


In [53]:
# Delete ASCII letters and '?' from the sample messages at index labels 10 and 0;
# the printed result still contains digits, '@' and emoji.
for sample_label in (10, 0):
    sample_text = message_dataset['messages'][sample_label]
    print(re.sub(r'[a-zA-Z?]', '', sample_text).strip())

@1277 والله هذا الموضوع جداً حساس ويحير اتفق معك 😂😂😂بس انت لاتروح زايد عادي☺️
@8 لكن بالنهاية .. ينتفض .. يغير .


In [54]:
# Second attempt: replace every character outside the Unicode Arabic block
# (U+0600-U+06FF) with a space, then trim the ends. Mentions, digits,
# punctuation and emoji all disappear, leaving runs of spaces in their place.
for row_label in (10, 0):
    print(re.sub(r'[^\u0600-\u06FF]', ' ', message_dataset['messages'][row_label]).strip())

والله هذا الموضوع جداً حساس ويحير اتفق معك    بس انت لاتروح زايد عادي
لكن بالنهاية    ينتفض    يغير


In [55]:
# Matches any single character that is neither in the Unicode Arabic block
# (U+0600-U+06FF) nor whitespace.
_OUTSIDE_ARABIC_BLOCK = re.compile(r'[^\u0600-\u06FF\s]')


def clean_arabic_text(text):
    """Blank out everything that is not Arabic-block text or whitespace.

    Each character outside U+0600-U+06FF that is not whitespace is replaced
    by a single space, so removed runs leave runs of spaces behind. Leading
    and trailing whitespace is then stripped.

    Note that the Arabic block also contains Arabic punctuation, digits and
    diacritics; those characters are kept, not removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing but whitespace survives.
    """
    return _OUTSIDE_ARABIC_BLOCK.sub(' ', text).strip()

In [56]:
# Clean every message and overwrite the 'messages' column with the result;
# the raw text is no longer available in this frame afterwards.
cleaned_messages = message_dataset['messages'].apply(clean_arabic_text)
message_dataset['messages'] = cleaned_messages

# Print the cleaned column only.
print("\nDataFrame with Cleaned Arabic Text:")
print(message_dataset[['messages']])



DataFrame with Cleaned Arabic Text:
                                                 messages
0                           لكن بالنهاية    ينتفض    يغير
1       يعني هذا محسوب على البشر    حيونه ووحشيه    وت...
2                                     مبين من كلامه خليجي
3                               يسلملي مرورك وروحك الحلوه
4                                  وين هل الغيبه  اخ محمد
...                                                   ...
458749                       السحله ضيفي ي بتطلع لك سحليه
458751          متهيالي دي شكولاته الهالوين  فين المحل ده
458752  في طريق مطروح مركز بهيج  والمركز الي الي جمبه ...
458754          متهيالي دي شكولاته الهالوين  فين المحل ده
458755  في طريق مطروح مركز بهيج  والمركز الي الي جمبه ...

[458201 rows x 1 columns]


### combining data
#### Merge message_dataset and dialect_dummy datasets.
Pandas implements several of these fundamental building-blocks in the pd.merge() function and the related join() method of Series and Dataframe

In [57]:
# Inner-join the cleaned messages with the one-hot dialect labels on 'id':
# only ids present in both tables are kept, and an id that occurs several
# times on one side yields several rows in the result.
full_dataset = message_dataset.merge(dialect_dataset, how='inner', on='id')
full_dataset

Unnamed: 0,id,messages,dialect_ae,dialect_bh,dialect_dz,dialect_eg,dialect_iq,dialect_jo,dialect_kw,dialect_lb,dialect_ly,dialect_ma,dialect_om,dialect_pl,dialect_qa,dialect_sa,dialect_sd,dialect_sy,dialect_tn,dialect_ye
0,1175358310087892992,لكن بالنهاية ينتفض يغير,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1175416117793349632,يعني هذا محسوب على البشر حيونه ووحشيه وت...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1175450108898565888,مبين من كلامه خليجي,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1175471073770573824,يسلملي مرورك وروحك الحلوه,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1175496913145217024,وين هل الغيبه اخ محمد,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458196,1019484980282580992,مبسوطين منك اللي باسطانا,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
458197,1021083283709407232,والله ماينده ابش يختي,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
458198,1017477537889431552,شو عملنا لك حنا تهربي مننا احنا مساكين ليش بتع...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
458199,1022430374696239232,الله يبارك فيها وبالعافيه,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
# Count rows of the merged frame that repeat an earlier row in every column.
full_dataset.duplicated().sum()

4

In [59]:
# Treat the placeholder strings '', '?', 'unknown' and 'missing' as missing data
# by turning them into NaN in a single pass. This also catches messages that
# are an empty string.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
full_dataset = full_dataset.replace(['', '?', 'unknown', 'missing'], np.nan)

In [60]:
# NaN count per column of the merged frame.
# NOTE(review): the NaNs in 'messages' are presumably messages left empty by
# the Arabic-only cleaning step - confirm.
full_dataset.isnull().sum()

id             0
messages      80
dialect_ae     0
dialect_bh     0
dialect_dz     0
dialect_eg     0
dialect_iq     0
dialect_jo     0
dialect_kw     0
dialect_lb     0
dialect_ly     0
dialect_ma     0
dialect_om     0
dialect_pl     0
dialect_qa     0
dialect_sa     0
dialect_sd     0
dialect_sy     0
dialect_tn     0
dialect_ye     0
dtype: int64

In [61]:
# Inspect the rows whose message is NaN.
full_dataset[full_dataset['messages'].isna()]

Unnamed: 0,id,messages,dialect_ae,dialect_bh,dialect_dz,dialect_eg,dialect_iq,dialect_jo,dialect_kw,dialect_lb,dialect_ly,dialect_ma,dialect_om,dialect_pl,dialect_qa,dialect_sa,dialect_sd,dialect_sy,dialect_tn,dialect_ye
4457,471056063672750080,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
11433,418829744407592960,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
14196,1116083073697878016,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
18586,785954733043441664,,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
18608,895400665467564032,,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398705,602351471980601344,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
398727,910815596018896768,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
400645,842094542484582400,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
404166,331881299172278272,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [62]:
# Drop the rows whose message is NaN, then drop rows that repeat an earlier
# row in every column: duplicated().sum() on the merged frame reported such
# repeats, and they would otherwise be exported as extra samples.
full_dataset = full_dataset.dropna().drop_duplicates()
full_dataset.isnull().sum()

id            0
messages      0
dialect_ae    0
dialect_bh    0
dialect_dz    0
dialect_eg    0
dialect_iq    0
dialect_jo    0
dialect_kw    0
dialect_lb    0
dialect_ly    0
dialect_ma    0
dialect_om    0
dialect_pl    0
dialect_qa    0
dialect_sa    0
dialect_sd    0
dialect_sy    0
dialect_tn    0
dialect_ye    0
dtype: int64

In [63]:
# Write the merged, cleaned frame to full_dataset.csv in the working directory.
# NOTE(review): the row index is written as an extra unnamed first column and
# is no longer contiguous after the row drops; consider index=False if no
# downstream reader relies on that column.
full_dataset.to_csv('full_dataset.csv')