In [109]:
import pandas as pd
import numpy as np
import re
import editdistance

### Dialects
Modern Standard Arabic (MSA)

Maghrebi (MGH)
- primary : Morocco, Algeria, Tunisia
- Secondary: Mauritania, Libya, France, Germany

Egyptian (EGY)
- Primary: Egyptian
- Secondary: Sudan

Levantine (LEV)
- Primary: Lebanon, Jordan, Syria, Israel, Palestine

Iraqi (IRQ)
- Primary: Iraq
- Secondary: Kuwait

Gulf (GLF)
- Primary: Saudi Arabic, Qatar, Oman, UAE, Bahrain
- Secondary: Yemen, Somalia, Djibouti


In [2]:
country_to_dialect_map = {'PL':'LEV', 'SA':'GLF', 'EG':'EGY', 'YE':'GLF', 'IQ':'IRQ', 'BH':'GLF', 'SY':'LEV',
                          'LB':'LEV', 'AE':'GLF', 'MSA':'MSA', 'OM':'GLF', 'TN':'MGH', 'JO':'LEV', 'KW':'IRQ',
                          'SD':'EGY', 'QA':'GLF', 'MA':'MGH', 'DZ':'MGH', 'LY':'MGH', 'SO':'GLF', 'DJ':'GLF',
                          'MR':'MGH'}
nadi_country_to_code_map = {'Palestine':'PL', 'Jordan':'JO', 'Syria':'SY', 'Lebanon':'LB', 'Algeria':'DZ',
                            'Morocco':'MA', 'Tunisia':'TN', 'Egypt':'EG', 'Iraq':'IQ', 'Libya':'LY',
                            'United_Arab_Emirates':'AE', 'Mauritania':'MR', 'Saudi_Arabia':'SA', 'Bahrain':'BH',
                            'Djibouti':'DJ', 'Oman':'OM', 'Somalia':'SO', 'Kuwait':'KW', 'Yemen':'YE',
                            'Sudan':'SD', 'Qatar':'QA'
                           }

In [86]:
def count_arabic_chars(text):
    arabic_chars = re.findall(r'[\u0600-\u06FF]+', str(text))
    return sum(map(len, arabic_chars))

 

EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251" 
    "]+"
)

def clean_and_filter_all_sources(data):
    # tag links
    data['text'] = data.text.str.replace(r'http\S+', '@URL')
    # tag usernames
    data['text'] = data.text.str.replace(r"@\w*", '@USER')
    
    # tag emojis
    data['text'] = data.text.str.replace(EMOJI_PATTERN, '@EMOJI')
    # tag newline
    data['text'] = data.text.str.replace(r"\n", ' @NEWLINE ')
    
    # remove multiple symbols in a row and strip
    data['text'] = data.text.str.replace(r"[#$%@^&*()_+-={}[];:<>,./\|~]+", '').str.strip()
    
    # filter out strings with few arabic characters and not too much other
    data['num_arabic_chars'] = data.text.apply(count_arabic_chars)
    filtered = data.loc[(data.num_arabic_chars > 15) & (data.num_arabic_chars > 3.0 / 4 * data.text.str.len()) & (data.num_arabic_chars < 80)]
    
    return filtered

## wont deduplicate actually
# def deduplicate(data):
    


## QADI

In [4]:
qadi_test = pd.read_csv("raw_data/QADI_test.txt", sep='\t', names=['text', 'country'])

In [5]:
qadi_test.country.unique()

array(['PL', 'SA', 'EG', 'YE', 'IQ', 'BH', 'SY', 'LB', 'AE', 'MSA', 'OM',
       'TN', 'JO', 'KW', 'SD', 'QA', 'MA', 'DZ', 'LY'], dtype=object)

In [6]:
qadi_test['split'] = 'test'
qadi_test['source'] = 'QADI'
qadi_test['dialect'] = qadi_test.country.map(country_to_dialect_map)

In [7]:
qadi_test['text'] = qadi_test.text.str.replace('EMOJI', '@EMOJI').str.replace('URL', '@URL').str.replace('NUM', '@NUM').str.replace('NEWLINE', '@NEWLINE')
qadi_test.head()

Unnamed: 0,text,country,split,source,dialect
0,@USER امممم لا لا @EMOJI ماشي حالو @EMOJI,PL,test,QADI,LEV
1,يا عم لو يحكولي سافر شالح حوافق بس سفروني @URL,PL,test,QADI,LEV
2,@USER ههههههههه قرار أقفال المحلات الساعة @NUM...,SA,test,QADI,GLF
3,عندنا بيسجلو مواد انساني عشان ترفعلهم ال jpa ر...,EG,test,QADI,EGY
4,@USER يا فديت بنت اليمن وصوتها والله يجنن ربي ...,YE,test,QADI,GLF


In [8]:
qadi_test.text.str.extractall(r'@([A-Z]+)')[0].unique()

array(['USER', 'EMOJI', 'URL', 'NUM', 'NEWLINE'], dtype=object)

## SHAMI + TSAC

In [9]:
iadd = pd.read_json("raw_data/IADD.json")

In [10]:
iadd.head()

Unnamed: 0,Country,DataSource,Region,Sentence
0,,DART,GLF,: وش فيك تسألني إذا كنت غالي؟ غالي وتسوى من ...
1,,DART,GLF,روان بن حسين مستحيل ما ادز شي بسناب حتى لو ما...
2,,DART,GLF,: ما نسيتك بالدعا والأرض جفاف، وشلون أبنساك و...
3,,DART,GLF,: فارس_البقميk_محب أطيب من الطيب واصل الطيب ...
4,,DART,GLF,شوفو والله ابوها كشخه وصغير احس واضحه الفلوس م...


In [11]:
iadd_country_to_code_map = {'NA': 'unknown',
'Palestine':'PL', 'Jordan':'JO', 'Syria':'SY', 'Lebanon':'LB', 'Algeria':'DZ',
       'Morocco':'MA', 'Tunisia':'TN', 'Egypt':'EG', 'Iraq':'IQ'}

In [12]:
shami_tsac = iadd.loc[~iadd.DataSource.isin(['DART', 'PADIC', 'AOC'])]
shami_tsac.groupby(['Region', 'Country']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,DataSource,Sentence
Region,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
LEV,Jordan,7017,7017
LEV,Lebanon,10829,10829
LEV,Palestine,10642,10642
LEV,Syria,37759,37759
MGH,Tunisia,11998,11998


In [13]:
shami_tsac = shami_tsac.rename(columns={'DataSource':'source', 'Region':'dialect', 'Sentence':'text'})
shami_tsac.head()

Unnamed: 0,Country,source,dialect,text
1151,Palestine,SHAMI,LEV,هه اصلا حاليا فى وفد قطري موجود بغزه واللعبه ...
1152,Palestine,SHAMI,LEV,عادي طعميها لأنها ما راح تكتفي من صدرك صار بد...
1153,Palestine,SHAMI,LEV,اعملي اللي عليكي لوجه الله
1154,Palestine,SHAMI,LEV,حكت شفتها اكتر من مره بلبس قصير واكيد زوجي شا...
1155,Palestine,SHAMI,LEV,و هيا وإياه بتزوجو هه


In [14]:
shami_tsac['split'] = 'train'
shami_tsac['country'] = shami_tsac.Country.map(iadd_country_to_code_map)
shami_tsac.drop(['Country'], axis=1)

Unnamed: 0,source,dialect,text,split,country
1151,SHAMI,LEV,هه اصلا حاليا فى وفد قطري موجود بغزه واللعبه ...,train,PL
1152,SHAMI,LEV,عادي طعميها لأنها ما راح تكتفي من صدرك صار بد...,train,PL
1153,SHAMI,LEV,اعملي اللي عليكي لوجه الله,train,PL
1154,SHAMI,LEV,حكت شفتها اكتر من مره بلبس قصير واكيد زوجي شا...,train,PL
1155,SHAMI,LEV,و هيا وإياه بتزوجو هه,train,PL
1156,SHAMI,LEV,تاني اشي امسحي بكاز لانه بقتل البراغيت,train,PL
1157,SHAMI,LEV,بدنا نعمل مسابقه لأحلى صوره بروفايل,train,PL
1158,SHAMI,LEV,الناس بكل الاحوال رح تضل تحكي,train,PL
1159,SHAMI,LEV,حبيبتي يا هدهد تسلمي عالاطراء الجميل,train,PL
1160,SHAMI,LEV,الأصح انو قبل ما تكشف حماها تجيب دليل ع عمايل...,train,PL


In [15]:
shami_tsac.loc[115429].str.replace(EMOJI_PATTERN, '@EMOJI').str.replace(r"\n", ' @NEWLINE ').str.replace(r"[#$%@^&*\(\)_+-={}\[\];:<>,./\\\|~]{2,}", '').text

'Ya3tik essa7a ja3four 3jebtni barcha fil emision bklemek w jawek _xD_ @NEWLINE yhabloooo @EMOJI @NEWLINE روعة @NEWLINE Zouz ma7lahom ty ebkatherte mani mrakza em3ohom edou5a la3bet bia hhhhhh @NEWLINE مزالت البركة كرمكالله @EMOJI @NEWLINE BrvooooooO @NEWLINE تلقائية @NEWLINE @EMOJI @NEWLINE Ma7leha @NEWLINE A7LA Couple @NEWLINE Wlh m3alm kol klma fi 7a9ha @NEWLINE بالحق برااااااافو '

## DART

In [65]:
dialects = ['MGH', 'EGY', 'LEV', 'IRQ', 'GLF']
dart_train = pd.concat([pd.read_csv(f"raw_data/DART/cf-data/{d}.txt", sep='\t') for d in dialects],
                       keys=dialects)
dart_dev = pd.concat([pd.read_csv(f"raw_data/DART/cf-data/gold/{d}.txt", sep='\t').rename(columns={'lable':'label'}) for d in dialects],
                     keys=dialects)
dart_test = pd.concat([pd.read_csv(f"raw_data/DART/eval-acc/{d}.txt", sep='\t') for d in dialects],
                      keys=dialects)

In [66]:
dart_train.columns = ['score', 'tweet_id', 'text']
dart_train = dart_train.loc[dart_train.score >= 3]
dart_train['dialect'] = dart_train.index.get_level_values(0)
dart_train.reset_index(drop=True, inplace=True)

In [67]:
dart_test = dart_test.rename(columns={'lable':'dialect', 'tweet_text':'text'}).reset_index(drop=True)
dart_dev = dart_dev.rename(columns={'label':'dialect', 'tweet_text':'text'}).reset_index(drop=True)

In [68]:
dart = pd.concat([dart_train[['text', 'dialect']], 
                  dart_dev[['text', 'dialect']], 
                  dart_test[['text', 'dialect']]], 
                 keys=['train', 'dev', 'test'])
dart['split'] = dart.index.get_level_values(0)
dart['dialect'] = dart.dialect.str.replace('MSA ', 'MSA')
dart['country'] = 'unknown'
dart = dart.loc[dart.dialect != 'OTHER']
dart['source'] = 'DART'
dart.reset_index(drop=True, inplace=True)
dart.head()

Unnamed: 0,text,dialect,split,country,source
0,كلمات اخوكم راشي ياسين لي عندهم الغيره فالقلب ...,MGH,train,unknown,DART
1,@anwarmalek كون لقاو الخير فالجزائر ماكانوش اص...,MGH,train,unknown,DART
2,يتحدثون عن تسليم المشعل للشباب ثم يرفضون تسليم...,MGH,train,unknown,DART
3,RT @TajMaroc: @dora22danya @illy_ylli @salmabo...,MGH,train,unknown,DART
4,@Abdel_72 @FLqadiri ديال الكصبة تيكون حمر واقيلا,MGH,train,unknown,DART


In [69]:
dart.loc[dart.dialect == 'IRQ', 'country'] = 'IQ'
dart.loc[dart.dialect == 'EGY', 'country'] = 'EG'

## NADI

In [20]:
nadi = pd.concat([pd.read_csv("raw_data/NADI/train_labeled.tsv", sep='\t'),
                pd.read_csv("raw_data/NADI/dev_labeled.tsv", sep='\t')],
                keys=['train', 'dev'])
nadi.columns = ['tweet_id', 'text', 'country_full', 'province']
nadi['split'] = nadi.index.get_level_values(0)
nadi['source'] = 'NADI'
nadi.reset_index(drop=True, inplace=True)

In [21]:
nadi['country'] = nadi.country_full.map(nadi_country_to_code_map)
nadi['dialect'] = nadi.country.map(country_to_dialect_map)
nadi.head()

Unnamed: 0,tweet_id,text,country_full,province,split,source,country,dialect
0,TRAIN_1,الفار العور يشوف فقط كيسي ومايشوف ماتويد,Iraq,iq_Al-Anbar,train,NADI,IQ,IRQ
1,TRAIN_2,ي دينيييي ربنا يستر,Egypt,eg_Alexandria,train,NADI,EG,EGY
2,TRAIN_3,أساساً نسبكم قذر ونجس بلاش تتفاخروا بنجاستكم ي...,Iraq,iq_Maysan,train,NADI,IQ,IRQ
3,TRAIN_4,ليْسَت كُل المَشَاعِرِ تَحْتَاجُ إلى حَبِيب بَ...,Morocco,ma_Oriental,train,NADI,MA,MGH
4,TRAIN_5,لأ ني حاضرها هذي لايف,Libya,ly_Al-Jabal-al-Akhdar,train,NADI,LY,MGH


## AOC

In [22]:
aoc = pd.concat([pd.read_csv('raw_data/ArabicOnlineCommentary/MultiTrain.Shuffled.csv'),
                 pd.read_csv('raw_data/ArabicOnlineCommentary/MultiTest.csv'),
                 pd.read_csv('raw_data/ArabicOnlineCommentary/MultiDev.csv')], 
                keys=['train', 'dev', 'test'])
aoc['split'] = aoc.index.get_level_values(0)
aoc = aoc.set_index(aoc.columns[0])
aoc.index.name = None

In [23]:
aoc['dialect'] = aoc.label.str.replace('DIAL_', '')
aoc['source'] = 'AOC'
aoc['country'] = aoc.label.map({'EGY': 'EG', 'IRQ': 'IQ'}).fillna('unknown')

In [24]:
aoc.groupby(['split', 'dialect']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,text,source,country
split,dialect,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dev,EGY,1252,1243,1252,1252
dev,GLF,2073,2066,2073,2073
dev,LEV,1133,1126,1133,1133
dev,MSA,6354,6269,6354,6354
test,EGY,1253,1252,1253,1253
test,GLF,2075,2069,2075,2075
test,LEV,1136,1130,1136,1136
test,MSA,6356,6229,6356,6356
train,EGY,10022,9970,10022,10022
train,GLF,16593,16494,16593,16593


## MADAR

In [25]:
filenames = ['26-train', '26-test', '26-dev', '6-train', '6-dev']
#[for f in filenames]
madar = pd.concat([pd.read_csv(f'raw_data/MADAR-SHARED-TASK-final-release-25Jul2019/MADAR-Shared-Task-Subtask-1/MADAR-Corpus-{f}.tsv', 
            sep='\t',
           names=['text', 'city']) for f in filenames], keys=['train', 'test', 'dev', 'train6', 'dev6'])
madar['split'] = madar.index.get_level_values(0)
madar = madar.droplevel(0)

In [26]:
madar.city.unique()

array(['MSA', 'BEI', 'CAI', 'DOH', 'RAB', 'TUN', 'ALX', 'ALG', 'AMM',
       'ASW', 'DAM', 'JED', 'JER', 'RIY', 'SAN', 'SFX', 'ALE', 'BAG',
       'BAS', 'BEN', 'FES', 'KHA', 'MOS', 'MUS', 'SAL', 'TRI'],
      dtype=object)

In [27]:
madar_city_to_country_map = {'MSA':'unknown', 'BEI':'LB', 'CAI':'EG', 'DOH':'QA', 'RAB':'MA', 'TUN':'TN', 'ALX':'EG', 
                             'ALG':'DZ', 'AMM':'JO', 'ASW':'EG', 'DAM':'SY', 'JED':'SA', 'JER':'PL', 'RIY':'SA',
                             'SAN':'YE', 'SFX':'TN', 'ALE':'SY', 'BAG':'IQ', 'BAS':'IQ', 'BEN':'LY', 'FES':'MA',
                             'KHA':'SD', 'MOS':'IQ', 'MUS':'OM', 'SAL':'JO', 'TRI':'LY'}
madar['country'] = madar.city.map(madar_city_to_country_map)

In [28]:
madar['dialect'] = madar.country.map(country_to_dialect_map).fillna('MSA')

In [57]:
madar.groupby(['split', 'dialect']).count()
madar['split'] = madar.split.str.replace('6', '')
madar['source'] = 'MADAR'

## Dial2MSA

In [137]:
filenames = ['EGY2MSA.xls', 'MGR2MSA.xls']
dial2msa = pd.concat([pd.read_excel(f"raw_data/Dial2MSA/{f}") for f in filenames], keys=['EGY', 'MGH'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [138]:
dial2msa['split'] = 'train'
dial2msa['source'] = 'Dial2MSA'
dial2msa['dialect'] = dial2msa.index.get_level_values(0)
dial2msa = dial2msa.reset_index(drop=True)
dial2msa.loc[~dial2msa.cleanedtweet.isna(), 'tweet'] = dial2msa.loc[~dial2msa.cleanedtweet.isna(), 'cleanedtweet']

In [139]:
# Function to check similarity between two strings
def are_strings_similar(row):
    distance = editdistance.eval(row.msa, row.tweet)
    return distance <= 10 # threshold can be adjusted

# Apply the function to filter rows
dial2msa['are_strings_similar'] = dial2msa.apply(are_strings_similar, axis=1)
dial2msa = dial2msa.loc[~dial2msa.are_strings_similar] # throw out where MSA to dialectal too similar

In [140]:
dial2msa.columns

Index(['cleanedtweet', 'egytomsa:confidence', 'id', 'mgrtomsa:confidence',
       'msa', 'tweet', 'split', 'source', 'dialect', 'are_strings_similar'],
      dtype='object')

In [141]:
dial2msa_msa = dial2msa[['msa', 'split', 'source']].rename(columns={'msa':'text'})
dial2msa_msa['dialect'] = 'MSA'
dial2msa_da = dial2msa[['tweet', 'split', 'source', 'dialect']].rename(columns={'tweet':'text'})
dial2msa = pd.concat([dial2msa_msa, dial2msa_da])
dial2msa['country'] = 'unknown'

## Joining them all together and finalizing

In [142]:
formatted_datasets = [qadi_test]
unformatted_datasets = [shami_tsac, aoc, nadi, dart, madar, dial2msa]
preclean_columns = ['text', 'split', 'source', 'dialect', 'country']
final_columns = preclean_columns + ['num_arabic_chars']

unf_joined = pd.concat([tbl[preclean_columns] for tbl in unformatted_datasets])

In [143]:
# remove multiple symbols in a row and strip
qadi_test['text'] = qadi_test.text.str.replace(r"[#$%@^&*()_+-={}[];:<>,./\|~]+", '').str.strip()
    
# filter out strings with few arabic characters and not too much other
qadi_test['num_arabic_chars'] = qadi_test.text.apply(count_arabic_chars)
qadi_test_filtered = qadi_test.loc[(qadi_test.num_arabic_chars > 15) & (qadi_test.num_arabic_chars > 3.0 / 4 * qadi_test.text.str.len()) & (qadi_test.num_arabic_chars < 80)]

In [144]:
final = pd.concat([clean_and_filter_all_sources(unf_joined), qadi_test_filtered[final_columns]])

In [None]:
# check for messed up labels

In [145]:
final.sample(15)

Unnamed: 0,text,split,source,dialect,country,num_arabic_chars
28028,تذكرتين درجة أولى ، لو سمحت .,train,MADAR,GLF,QA,22
5087,يوجد اشياء ان لم تاتي في موعدها المحدد تعتبر ع...,train,Dial2MSA,MSA,unknown,46
7657,صاحب صفحة معلومات قد تخيفك ماذا تريد مني يا هذا,train,Dial2MSA,MSA,unknown,38
7805,انور طريقوا بشمعة بخمس,train,AOC,EGY,unknown,19
38313,ممكن بعض لحم الخنزير والبيض ؟,train,MADAR,GLF,OM,24
35233,ازا منتأخر شي ضليتي تقليلي بيدبرو حالهن ضليتي ...,train,SHAMI,LEV,SY,57
2909,عندك المنيو بالانجليزي ؟,dev,MADAR,GLF,QA,21
34768,كنت بفكر في اخد الكورس داك السمستر الجاي .,train,MADAR,EGY,SD,33
24312,مازلنا مخذيناش قرار .,train,MADAR,MGH,TN,17
30516,المدير حبيبي ابو وليد انتو بتتفضلو عند السكرتي...,train,SHAMI,LEV,SY,79


In [146]:
final.groupby(['split', 'dialect']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,source,country,num_arabic_chars
split,dialect,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dev,EGY,2334,2334,2334,2334
dev,GLF,2976,2976,2976,2976
dev,IRQ,699,699,699,699
dev,LEV,2171,2171,2171,2171
dev,MGH,2795,2795,2795,2795
dev,MSA,3526,3526,3526,3526
test,EGY,829,829,829,829
test,GLF,1825,1825,1825,1825
test,IRQ,430,430,430,430
test,LEV,1324,1324,1324,1324


In [148]:
final.to_csv('full_cleaned_data.tsv', sep='\t')