### Import necessary packages

In [6]:
import os
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm

In [7]:
# Mount Google Drive into the Colab filesystem so the dataset under
# /content/gdrive can be read and the merged result written back.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
# Endpoint that is POSTed batches of tweet ids in the fetch cell.
api = 'https://recruitment.aimtechnologies.co/ai-tasks'
# Project folder on the mounted Drive; input and output CSVs both live here.
root_path = '/content/gdrive/MyDrive/aim'
# CSV loaded below; its preview shows `id` and `dialect` columns.
data_path = os.path.join(root_path, 'dialect_dataset.csv')

In [9]:
# Parse `id` as text at load time. The ids are 19-digit tweet identifiers; if
# pandas infers a numeric dtype and any row is blank or malformed, the column
# falls back to float64 and the low digits of every id are silently lost.
data = pd.read_csv(data_path, dtype={'id': str})
data.head()

Unnamed: 0,id,dialect
0,1175358310087892992,IQ
1,1175416117793349632,IQ
2,1175450108898565888,IQ
3,1175471073770573824,IQ
4,1175496913145217024,IQ


### Convert the `id` column to a list of strings to request the data from the `api`

In [10]:
# Store ids as strings: they are later sent as JSON and used as the merge key
# against the string keys of the API response.
data['id'] = data['id'].astype('str')

In [11]:
# Plain Python list of id strings, ready to be sliced into request batches.
ids = data['id'].tolist()
ids[:5]

['1175358310087892992',
 '1175416117793349632',
 '1175450108898565888',
 '1175471073770573824',
 '1175496913145217024']

### Get tweets as responses to `ids`

In [13]:
# Download the text for every id in fixed-size batches and collect the
# per-batch JSON objects into one dict.
# NOTE(review): 1000 ids per request mirrors the original slice size; presumably
# it is the API's per-request limit - confirm before raising it.
batch_size = 1000
# Seconds to wait on the server; without a timeout a stalled connection
# would block the loop indefinitely.
request_timeout = 60

all_responses = {}
# One Session reuses the underlying connection across all requests.
with requests.Session() as session:
    for i in tqdm(range(0, len(ids), batch_size)):
        response = session.post(api, json=ids[i: i + batch_size], timeout=request_timeout)
        # Fail loudly on an HTTP error instead of merging an error payload as data.
        response.raise_for_status()
        # Each payload is merged as a dict; the merge cell treats keys as ids
        # and values as tweet text.
        all_responses.update(response.json())

100%|██████████| 459/459 [05:46<00:00,  1.32it/s]


### Merge with the old dataset

In [14]:
def add_text_to_df(df, text):
    """Attach fetched text to the labelled rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``id`` column to join on.
    text : dict
        Mapping whose keys become the ``id`` column and whose values become
        the ``text`` column.

    Returns
    -------
    pandas.DataFrame
        Merge of the id/text pairs with ``df`` on ``id`` using pandas' default
        join, so only ids present on both sides are kept. ``id`` and ``text``
        come first, followed by the remaining columns of ``df``.
    """
    id_text_frame = pd.DataFrame(text.items(), columns=['id', 'text'])
    return id_text_frame.merge(df, on=['id'])

In [15]:
# Join the downloaded text onto the labelled ids and preview the result.
data_text_df = add_text_to_df(data, all_responses)
data_text_df.head()

Unnamed: 0,id,text,dialect
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,IQ
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ


In [16]:
# Row count after the merge; ids without a returned text would be missing here.
len(data_text_df)

458197

### An array of all dialects in this dataset

In [17]:
# Distinct dialect labels, in order of first appearance.
data_text_df['dialect'].unique()

array(['IQ', 'LY', 'QA', 'PL', 'SY', 'TN', 'JO', 'MA', 'SA', 'YE', 'DZ',
       'EG', 'LB', 'KW', 'OM', 'SD', 'AE', 'BH'], dtype=object)

In [18]:
data_text_df.isna().sum() # per-column count of missing values

id         0
text       0
dialect    0
dtype: int64

In [19]:
# Count rows that are exact duplicates of an earlier row across all columns.
data_text_df.duplicated().sum()

0

In [20]:
# Persist the merged frame next to the input CSV; index=False keeps the
# positional index out of the file.
save_path = os.path.join(root_path, 'data_with_text.csv')
data_text_df.to_csv(save_path, index=False)