### Import packages and read data

In [1]:
import requests     # to collect data
import pandas as pd # to read and save csv files

# Load the labelled dataset from the CSV file into a DataFrame.
data = pd.read_csv('Data/dialect_dataset.csv')
print(data.shape)  # (rows, columns) of the loaded dataset
data.head()  # preview the first rows

(458197, 2)


Unnamed: 0,id,dialect
0,1175358310087892992,IQ
1,1175416117793349632,IQ
2,1175450108898565888,IQ
3,1175471073770573824,IQ
4,1175496913145217024,IQ


### Prepare lists which I will use

In [2]:
# Ids from data.id, converted to strings so they can be sent as JSON.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace (the previous loop variable shadowed the builtin `id`).
data_ids = [str(tweet_id) for tweet_id in data['id']]

In [3]:
# Split the ids into batches of at most BATCH_SIZE ids; one batch is sent per request.
# NOTE(review): 1000 is presumably the API's per-request limit - confirm.
BATCH_SIZE = 1000
# index where the last full batch ends (458000 for the 458197-row dataset)
full_end = len(data_ids) - len(data_ids) % BATCH_SIZE

# list of full batches: for this dataset, 458 lists of 1000 ids each
list_1000 = [data_ids[i:i+BATCH_SIZE] for i in range(0, full_end, BATCH_SIZE)]
# list holding the single leftover batch (197 ids for this dataset);
# it is empty when the number of ids is an exact multiple of BATCH_SIZE
list_197 = [data_ids[full_end:]] if full_end < len(data_ids) else []

# list_1000 holds 458 * 1000 = 458000 ids and list_197 holds 1 * 197 = 197 ids;
# together that is 458197, the number of rows in data

### Use post request to call API

In [4]:
# request function used with list_1000 and list_197
# lists: a list of batches, where each batch is a list of id strings
def request(lists):
    """POST every batch of ids to the API and collect the returned ids and texts.

    Each batch is sent as the JSON body of one POST request. The response
    body is read as a JSON object whose keys are appended to the id list and
    whose values are appended to the text list.

    A response with a status code other than 200 has its status code printed
    and is skipped, so an error body is never parsed or mixed into the
    results; the missing rows show up as shorter result lists.

    Parameters
    ----------
    lists : list of list of str
        Batches of ids to look up.

    Returns
    -------
    tuple (ids, texts)
        Two lists of equal length: the keys and the values of all
        successful responses, in the order they were received.
    """
    # url of the API endpoint
    url = "https://recruitment.aimtechnologies.co/ai-tasks"

    ids = []    # collected ids
    texts = []  # collected texts

    for js in lists:
        r = requests.post(url, json=js)   # one request per batch
        if r.status_code != 200:
            # report the failure and skip this batch instead of parsing an error body
            print(r.status_code)
            continue
        # parse the body once and walk keys and values together
        for key, value in r.json().items():
            ids.append(key)
            texts.append(value)

    return ids, texts

In [5]:
# Send the batches in list_1000 to the API.
# ids receives the first list returned by request() and texts the second.
ids, texts = request(list_1000)
# Send the batches in list_197 to the API.
# ids_197 receives the first list returned by request() and texts_197 the second.
ids_197, texts_197 = request(list_197)

In [6]:
# Print the length of each collected list, one per line, to check that
# the requests returned as many ids and texts as expected.
print(len(ids), len(texts), len(ids_197), len(texts_197), sep='\n')

458000
458000
197
197


In [7]:
# Append the leftover batch results to the main lists (in place).
ids += ids_197
texts += texts_197

# Both lengths should now be 458197.
print(len(ids), len(texts), sep='\n')

458197
458197


### Collect the lists into a dataframe so the data can be saved as a CSV file

In [8]:
# Pair every id with its text and build a two-column dataframe from the pairs.
df = pd.DataFrame(zip(ids, texts), columns=['Id', 'text'])
df.head()

Unnamed: 0,Id,text
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺


In [9]:
# Place the fetched data (df) and the original data side by side,
# keeping only the row labels present in both frames.
finaldata = pd.concat(objs=[df, data], axis='columns', join='inner')
print(finaldata.shape)
finaldata.head()

(458197, 4)


Unnamed: 0,Id,text,id,dialect
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,1175358310087892992,IQ
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,1175416117793349632,IQ
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,1175450108898565888,IQ
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,1175471073770573824,IQ
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,1175496913145217024,IQ


In [10]:
# Compare the fetched ids with the original ids, row by row.
newid = finaldata['Id'].tolist()  # ids collected from the API
oldid = finaldata['id'].tolist()  # ids from the original dataset

# original ids whose row holds a different fetched id
lostdata = [old for new, old in zip(newid, oldid) if int(new) != old]

# An empty list means every row matches and the dataset is good;
# otherwise the listed ids mark rows that should be deleted.
lostdata

[]

In [11]:
# Keep only the original id, the fetched text and the dialect label, in that order.
# Selecting the columns also drops the duplicate 'Id' column, and unlike `del`
# this cell can be re-run without raising a KeyError.
finaldata = finaldata[['id','text','dialect']]
finaldata.head() # preview the result

Unnamed: 0,id,text,dialect
0,1175358310087892992,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .,IQ
1,1175416117793349632,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...,IQ
2,1175450108898565888,@KanaanRema مبين من كلامه خليجي,IQ
3,1175471073770573824,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐,IQ
4,1175496913145217024,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺,IQ


In [12]:
# Save finaldata as Data/collected_data.csv; index=False leaves the row index out of the file.
finaldata.to_csv('Data/collected_data.csv', index=False)