# Location stopword list created using the ACLED dataset

We extract location names from the 'country', 'admin1', 'admin2', and 'admin3' columns of the ACLED dataset and combine them into a stopword list. For multi-word names (n-grams with n > 1), e.g. 'addis abeba', the list includes the full name 'addis abeba', each individual token ('addis' and 'abeba'), and the space-free concatenation of the tokens ('addisabeba').

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the reduced ACLED export, then index it by (iso3, event_date).
acled_df = pd.read_csv(
    "/home/aadelucia/files/minerva/data/2014-01-01-2020-01-01_acled_reduced_all.csv",
    keep_default_na=False,  # keep the literal "NA" country code instead of turning it into NaN
    parse_dates=[4],  # the column at position 4 holds the event dates
)
acled_df = acled_df.set_index(["iso3", "event_date"])
acled_df

Unnamed: 0_level_0,Unnamed: 1_level_0,data_id,iso,event_id_cnty,event_id_no_cnty,year,time_precision,event_type,sub_event_type,actor1,assoc_actor_1,...,admin3,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp
iso3,event_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
SDN,2020-01-01,6713858,729,SUD13572,13572,2020,1,Protests,Peaceful protest,Protesters (Sudan),,...,,Wad Medani,14.4004,33.5184,1,Radio Dabanga,National,"On 1 January 2020, demonstrators gathered in W...",0,1578512392
SDN,2020-01-01,6713859,729,SUD13573,13573,2020,1,Protests,Peaceful protest,Protesters (Sudan),Refugees/IDPs (Sudan),...,,Zamzam,13.4840,25.3075,1,Radio Dabanga,National,"On 1 January 2020, IDPs in Zamzam camp (North ...",0,1578512392
SDN,2020-01-01,6713863,729,SUD13566,13566,2020,1,Protests,Protest with intervention,Protesters (Sudan),Masalit Ethnic Group (Sudan); Refugees/IDPs (S...,...,,Ardamata IDP Camp,13.4757,22.4931,1,Radio Dabanga,National,"On 1 January 2020, IDPs blocked the road betwe...",0,1578512392
SDN,2020-01-01,6713868,729,SUD13567,13567,2020,1,Protests,Peaceful protest,Protesters (Sudan),Refugees/IDPs (Sudan),...,,Gereida,11.2754,25.1403,1,Radio Dabanga,National,"On 1 January 2020, IDPs in Gereida camp (South...",0,1578512392
SDN,2020-01-01,6713883,729,SUD13570,13570,2020,1,Protests,Peaceful protest,Protesters (Sudan),,...,,Khartoum,15.5725,32.5364,1,Radio Dabanga,National,"On 1 January 2020, demonstrators gathered in K...",0,1578512392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PAK,2014-01-01,5937857,586,PAK21260,21260,2014,1,Protests,Peaceful protest,Protesters (Pakistan),,...,Moro,Moro,26.6645,68.0015,1,Daily Regional Times (Pakistan),Subnational,The employees of the Taluka Municipal Administ...,0,1567539383
PAK,2014-01-01,5940950,586,PAK21255,21255,2014,2,Protests,Peaceful protest,Protesters (Pakistan),,...,Karachi East,Karachi,24.9056,67.0822,3,Daily Regional Times (Pakistan),Subnational,The members of the Shahri Ittehad staged a pro...,0,1567539386
PAK,2014-01-01,5937645,586,PAK21254,21254,2014,1,Protests,Peaceful protest,Protesters (Pakistan),,...,Hyderabad,Hyderabad,25.3942,68.3736,1,Daily Regional Times (Pakistan),Subnational,Various protest demonstrations were staged by ...,0,1567539383
CAF,2014-01-01,6706670,140,CEN1636,1636,2014,1,Riots,Violent demonstration,Rioters (Central African Republic),Muslim Group (Central African Republic),...,Bangui,Bangui,4.3612,18.5549,1,RCA (Central African Republic),National,Shots heard as MISCA forces disperse a group o...,0,1578503608


In [3]:
# Inspect the first event record to see which fields are available.
acled_df.iloc[0]

data_id                                                       6713858
iso                                                               729
event_id_cnty                                                SUD13572
event_id_no_cnty                                                13572
year                                                             2020
time_precision                                                      1
event_type                                                   Protests
sub_event_type                                       Peaceful protest
actor1                                             Protesters (Sudan)
assoc_actor_1                                                        
inter1                                                              6
actor2                                                               
assoc_actor_2                                                        
inter2                                                              0
interaction         

In [4]:
# Inspect the second event record for comparison with the first.
acled_df.iloc[1]

data_id                                                       6713859
iso                                                               729
event_id_cnty                                                SUD13573
event_id_no_cnty                                                13573
year                                                             2020
time_precision                                                      1
event_type                                                   Protests
sub_event_type                                       Peaceful protest
actor1                                             Protesters (Sudan)
assoc_actor_1                                   Refugees/IDPs (Sudan)
inter1                                                              6
actor2                                                               
assoc_actor_2                                                        
inter2                                                              0
interaction         

In [5]:
# Spot-check a record further into the frame (positional row 1000).
acled_df.iloc[1000]

data_id                                                       6688043
iso                                                               764
event_id_cnty                                                 THA7466
event_id_no_cnty                                                 7466
year                                                             2019
time_precision                                                      1
event_type                                                   Protests
sub_event_type                                       Peaceful protest
actor1                                          Protesters (Thailand)
assoc_actor_1                                                        
inter1                                                              6
actor2                                                               
assoc_actor_2                                                        
inter2                                                              0
interaction         

In [19]:
# Distinct lower-cased country and location names.
# The vectorised .str accessor replaces a per-element Python lambda.
countries = acled_df['country'].str.lower().unique()
locs = acled_df['location'].str.lower().unique()

In [26]:
# Distinct lower-cased admin3 names; the vectorised .str accessor replaces
# a per-element Python lambda.
ad3 = acled_df['admin3'].str.lower().unique()
ad3

array(['', 'mardan', 'sujawal', ..., 'tezo', 'bhairab', 'gaturi'],
      dtype=object)

In [20]:
# Join the country and location name arrays into one flat array.
concated = np.concatenate([countries, locs])

In [33]:
def get_set(acled_df, col_list):
    """Collect the distinct lower-cased values of several columns into one set.

    Parameters
    ----------
    acled_df : pandas.DataFrame
        Frame holding the columns named in ``col_list``; the columns are
        expected to contain strings.
    col_list : iterable of str
        Names of the columns to read. May be empty, in which case an empty
        set is returned.

    Returns
    -------
    set of str
        Union of the lower-cased unique values of every listed column, with
        the empty string removed.

    Raises
    ------
    KeyError
        If a name in ``col_list`` is not a column of ``acled_df``.
    """
    names = set()
    for col in col_list:
        # dropna() skips missing cells, which have no lower-case form; the
        # vectorised .str accessor replaces a per-element lambda.
        names.update(acled_df[col].dropna().str.lower().unique())
    # Blank cells are not location names.
    names.discard('')
    return names

In [34]:
# Columns whose values are collected into the location-name set.
col_list = ['country', 'admin1', 'admin2', 'admin3']
agg_set = get_set(acled_df, col_list=col_list)

In [35]:
# Number of distinct lower-cased location names collected.
len(agg_set)

7494

In [37]:
# Preview 100 entries; a set has no defined order, so this sample is arbitrary.
list(agg_set)[:100]

['luanshya',
 'mithapukur',
 'khemis el khechna',
 'bafang',
 'bangourain',
 'blue crane route',
 'medea',
 'sri lanka',
 'ogbomosho north',
 'gampaha',
 'buhaguzi',
 'arua hill',
 'bizerte',
 'moramanga',
 'mossel bay',
 'magbema',
 'preaek prasab',
 'mbouda',
 'ibiono ibom',
 'kokona',
 'anezi',
 'kyankwanzi',
 'el-ibrahimiya',
 'upper denkyira west',
 'nakhon sawan',
 'chorfa',
 'sakhisizwe',
 'takaba',
 'simbandi brassou',
 'rakai',
 'ilebo',
 'su-ngai kolok',
 'ambohidratrimo',
 'shagamu',
 'tarash',
 'aweil north',
 'dadeldhura',
 'maroantsetra',
 'matli',
 'oyo east',
 'kadiogo',
 'ruhaama',
 'tichy',
 'larache',
 'bamnet narong',
 'butanuka',
 'mustang',
 'ruwenzori',
 'uriri',
 'tiebele',
 'erute',
 'dera bugti',
 'butaganzwa',
 'laikipia west',
 'nongoa',
 'kipangani',
 'sumbe',
 'kinieran',
 'sebdou',
 'tazemmourt',
 'quaidabad',
 'tapac',
 'dar el beida',
 'talat kwan',
 'kwango',
 'ba-phalaborwa',
 'mpanda',
 'markz abu qurqas',
 'bakool',
 'hamar jabjab',
 'khairpur tamew

In [46]:
# Keep an untouched copy so the expansion step can be re-run from a clean state.
agg_set_backup = agg_set.copy()

In [56]:
# Expand every location name into extra stopword variants:
#   * each whitespace-separated token   ('addis abeba' -> 'addis', 'abeba')
#   * the tokens joined without spaces  ('addis abeba' -> 'addisabeba')
# Start from the backup so that re-running this cell never expands an
# already-expanded set.
agg_set = agg_set_backup.copy()
split_list = []
combine_list = []
for entry in agg_set_backup:
    # split() with no argument collapses runs of whitespace, so names with
    # leading, trailing or doubled spaces never yield an empty token.
    tokens = entry.split()
    split_list.extend(tokens)
    combine_list.append("".join(tokens))
agg_set.update(split_list)
agg_set.update(combine_list)
# A whitespace-only name would otherwise contribute an empty stopword.
agg_set.discard("")
len(agg_set)

11781

In [58]:
# Sanity check: the space-free concatenation of a multi-word name is present.
'addisababa' in agg_set

True

# Resulting file is saved to /home/jzhan237/files/data/location_words.txt

In [59]:
# Write one stopword per line. Sorting makes the file identical across runs
# (iteration order of a set of strings is not stable between interpreter
# sessions), and the explicit UTF-8 encoding keeps non-ASCII place names
# independent of the platform's default locale encoding.
with open('/home/jzhan237/files/data/location_words.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{entry}\n' for entry in sorted(agg_set))