# ACLED Dataset Cleaning and Initial Exploration

In [26]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
# Default resolution (dots per inch) for every matplotlib figure created in this notebook.
plt.rcParams["figure.dpi"] = 36

In [53]:
# CSV hosted in the georgetown-analytics/ACLED GitHub repository (raw file URL).
url = 'https://raw.githubusercontent.com/georgetown-analytics/ACLED/main/CSV_Main/step1_ACLED_Dataset_END.csv'
# index_col=0 uses the first CSV column as the row index instead of a data column.
df = pd.read_csv(url, index_col=0)

In [54]:
# Display the freshly loaded frame to check columns and row labels.
df

Unnamed: 0_level_0,country,event_type,sub_event_type,fatalities,notes
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8166147,Democratic Republic of Congo,Battles,Armed clash,0,"On 1 June 2021, the ADF attacked a military ba..."
8166148,Sudan,Violence against civilians,Attack,5,"On 1 June 2021, three masked gunmen opened fir..."
8166410,Lesotho,Riots,Violent demonstration,0,"On 1 June 2021, workers pelted stones and loot..."
8166411,Lesotho,Riots,Violent demonstration,0,"On 1 June 2021, workers set tires on fire and ..."
8059405,Uganda,Violence against civilians,Attack,2,"On 1 June 2021, an unidentified armed group at..."
...,...,...,...,...,...
7121647,Nigeria,Riots,Mob violence,0,"On 1 June 2020, PDP supporters attacked APC su..."
7966971,Tunisia,Protests,Peaceful protest,0,"On 1 June 2020, aligned health workers protest..."
7121659,Somalia,Battles,Armed clash,0,"On 1 June 2020, Al shabaab militants launched ..."
7518716,South Africa,Protests,Peaceful protest,0,"On 1 June 2020, about 30 parents demonstrated ..."


In [87]:
import nltk
from string import digits
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
# Porter stemmer instance. NOTE(review): no cell shown here calls it — confirm it is still needed.
ps = PorterStemmer()

In [57]:
# Lower-case every note so that later token comparisons (stop words, month
# names) are done against a single casing.
df["notes"]=df["notes"].str.lower()
df

Unnamed: 0_level_0,country,event_type,sub_event_type,fatalities,notes
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8166147,Democratic Republic of Congo,Battles,Armed clash,0,"on 1 june 2021, the adf attacked a military ba..."
8166148,Sudan,Violence against civilians,Attack,5,"on 1 june 2021, three masked gunmen opened fir..."
8166410,Lesotho,Riots,Violent demonstration,0,"on 1 june 2021, workers pelted stones and loot..."
8166411,Lesotho,Riots,Violent demonstration,0,"on 1 june 2021, workers set tires on fire and ..."
8059405,Uganda,Violence against civilians,Attack,2,"on 1 june 2021, an unidentified armed group at..."
...,...,...,...,...,...
7121647,Nigeria,Riots,Mob violence,0,"on 1 june 2020, pdp supporters attacked apc su..."
7966971,Tunisia,Protests,Peaceful protest,0,"on 1 june 2020, aligned health workers protest..."
7121659,Somalia,Battles,Armed clash,0,"on 1 june 2020, al shabaab militants launched ..."
7518716,South Africa,Protests,Peaceful protest,0,"on 1 june 2020, about 30 parents demonstrated ..."


In [59]:
def remove_num(list):
    """Delete every ASCII digit from each string in ``list``.

    This strips the numeric parts of dates (day and year); month names are
    left in place.

    Args:
        list: Iterable of strings. (The name shadows the built-in but is kept
            so existing callers are unaffected.)

    Returns:
        A new Python list with the digit-free strings, in the input order.
    """
    digit_re = re.compile('[0-9]')
    stripped = []
    for note in list:
        stripped.append(digit_re.sub('', note))
    return stripped

In [66]:
# remove_num returns a plain Python list; assigning it to the column stores the
# values back row by row in the same order.
df["notes"] = remove_num(df["notes"])

In [77]:
# Inspect the notes after the digits have been stripped.
df

Unnamed: 0_level_0,country,event_type,sub_event_type,fatalities,notes
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8166147,Democratic Republic of Congo,Battles,Armed clash,0,"on june , the adf attacked a military base in..."
8166148,Sudan,Violence against civilians,Attack,5,"on june , three masked gunmen opened fire aga..."
8166410,Lesotho,Riots,Violent demonstration,0,"on june , workers pelted stones and looted sh..."
8166411,Lesotho,Riots,Violent demonstration,0,"on june , workers set tires on fire and block..."
8059405,Uganda,Violence against civilians,Attack,2,"on june , an unidentified armed group attacke..."
...,...,...,...,...,...
7121647,Nigeria,Riots,Mob violence,0,"on june , pdp supporters attacked apc support..."
7966971,Tunisia,Protests,Peaceful protest,0,"on june , aligned health workers protested in..."
7121659,Somalia,Battles,Armed clash,0,"on june , al shabaab militants launched an at..."
7518716,South Africa,Protests,Peaceful protest,0,"on june , about parents demonstrated outside..."


In [81]:
# r'\w+' matches runs of word characters, so punctuation and whitespace are
# discarded and each note becomes a list of tokens.
tokenizer = RegexpTokenizer(r'\w+')
# NOTE(review): x.lower() repeats the lower-casing already applied to this column earlier.
df["notes"] = df["notes"].apply(lambda x: tokenizer.tokenize(x.lower()))
df["notes"]

In [91]:
# Single WordNet lemmatizer instance shared by word_lemmatizer.
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    """Lemmatize every token of one tokenized note.

    Args:
        text: Iterable of token strings.

    Returns:
        list: One lemma per input token, in the same order.
    """
    # lemmatize() is called with its default arguments only.
    # NOTE(review): NLTK's default part of speech is presumably noun, so verb
    # forms such as "attacked" may pass through unchanged — confirm.
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [95]:
# Replace each row's token list with its lemmatized version, then show the frame.
df["notes"] = df["notes"].apply(word_lemmatizer)
df

Unnamed: 0_level_0,country,event_type,sub_event_type,fatalities,notes
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8166147,Democratic Republic of Congo,Battles,Armed clash,0,"[june, adf, attacked, military, base, village,..."
8166148,Sudan,Violence against civilians,Attack,5,"[june, three, masked, gunman, opened, fire, ci..."
8166410,Lesotho,Riots,Violent demonstration,0,"[june, worker, pelted, stone, looted, shop, th..."
8166411,Lesotho,Riots,Violent demonstration,0,"[june, worker, set, tire, fire, blocked, road,..."
8059405,Uganda,Violence against civilians,Attack,2,"[june, unidentified, armed, group, attacked, m..."
...,...,...,...,...,...
7121647,Nigeria,Riots,Mob violence,0,"[june, pdp, supporter, attacked, apc, supporte..."
7966971,Tunisia,Protests,Peaceful protest,0,"[june, aligned, health, worker, protested, fro..."
7121659,Somalia,Battles,Armed clash,0,"[june, al, shabaab, militant, launched, attack..."
7518716,South Africa,Protests,Peaceful protest,0,"[june, parent, demonstrated, outside, bergvill..."


In [88]:
# Lazily-built cache of NLTK's English stop words; filled on first use so that
# defining this cell does not require the corpus to be loaded yet.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """Drop English stop words from one tokenized note.

    Args:
        text: Iterable of token strings.

    Returns:
        list: The tokens of ``text`` that are not in NLTK's English stop-word
        list, in their original order.
    """
    # Membership is tested against a cached set, so the stop-word corpus is
    # not re-read (and linearly scanned) for every single token.
    stop_set = _english_stopwords()
    words = [w for w in text if w not in stop_set]
    return words

In [89]:
# Filter the stop words out of every row's token list.
df["notes"] = df["notes"].apply(remove_stopwords)

In [98]:
# Inspect the frame after stop-word removal.
df

Unnamed: 0_level_0,country,event_type,sub_event_type,fatalities,notes
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8166147,Democratic Republic of Congo,Battles,Armed clash,0,"[june, adf, attacked, military, base, village,..."
8166148,Sudan,Violence against civilians,Attack,5,"[june, three, masked, gunman, opened, fire, ci..."
8166410,Lesotho,Riots,Violent demonstration,0,"[june, worker, pelted, stone, looted, shop, th..."
8166411,Lesotho,Riots,Violent demonstration,0,"[june, worker, set, tire, fire, blocked, road,..."
8059405,Uganda,Violence against civilians,Attack,2,"[june, unidentified, armed, group, attacked, m..."
...,...,...,...,...,...
7121647,Nigeria,Riots,Mob violence,0,"[june, pdp, supporter, attacked, apc, supporte..."
7966971,Tunisia,Protests,Peaceful protest,0,"[june, aligned, health, worker, protested, fro..."
7121659,Somalia,Battles,Armed clash,0,"[june, al, shabaab, militant, launched, attack..."
7518716,South Africa,Protests,Peaceful protest,0,"[june, parent, demonstrated, outside, bergvill..."


In [100]:
# Lower-case English month names that are stripped from tokenized notes.
_MONTH_NAMES = frozenset([
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
])


def remove_month(text):
    """Drop month-name tokens from one tokenized note.

    Args:
        text: Iterable of lower-case token strings.

    Returns:
        list: The tokens of ``text`` that are not month names, in their
        original order.
    """
    # Matching is by exact token, so "march" and "may" are removed even when
    # they are used as ordinary words rather than as dates.
    words = [w for w in text if w not in _MONTH_NAMES]
    return words

In [101]:
# Strip month-name tokens from every row's token list.
df["notes"] = df["notes"].apply(remove_month)

In [104]:
# Show the cleaned token lists.
df['notes']

data_id
8166147    [adf, attacked, military, base, village, kokol...
8166148    [three, masked, gunman, opened, fire, civilian...
8166410    [worker, pelted, stone, looted, shop, thetsane...
8166411    [worker, set, tire, fire, blocked, road, maput...
8059405    [unidentified, armed, group, attacked, ministe...
                                 ...                        
7121647    [pdp, supporter, attacked, apc, supporter, oso...
7966971    [aligned, health, worker, protested, front, re...
7121659    [al, shabaab, militant, launched, attack, juba...
7518716    [parent, demonstrated, outside, bergville, pri...
7518717    [demonstrator, mostly, woman, set, truck, alig...
Name: notes, Length: 33378, dtype: object

In [109]:
# Keep only the rows whose event_type is exactly "Protests".
protests_df = df.loc[df['event_type'] == "Protests"]

In [110]:
# Inspect the protest subset.
protests_df

Unnamed: 0_level_0,country,event_type,sub_event_type,fatalities,notes
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8059413,Zambia,Protests,Peaceful protest,0,"[doctor, gathered, university, teaching, hospi..."
8059671,South Africa,Protests,Peaceful protest,0,"[group, blocked, north, south, melmoth, road, ..."
8059418,Morocco,Protests,Peaceful protest,0,"[member, national, union, unemployed, graduate..."
8059426,Tunisia,Protests,Peaceful protest,0,"[farmer, protested, beja, beja, nord, beja, de..."
8059437,Algeria,Protests,Peaceful protest,0,"[dozen, aadl, social, housing, program, subscr..."
...,...,...,...,...,...
7121353,Nigeria,Protests,Peaceful protest,0,"[people, umbrella, jac, demonstrated, federal,..."
7121613,Tunisia,Protests,Peaceful protest,0,"[group, contractual, teacher, public, school, ..."
7887842,Democratic Republic of Congo,Protests,Peaceful protest,0,"[resident, kashanje, village, near, mbuhi, pil..."
7966971,Tunisia,Protests,Peaceful protest,0,"[aligned, health, worker, protested, front, re..."


In [111]:
# Keep only the rows whose event_type is exactly "Battles".
battles_df = df.loc[df['event_type'] == "Battles"]

In [112]:
# Inspect the battle subset.
battles_df

Unnamed: 0_level_0,country,event_type,sub_event_type,fatalities,notes
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8166147,Democratic Republic of Congo,Battles,Armed clash,0,"[adf, attacked, military, base, village, kokol..."
8059690,Somalia,Battles,Armed clash,1,"[two, group, government, police, force, clashe..."
8059708,Somalia,Battles,Armed clash,25,"[government, security, force, sna, clashed, al..."
8059742,Somalia,Battles,Armed clash,0,"[al, shabaab, militant, attacked, government, ..."
8059778,Somalia,Battles,Armed clash,0,"[al, shabaab, militant, carried, attack, gover..."
...,...,...,...,...,...
7887845,Democratic Republic of Congo,Battles,Armed clash,4,"[nyatura, militiaman, clashed, killed, policem..."
7887846,Democratic Republic of Congo,Battles,Armed clash,2,"[ndc, r, rebel, clashed, armed, men, apcls, ny..."
7887847,Democratic Republic of Congo,Battles,Armed clash,2,"[ndc, r, rebel, clashed, armed, men, apcls, ny..."
7887851,Democratic Republic of Congo,Battles,Armed clash,2,"[fardc, clashed, mayi, mayi, militiaman, bulon..."


In [113]:
# Write each subset to a CSV file in the current working directory.
# NOTE(review): "notes" holds Python lists at this point, so to_csv stores their
# string form; a later read_csv will return strings, not lists — confirm that
# the next step parses them.
protests_df.to_csv('step2_ACLED_Dataset_END_protests.csv')
battles_df.to_csv('step2_ACLED_Dataset_END_battles.csv')