# Analysis of ACLED and GDELT data

Mount Gdrive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
%cd gdrive/MyDrive/Twist3

/content/gdrive/MyDrive/Twist3


## Ignore future warnings

In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Install pycountry

In [4]:
!pip install pycountry

Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 5.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: pycountry
  Building wheel for pycountry (PEP 517) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681845 sha256=33822b960059bf8f472b8d830954516acc4165e25f484af23cd9af004da5ffda
  Stored in directory: /root/.cache/pip/wheels/0e/06/e8/7ee176e95ea9a8a8c3b3afcb1869f20adbd42413d4611c6eb4
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-22.3.5


## Load libraries

In [5]:
import requests
import pandas as pd
import pycountry
import plotly.express as px
import numpy as np

import os
local_path = os.getcwd()

## Load ACLED data

In [6]:
df_ACLED = pd.read_csv(local_path+"/2019-04-09-2022-04-15.csv")

In [7]:
df_ACLED.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315968 entries, 0 to 315967
Data columns (total 31 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   data_id           315968 non-null  int64  
 1   iso               315968 non-null  int64  
 2   event_id_cnty     315968 non-null  object 
 3   event_id_no_cnty  315968 non-null  int64  
 4   event_date        315968 non-null  object 
 5   year              315968 non-null  int64  
 6   time_precision    315968 non-null  int64  
 7   event_type        315968 non-null  object 
 8   sub_event_type    315968 non-null  object 
 9   actor1            315968 non-null  object 
 10  assoc_actor_1     32562 non-null   object 
 11  inter1            315968 non-null  int64  
 12  actor2            264283 non-null  object 
 13  assoc_actor_2     73042 non-null   object 
 14  inter2            315968 non-null  int64  
 15  interaction       315968 non-null  int64  
 16  region            31

## Find incidents with more than 100 fatalities

In [8]:
df = df_ACLED.loc[df_ACLED['fatalities'] > 100]

## Create a datetime column

In [9]:
df['event_date_datetime'] = pd.to_datetime(df['event_date'], format='%d %B %Y')
df = df.sort_values(by=['event_date_datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 307561 to 2479
Data columns (total 32 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   data_id              56 non-null     int64         
 1   iso                  56 non-null     int64         
 2   event_id_cnty        56 non-null     object        
 3   event_id_no_cnty     56 non-null     int64         
 4   event_date           56 non-null     object        
 5   year                 56 non-null     int64         
 6   time_precision       56 non-null     int64         
 7   event_type           56 non-null     object        
 8   sub_event_type       56 non-null     object        
 9   actor1               56 non-null     object        
 10  assoc_actor_1        17 non-null     object        
 11  inter1               56 non-null     int64         
 12  actor2               56 non-null     object        
 13  assoc_actor_2        29 non-nu

In [11]:
df.head()

Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,...,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3,event_date_datetime
307561,8481094,4,AFG19510,19510,05 May 2019,2019,1,Explosions/Remote violence,Air/drone strike,Military Forces of Afghanistan (2014-2021),...,32.2414,62.9494,2,AP; Khaama Press; BBC News,National-International,"On 05-May-2019, 150 Taliban militants were kil...",165,1631067042,AFG,2019-05-05
295178,9061048,180,DRC14647,14647,12 June 2019,2019,1,Violence against civilians,Attack,Lendu Ethnic Militia (Democratic Republic of C...,...,1.669,30.5202,1,BBC News; Radio Okapi; Bunia Actualite; 24hcon...,New media-Subnational,"On 12 June 2019, in Tche, suspected Lendu Ethn...",140,1649875334,COD,2019-06-12
268926,8062217,887,YEM42592,42592,31 August 2019,2019,1,Explosions/Remote violence,Air/drone strike,Operation Restoring Hope,...,14.6149,44.3621,1,UN OHCHR; ICRC; Bellingcat; Yemen Data Project...,Local partner-National,"On 31 August 2019, 7 Saudi-led coalition airst...",134,1623104656,YEM,2019-08-31
236466,8020482,562,NIR1095,1095,10 December 2019,2019,1,Battles,Armed clash,Islamic State (West Africa) - Greater Sahara F...,...,15.2275,1.3135,1,Tamtam Info; Urgence Tillabery; Whatsapp; Jeun...,New media-National,"On 10 Decemper 2019, ISGS militants conducted ...",128,1622068114,NER,2019-12-10
228325,6719701,364,IRN5963,5963,08 January 2020,2020,1,Explosions/Remote violence,Shelling/artillery/missile attack,Military Forces of Iran (1989-) Islamic Revolu...,...,35.4719,50.9754,1,CNN,International,"On 8 January 2020, IRGC shot down a Ukraine In...",176,1618556816,IRN,2020-01-08


## Pickle for use in app

In [12]:
df.to_pickle(local_path+"/df_1.pkl")

## Plot of conflicts over time

In [13]:
fig = px.choropleth(df, locations="iso3",
                    color="fatalities",
                    hover_name="country",
                    animation_frame="event_date",
                    title = "Conflicts with more than 100 fatalities",
                    color_continuous_scale=px.colors.sequential.PuRd,
                    range_color = (df["fatalities"].min(),df["fatalities"].max()))
 
fig.show()

## Find the total fatalites for the countrie of interest

In [14]:
df_new = pd.DataFrame(np.array(df["iso3"].unique()),columns=['iso3'])

df_new['fatalities'] = 0
i = 0
for c in df["iso3"].unique():
  df_new['fatalities'][i] = df_ACLED.loc[df_ACLED['iso3']==c]["fatalities"].sum()
  i = i+1



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Pickle for use in app

In [15]:
df_new.to_pickle(local_path+"/df_2.pkl")

## Plot accummulated fatalities

In [16]:
fig = px.choropleth(df_new, locations="iso3",
                    color="fatalities",
                    title = "Accumulated fatalities for conflicts of interest",
                    color_continuous_scale=px.colors.sequential.PuRd,
                    range_color = (df_new["fatalities"].min(),df_new["fatalities"].max()))
 
fig.show()

## Find accumulated fatalites for all countries

In [17]:
df_new = pd.DataFrame(np.array(df_ACLED["iso3"].unique()),columns=['iso3'])

df_new['fatalities'] = 0
i = 0
for c in df_ACLED["iso3"].unique():
  df_new['fatalities'][i] = df_ACLED.loc[df_ACLED['iso3']==c]["fatalities"].sum()
  i = i+1



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Pickle for use in app

In [18]:
df_new.to_pickle(local_path+"/df_3.pkl")

## Plot accumulated fatalities

In [19]:
fig = px.choropleth(df_new, locations="iso3",
                    color="fatalities",
                    title = "Accumulated fatalities",
                    color_continuous_scale=px.colors.sequential.PuRd,
                    range_color = (df_new["fatalities"].min(),df_new["fatalities"].max()))
 
fig.show()

## Select coutries of interest

In [20]:
df_c = df_ACLED[df_ACLED.iso3.isin(df['iso3'].unique())]
df_c['event_date_datetime'] = pd.to_datetime(df_c['event_date'], format='%d %B %Y')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Pickle for later use in the app

In [21]:
df_c.to_pickle(local_path+"/df_4.pkl")

## Create a scatter plot of fatalities over time for coutries of interest

In [22]:
fig = px.scatter(df_c, x="event_date_datetime", y="fatalities", color="country",# symbol="event_type",
                 size='fatalities',
                  hover_data=['country','event_type'], labels=dict(event_date_datetime="Event date", fatalities="Fatalities", country="Country", event_type ="Event type"))

fig.show()

Output hidden; open in https://colab.research.google.com to view.

## Load GDELT data

Create a dataframe to record the average tone of articles as well as the average number of articles.

In [47]:
countries = countries = [ 'UKR','AFG', 'ETH','COD', 'YEM', 'NER', 'IRN', 'MLI', 'TCD', 'LBY', 'MOZ', 'BFA', 'NGA', 'CAF', 'SOM']

df_GDELT = pd.DataFrame(columns=['country','iso3', 'date', 'AvgTone', 'AccNumArticles'])

for c in countries:
  country = pycountry.countries.get(alpha_3=c)
  print(country,c,country.alpha_2)
  path = local_path+'/'+country.alpha_2

  dir_list = os.listdir(path)

  for d in dir_list: 
    df_temp =  pd.read_pickle(path+'/'+d)
    #df_temp['datetime'] = pd.to_datetime(df_temp['SQLDATE'], format='%Y%m%d')
    #df_temp = df_temp.loc[df_temp['datetime']==d.split('.')[0]]

    if (len(df_temp.index) == 0) == False:

      row = pd.Series([country.name, c, d.split('.')[0], df_temp['AvgTone'].mean(), df_temp['NumArticles'].sum()], index=df_GDELT.columns)

      df_GDELT = df_GDELT.append(row, ignore_index=True)




Country(alpha_2='UA', alpha_3='UKR', flag='🇺🇦', name='Ukraine', numeric='804') UKR UA
Country(alpha_2='AF', alpha_3='AFG', flag='🇦🇫', name='Afghanistan', numeric='004', official_name='Islamic Republic of Afghanistan') AFG AF
Country(alpha_2='ET', alpha_3='ETH', flag='🇪🇹', name='Ethiopia', numeric='231', official_name='Federal Democratic Republic of Ethiopia') ETH ET
Country(alpha_2='CD', alpha_3='COD', flag='🇨🇩', name='Congo, The Democratic Republic of the', numeric='180') COD CD
Country(alpha_2='YE', alpha_3='YEM', flag='🇾🇪', name='Yemen', numeric='887', official_name='Republic of Yemen') YEM YE
Country(alpha_2='NE', alpha_3='NER', flag='🇳🇪', name='Niger', numeric='562', official_name='Republic of the Niger') NER NE
Country(alpha_2='IR', alpha_3='IRN', flag='🇮🇷', name='Iran, Islamic Republic of', numeric='364', official_name='Islamic Republic of Iran') IRN IR
Country(alpha_2='ML', alpha_3='MLI', flag='🇲🇱', name='Mali', numeric='466', official_name='Republic of Mali') MLI ML
Country(al

In [48]:
df_GDELT.country.unique()

array(['Afghanistan', 'Ethiopia', 'Congo, The Democratic Republic of the',
       'Niger', 'Iran, Islamic Republic of', 'Mali', 'Chad', 'Libya',
       'Mozambique', 'Burkina Faso', 'Nigeria',
       'Central African Republic', 'Somalia'], dtype=object)

In [49]:
df_GDELT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4241 entries, 0 to 4240
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         4241 non-null   object 
 1   iso3            4241 non-null   object 
 2   date            4241 non-null   object 
 3   AvgTone         4241 non-null   float64
 4   AccNumArticles  4241 non-null   object 
dtypes: float64(1), object(4)
memory usage: 165.8+ KB


## Pickle for later use in app

In [50]:
df_GDELT.to_pickle(local_path+"/df_5.pkl")

## Create a scatter showing average tone and number of articles overtime

In [51]:
fig = px.scatter(df_GDELT, x="date", y="AccNumArticles", color="country",# symbol="event_type",size='AvgTone'
                 )

fig.show()

In [52]:
fig = px.scatter(df_GDELT, x="date", y="AvgTone", color="country",# symbol="event_type",size='AvgTone'
                 )

fig.show()