# Importing necessary Libraries

In [1]:
import pandas as pd

In [2]:
# Load the ACLED export into a DataFrame.
acled_csv_path = "ACLED Data_2025-10-13.csv"
acled = pd.read_csv(acled_csv_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
acled.head()

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,tags,timestamp
0,USA23310,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Health Workers (United States),Protesters,...,Cleveland,41.4822,-81.6697,1,WKYC Studios; Crowd Counting Consortium,Other-Subnational,"On 1 January 2020, an unknown number of people...",0,crowd size=no report,1612546518
1,USA23416,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,Rochester,43.1547,-77.6155,1,13WHAM ABC; Democrat and Chronicle; 10NBC,Subnational,"On 1 January 2020, people held a peace march o...",0,crowd size=no report,1612546518
2,USA23636,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Government of the United States (2017-2021),Protesters,...,Phoenix,33.4492,-112.0741,1,Count Love; Channel 12 (Mesa),Other-Subnational,"On 1 January 2020, a group of people gathered ...",0,crowd size=no report,1612546518
3,USA23414,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),,Protesters,...,New York - Manhattan,40.7834,-73.9663,1,CBS2 (New York),Subnational,"On 1 January 2020, protesters gathered at NYPD...",0,crowd size=no report,1753970861
4,USA23635,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),BLM: Black Lives Matter,Protesters,...,Oakland,37.8044,-122.2711,1,KTVU Fox2,Subnational,"On 1 January 2020, a group of people gathered ...",0,crowd size=no report,1612546519


# Checking the shape of the dataset

In [4]:
# (rows, columns) of the raw frame, before any cleaning.
acled.shape

(89474, 31)

# Cleaning the data for analysis

In [5]:
# Count missing values per column; these counts motivate the column and row drops that follow.
acled.isna().sum()

event_id_cnty             0
event_date                0
year                      0
time_precision            0
disorder_type             0
event_type                0
sub_event_type            0
actor1                    0
assoc_actor_1         22771
inter1                    0
actor2                81310
assoc_actor_2         86186
inter2                81310
interaction               0
civilian_targeting    88562
iso                       0
region                    0
country                   0
admin1                    0
admin2                  101
admin3                89474
location                  0
latitude                  0
longitude                 0
geo_precision             0
source                    0
source_scale              0
notes                     0
fatalities                0
tags                   4723
timestamp                 0
dtype: int64

# Dropping the `tags` field because it is unreliable (4,723 missing values)

In [6]:
# Remove the tags column; every other column is kept as-is.
acled = acled.drop("tags", axis="columns")

# Dropping fields with > 20000 missing values

In [7]:
# Count nulls per column, then drop every column with more than 20000 of them.
missing_per_column = acled.isna().sum()
sparse_columns = missing_per_column[missing_per_column > 20000].index
acled = acled.drop(columns=sparse_columns)

#  Dropping rows where the admin2 field is empty

In [8]:
# Keep only rows that have an admin2 value.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments in later cells (acled[col] = ...) do not trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
acled = acled[acled["admin2"].notna()].copy()

# Ensuring that there are no missing values

In [9]:
# Count remaining nulls per column and fail loudly if any are left, rather than
# relying on a visual check of the output (e.g. after swapping in a new export).
null_counts = acled.isna().sum()
remaining_nulls = null_counts[null_counts > 0]
assert remaining_nulls.empty, f"columns still containing nulls: {remaining_nulls.to_dict()}"
# Last expression: still display the per-column counts, as before.
null_counts

event_id_cnty     0
event_date        0
year              0
time_precision    0
disorder_type     0
event_type        0
sub_event_type    0
actor1            0
inter1            0
interaction       0
iso               0
region            0
country           0
admin1            0
admin2            0
location          0
latitude          0
longitude         0
geo_precision     0
source            0
source_scale      0
notes             0
fatalities        0
timestamp         0
dtype: int64

# Shape of the dataset after removing null values

In [10]:
# (rows, columns) after dropping the sparse columns and the rows without admin2.
acled.shape

(89373, 24)

# Datatype of each variable in the dataset

In [11]:
# Column dtypes and non-null counts as loaded, before the dtype conversion cells below.
acled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89373 entries, 0 to 89473
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   event_id_cnty   89373 non-null  object 
 1   event_date      89373 non-null  object 
 2   year            89373 non-null  int64  
 3   time_precision  89373 non-null  int64  
 4   disorder_type   89373 non-null  object 
 5   event_type      89373 non-null  object 
 6   sub_event_type  89373 non-null  object 
 7   actor1          89373 non-null  object 
 8   inter1          89373 non-null  object 
 9   interaction     89373 non-null  object 
 10  iso             89373 non-null  int64  
 11  region          89373 non-null  object 
 12  country         89373 non-null  object 
 13  admin1          89373 non-null  object 
 14  admin2          89373 non-null  object 
 15  location        89373 non-null  object 
 16  latitude        89373 non-null  float64
 17  longitude       89373 non-null  floa

### This block standardizes the ACLED DataFrame `acled` to analysis-ready dtypes. It converts the text-like fields — event_id_cnty, disorder_type, event_type, sub_event_type, actor1, inter1, interaction, region, country, admin1, admin2, location, source, source_scale, and notes — to the pandas `string` dtype. It parses event_date to datetime64[ns] and keeps year as an integer (int64). It casts time_precision, iso, geo_precision, fatalities, and timestamp to int64, and latitude and longitude to float64. Parsing uses errors='coerce', so any unparseable value becomes NaT/NaN; because NaN cannot be stored in int64, the integer casts will raise if coercion produced any missing values, whereas event_date, latitude, and longitude would silently keep NaT/NaN.

In [12]:
# Text-like columns that should use the pandas "string" dtype instead of object.
to_str = [
    "event_id_cnty",
    "disorder_type",
    "event_type",
    "sub_event_type",
    "actor1",
    "inter1",
    "interaction",
    "region",
    "country",
    "admin1",
    "admin2",
    "location",
    "source",
    "source_scale",
    "notes",
]

# Cast the whole column subset in one call and write it back in place.
acled[to_str] = acled[to_str].astype("string")

In [13]:
# Parse event_date to datetime; values that cannot be parsed become NaT.
parsed_event_date = pd.to_datetime(acled["event_date"], errors="coerce")
acled["event_date"] = parsed_event_date

# Coerce year to numeric, then pin it to int64.
# The int64 cast raises if coercion produced any NaN.
numeric_year = pd.to_numeric(acled["year"], errors="coerce")
acled["year"] = numeric_year.astype("int64")

In [14]:
# Integer-valued columns: coerce to numeric first, then pin to int64.
# The int64 cast raises if coercion produced any NaN.
int_columns = ("time_precision", "iso", "geo_precision", "fatalities", "timestamp")
for column in int_columns:
    as_number = pd.to_numeric(acled[column], errors="coerce")
    acled[column] = as_number.astype("int64")

In [15]:
# Coordinates: coerce to numeric (unparseable values become NaN) and store as float64.
for coord in ("latitude", "longitude"):
    acled[coord] = acled[coord].pipe(pd.to_numeric, errors="coerce").astype("float64")

In [16]:
# Preview the first five rows after the dtype conversions.
acled.head()

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,inter1,interaction,...,admin2,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp
0,USA23310,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Protesters,Protesters only,...,Cuyahoga,Cleveland,41.4822,-81.6697,1,WKYC Studios; Crowd Counting Consortium,Other-Subnational,"On 1 January 2020, an unknown number of people...",0,1612546518
1,USA23416,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Protesters,Protesters only,...,Monroe,Rochester,43.1547,-77.6155,1,13WHAM ABC; Democrat and Chronicle; 10NBC,Subnational,"On 1 January 2020, people held a peace march o...",0,1612546518
2,USA23636,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Protesters,Protesters only,...,Maricopa,Phoenix,33.4492,-112.0741,1,Count Love; Channel 12 (Mesa),Other-Subnational,"On 1 January 2020, a group of people gathered ...",0,1612546518
3,USA23414,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Protesters,Protesters only,...,New York,New York - Manhattan,40.7834,-73.9663,1,CBS2 (New York),Subnational,"On 1 January 2020, protesters gathered at NYPD...",0,1753970861
4,USA23635,2020-01-01,2020,1,Demonstrations,Protests,Peaceful protest,Protesters (United States),Protesters,Protesters only,...,Alameda,Oakland,37.8044,-122.2711,1,KTVU Fox2,Subnational,"On 1 January 2020, a group of people gathered ...",0,1612546519


In [17]:
# Re-inspect dtypes and non-null counts now that the conversion cells have run.
acled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89373 entries, 0 to 89473
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   event_id_cnty   89373 non-null  string        
 1   event_date      89373 non-null  datetime64[ns]
 2   year            89373 non-null  int64         
 3   time_precision  89373 non-null  int64         
 4   disorder_type   89373 non-null  string        
 5   event_type      89373 non-null  string        
 6   sub_event_type  89373 non-null  string        
 7   actor1          89373 non-null  string        
 8   inter1          89373 non-null  string        
 9   interaction     89373 non-null  string        
 10  iso             89373 non-null  int64         
 11  region          89373 non-null  string        
 12  country         89373 non-null  string        
 13  admin1          89373 non-null  string        
 14  admin2          89373 non-null  string        
 15  locatio

# Saving the cleaned data 

In [21]:
# Write the cleaned frame to disk without the index column.
# CSV stores no dtype information, so the string/datetime dtypes set above must
# be re-applied when this file is read back (e.g. parse_dates=["event_date"]).
clean_csv_path = "acled_clean.csv"
acled.to_csv(clean_csv_path, index=False)