In [2]:
import pandas as pd
import numpy as np
import json 

Note: when cleaning up, import data as modules 

### Loading Data and Preprocessing

Load data

In [99]:
with open('../acled_data/acleddata_jan_aug.json') as f:
    data = json.load(f)

Convert to pandas df

In [177]:
df = pd.DataFrame(data)

##### Converting fatalities to numeric format

In [178]:
df["fatalities"] = pd.to_numeric(df["fatalities"], errors='coerce')


In [179]:
df["latitude"] = pd.to_numeric(df["latitude"], errors='coerce')
df["longitude"] = pd.to_numeric(df["longitude"], errors='coerce')

#### Wounded feature with SpaCy

In [180]:
#Extracting the number of wounded from the notes column and adding a new column 'wounded'
import spacy
from word2number import w2n

nlp = spacy.load('en_core_web_md')

def extract_wounded(text):
    doc = nlp(text)
    wounded_count = 0

    for token in doc:
        if token.text.lower() == 'wounded':
            for child in token.children:
                if child.pos_ == 'NUM':
                    try:
                        wounded_count += int(child.text)
                    except ValueError:
                        wounded_count += w2n.word_to_num(child.text)
            if wounded_count == 0:
                for ancestor in token.ancestors:
                    if ancestor.pos_ == 'NUM':
                        try:
                            wounded_count += int(ancestor.text)
                        except ValueError:
                            wounded_count += w2n.word_to_num(ancestor.text)
                        break

    return wounded_count

df['wounded'] = df['notes'].apply(extract_wounded)


In [181]:
# Step 1a: Parse event_date into a datetime format and extract month and year
df["event_date"] = pd.to_datetime(df["event_date"])
df["month"] = df["event_date"].dt.month
df["year"] = df["event_date"].dt.year


In [182]:
# Combining fatalities + wounded into casualties
df['casualties'] = df['fatalities']+df['wounded']

In [294]:
grouped_columns = ["admin1", "admin2", "admin3", "location", "latitude", "longitude", "event_date", "year", "month", "event_type", "civilian_targeting"]

# Aggregate data by location, date, and event_type
df_agg = df.groupby(grouped_columns).agg(
    num_events=pd.NamedAgg(column='event_type', aggfunc='size'),
    total_casualties=pd.NamedAgg(column='casualties', aggfunc='sum')
).reset_index()


#### Battlefront Proximity Feature

I thought it would be a good idea to create a feature that will represent the proximity of locations to known hot zones where armed clashes occur, which in turn increases the chances of artillery shelling, drones, and missiles strikes affecting said areas. For this batch, I will limit the pool of hot zones to 4 locations: Bakhmut, Soledar, Avdiivka, Vuhledar. Those locations are known as battlefronts where frequent heavy armed clashes between opposing forces occur therefore anything close to them is very likely to be affected. 

In order to create this feature, I will compute the distance between locations in the data set to the hot zones by calculationg the distance between two sets of latitude and longitude points by using the Haversine formula.

In [217]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Earth radius in kilometers
    earth_radius = 6371

    # Calculate the distance
    distance = earth_radius * c
    return distance


In [218]:
#Define coordinates of the the battlefront towns
bakhmut_coords = (48.5956, 37.9999)
soledar_coords = (48.6833, 38.0667)
avdiivka_coords = (48.1394, 37.7497)
vuhledar_coords = (48.7798, 37.2490)
robotyne_coords = (47.44992394238662, 35.83787190517212)
kupiansk_coords = (49.7160738622855, 37.596104878691285)

In [296]:
#apply the haversine function to the dataset to calculate 
#the distances to each town and find the minimum distance:
df_agg['distance_to_bakhmut'] = haversine(df_agg['latitude'], df_agg['longitude'], bakhmut_coords[0], bakhmut_coords[1])
df_agg['distance_to_soledar'] = haversine(df_agg['latitude'], df_agg['longitude'], soledar_coords[0], soledar_coords[1])
df_agg['distance_to_avdiivka'] = haversine(df_agg['latitude'], df_agg['longitude'], avdiivka_coords[0], avdiivka_coords[1])
df_agg['distance_to_vuhledar'] = haversine(df_agg['latitude'], df_agg['longitude'], vuhledar_coords[0], vuhledar_coords[1])
df_agg['distance_to_robotyne'] = haversine(df_agg['latitude'], df_agg['longitude'], robotyne_coords[0], robotyne_coords[1])
df_agg['distance_to_kupiansk'] = haversine(df_agg['latitude'], df_agg['longitude'], kupiansk_coords[0], kupiansk_coords[1])


df_agg['min_distance_to_battlefront'] = df_agg[['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar', 'distance_to_robotyne', 'distance_to_kupiansk']].min(axis=1)

# Drop the temporary distance columns
df_agg = df_agg.drop(columns=['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar', 'distance_to_robotyne', 'distance_to_kupiansk'])



In order to capture the importance of proximity, we will experiment with distance transformation

In [297]:
df_agg['log_min_distance_to_battlefront'] = np.log(df_agg['min_distance_to_battlefront'] + 1)


In [287]:
decay_factor = 0.1  # This is just a starting point, you can adjust based on domain knowledge or experimentation.
df_agg['exp_decay_distance'] = np.exp(-decay_factor * df_agg['min_distance_to_battlefront'])


#### One-Hot Encoding

In [298]:
# Encode civilian_targeting as a binary column
df_agg['civilian_targeting_encoded'] = df_agg['civilian_targeting'].apply(lambda x: 1 if x == 'Civilian targeting' else 0)

In [299]:
# One-hot encoding for Battles
df_agg['event_type_battles'] = (df_agg['event_type'] == 'Battles').astype(int)

# One-hot encoding for Explosions/Remote violence without civilian targeting
df_agg['event_type_explosion'] = ((df_agg['event_type'] == 'Explosion/Remote violence') & (df_agg['civilian_targeting_encoded'] == 0)).astype(int)

# One-hot encoding for Explosions/Remote violence with civilian targeting
df_agg['event_type_explosion_civilians'] = ((df_agg['event_type'] == 'Explosion/Remote violence') & (df_agg['civilian_targeting_encoded'] == 1)).astype(int)

# One-hot encoding for Violence against civilians
df_agg['event_type_violence_civilians'] = (df_agg['event_type'] == 'Violence against civilians').astype(int)


#### Normalizing the data with StandardScaler

In [300]:
from sklearn.preprocessing import StandardScaler

# Define features to scale
features_to_scale = ['total_casualties', 'num_events', 'log_min_distance_to_battlefront']


# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the features
df_agg[features_to_scale] = scaler.fit_transform(df_agg[features_to_scale])


#### Hazard Score 

Now that we have reduced the engineered features into 'tsne_reduced' to capture each locations intensity, we will now calculate a holistic hazard score based on each locations minimal distance to the battlefront. 

- First we will convert the 'min_distance_to_battlefront' feature into a distance score using exponential decay.
- Then we will combine the 'tsne_reduced' values (which already range between 0 to 100) and the 'distance_score' using a weighted sum to create the final hazard score:

Due to unreported casualties, the current model suffers from memory loss. Hence, even if a location is close to the battlefield or is the battlefield, it might lose its hazard rating simply due to missing data in casualties. In order to fix this, we will apply a lingering effect, where a high casualty event will propagate its hazard score up to 2 weeks. The criteria for this effect is to have a min distance score of 80 (so that we ignore remote areas that suffer from a missile strike on civilians) and have a high casualty event occur within a week from the event in question.

In [301]:
from sklearn.decomposition import PCA

# Features for PCA
features_for_pca = [
    'total_casualties', 'num_events', 'log_min_distance_to_battlefront',  # or 'exp_decay_distance'
    'event_type_battles', 'event_type_explosion', 'event_type_explosion_civilians', 'event_type_violence_civilians'
]

pca = PCA(n_components=2)  # or another suitable number of components
principal_components = pca.fit_transform(df_agg[features_for_pca])

# Convert to a DataFrame for easier handling
df_pca = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Optionally, calculate a hazard score based on the principal components.
# This is just a starting point; you might want to try different combinations of the PCs.
df_agg['hazard_score'] = df_pca['PC1'] + df_pca['PC2']

In [314]:
df_agg['hazard_score'] = ((df_agg['hazard_score'] - df_agg['hazard_score'].min()) / (df_agg['hazard_score'].max() - df_agg['hazard_score'].min()) * 100) *5

#### Score Propagation for Memory Loss Prevention

In [83]:
# Initialize a new column to track when a high casualty event took place
df_agg['high_casualty_event_date'] = np.nan

# Populate the high_casualty_event_date column for rows where casualties > 0
df_agg.loc[df_agg['casualties'] > 0, 'high_casualty_event_date'] = df_agg['event_date']

# Forward fill the high_casualty_event_date column for 14 days or until another casualty event
df_agg['high_casualty_event_date'] = df_agg.groupby('location')['high_casualty_event_date'].transform(lambda x: x.ffill(limit=14))

# For rows where casualties = 0 and are within 14 days of a high casualty event, and distance_score is high,
# set the hazard score to be the same as the previous day
mask = (df_agg['casualties'] == 0) & (~df_agg['high_casualty_event_date'].isna()) & (df_agg['distance_score'] >= 80)
df_agg.loc[mask, 'hazard_score'] = df_agg.groupby('location')['hazard_score'].shift(1)

# Drop the high_casualty_event_date column, it's no longer needed
df_agg = df_agg.drop(columns=['high_casualty_event_date'])


Formatting data to handle inconsistencies with naming and missing values

In [48]:
df_agg.loc[df_agg['admin2']=='Kyiv', 'admin3'] = 'Kyiv'

In [49]:
df_agg.loc[df_agg['location']=='Kherson', 'admin2'] = 'Khersonskyi'

In [50]:
df_agg = df_agg[~df_agg['admin3'].str.strip().eq('')]


In [51]:
df_agg.to_csv('Hazards_latest.csv', index=False)

In [316]:
df_agg[df_agg['location']=='Bakhmut'].tail()

Unnamed: 0,admin1,admin2,admin3,location,latitude,longitude,event_date,year,month,event_type,...,num_events,total_casualties,min_distance_to_battlefront,log_min_distance_to_battlefront,civilian_targeting_encoded,event_type_battles,event_type_explosion,event_type_explosion_civilians,event_type_violence_civilians,hazard_score
1299,Donetsk,Bakhmutskyi,Bakhmutska,Bakhmut,48.5956,37.9999,2023-07-24,2023,7,Battles,...,-0.059236,-0.146179,0.0,-2.939715,0,1,0,0,0,21.978394
1300,Donetsk,Bakhmutskyi,Bakhmutska,Bakhmut,48.5956,37.9999,2023-07-26,2023,7,Battles,...,-0.059236,-0.146179,0.0,-2.939715,0,1,0,0,0,21.978394
1301,Donetsk,Bakhmutskyi,Bakhmutska,Bakhmut,48.5956,37.9999,2023-07-27,2023,7,Battles,...,-0.059236,-0.146179,0.0,-2.939715,0,1,0,0,0,21.978394
1302,Donetsk,Bakhmutskyi,Bakhmutska,Bakhmut,48.5956,37.9999,2023-08-03,2023,8,Battles,...,-0.059236,-0.146179,0.0,-2.939715,0,1,0,0,0,21.978394
1303,Donetsk,Bakhmutskyi,Bakhmutska,Bakhmut,48.5956,37.9999,2023-08-04,2023,8,Battles,...,-0.059236,-0.146179,0.0,-2.939715,0,1,0,0,0,21.978394
