In [2]:
import pandas as pd
import numpy as np
import json 

Note: when cleaning up, import data as modules 

### Loading Data and Preprocessing

Load data

In [3]:
with open('../acled_data/acleddata_jan_aug.json') as f:
    data = json.load(f)

Convert to pandas df

In [4]:
df = pd.DataFrame(data)

##### Converting fatalities to numeric format

In [5]:
df["fatalities"] = pd.to_numeric(df["fatalities"], errors='coerce')


In [6]:
df["latitude"] = pd.to_numeric(df["latitude"], errors='coerce')
df["longitude"] = pd.to_numeric(df["longitude"], errors='coerce')

#### Wounded feature with SpaCy

In [7]:
#Extracting the number of wounded from the notes column and adding a new column 'wounded'
import spacy
from word2number import w2n

nlp = spacy.load('en_core_web_md')

def extract_wounded(text):
    doc = nlp(text)
    wounded_count = 0

    for token in doc:
        if token.text.lower() == 'wounded':
            for child in token.children:
                if child.pos_ == 'NUM':
                    try:
                        wounded_count += int(child.text)
                    except ValueError:
                        wounded_count += w2n.word_to_num(child.text)
            if wounded_count == 0:
                for ancestor in token.ancestors:
                    if ancestor.pos_ == 'NUM':
                        try:
                            wounded_count += int(ancestor.text)
                        except ValueError:
                            wounded_count += w2n.word_to_num(ancestor.text)
                        break

    return wounded_count

df['wounded'] = df['notes'].apply(extract_wounded)


In [8]:
# Step 1a: Parse event_date into a datetime format and extract month and year
df["event_date"] = pd.to_datetime(df["event_date"])
df["month"] = df["event_date"].dt.month
df["year"] = df["event_date"].dt.year


In [9]:
# Combining fatalities + wounded into casualties
df['casualties'] = df['fatalities']+df['wounded']

In [10]:
grouped_columns = ["admin1", "admin2", "admin3", "location", "latitude", "longitude", "event_date", "year", "month", "event_type", "civilian_targeting"]

# Aggregate data by location, date, and event_type
df_agg = df.groupby(grouped_columns).agg(
    num_events=pd.NamedAgg(column='event_type', aggfunc='size'),
    total_casualties=pd.NamedAgg(column='casualties', aggfunc='sum')
).reset_index()


#### Battlefront Proximity Feature

In [11]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Earth radius in kilometers
    earth_radius = 6371

    # Calculate the distance
    distance = earth_radius * c
    return distance


In [12]:
#Define coordinates of the the battlefront towns
bakhmut_coords = (48.5956, 37.9999)
soledar_coords = (48.6833, 38.0667)
avdiivka_coords = (48.1394, 37.7497)
vuhledar_coords = (48.7798, 37.2490)
robotyne_coords = (47.44992394238662, 35.83787190517212)
kupiansk_coords = (49.7160738622855, 37.596104878691285)

In [13]:
#apply the haversine function to the dataset to calculate 
#the distances to each town and find the minimum distance:
df_agg['distance_to_bakhmut'] = haversine(df_agg['latitude'], df_agg['longitude'], bakhmut_coords[0], bakhmut_coords[1])
df_agg['distance_to_soledar'] = haversine(df_agg['latitude'], df_agg['longitude'], soledar_coords[0], soledar_coords[1])
df_agg['distance_to_avdiivka'] = haversine(df_agg['latitude'], df_agg['longitude'], avdiivka_coords[0], avdiivka_coords[1])
df_agg['distance_to_vuhledar'] = haversine(df_agg['latitude'], df_agg['longitude'], vuhledar_coords[0], vuhledar_coords[1])
df_agg['distance_to_robotyne'] = haversine(df_agg['latitude'], df_agg['longitude'], robotyne_coords[0], robotyne_coords[1])
df_agg['distance_to_kupiansk'] = haversine(df_agg['latitude'], df_agg['longitude'], kupiansk_coords[0], kupiansk_coords[1])


df_agg['min_distance_to_battlefront'] = df_agg[['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar', 'distance_to_robotyne', 'distance_to_kupiansk']].min(axis=1)

# Drop the temporary distance columns
df_agg = df_agg.drop(columns=['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar', 'distance_to_robotyne', 'distance_to_kupiansk'])



#### One-Hot Encoding

In [14]:
# Encode civilian_targeting as a binary column
df_agg['civilian_targeting_encoded'] = df_agg['civilian_targeting'].apply(lambda x: 1 if x == 'Civilian targeting' else 0)

In [15]:
# One-hot encode event_type and incorporate civilian_targeting_encoded for 'Explosions/Remote violence'
df_agg['event_battles'] = (df_agg['event_type'] == 'Battles').astype(int)
df_agg['event_explosions'] = ((df_agg['event_type'] == 'Explosions/Remote violence') & (df_agg['civilian_targeting_encoded'] == 0)).astype(int)
df_agg['event_explosions_civilians'] = ((df_agg['event_type'] == 'Explosions/Remote violence') & (df_agg['civilian_targeting_encoded'] == 1)).astype(int)
df_agg['event_violence_civilians'] = (df_agg['event_type'] == 'Violence against civilians').astype(int)


### T-SNE Variant

Scaling

In [16]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

# Selecting the columns we want to input into t-SNE
features = ['num_events', 'total_casualties', 'event_battles', 'event_explosions', 'event_explosions_civilians', 'event_violence_civilians']

# Normalize the features
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df_agg[features])

1-D t-SNE

In [None]:
# # Apply t-SNE. We use 1 component to obtain a single value for hazard.
# tsne = TSNE(n_components=1, random_state=42)
# df_agg['tsne_hazard'] = tsne.fit_transform(scaled_features)

2-D Embeddings from t-SNE

In [17]:
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(scaled_features)
df_agg['tsne_0'] = tsne_results[:, 0]
df_agg['tsne_1'] = tsne_results[:, 1]

Applying KDE

In [18]:
from scipy.stats import gaussian_kde

# KDE on t-SNE results
kde = gaussian_kde(np.vstack([df_agg['tsne_0'], df_agg['tsne_1']]))
density = kde(np.vstack([df_agg['tsne_0'], df_agg['tsne_1']]))

# Identify the densest point
densest_idx = np.argmax(density)
densest_point = (df_agg.iloc[densest_idx]['tsne_0'], df_agg.iloc[densest_idx]['tsne_1'])


Compute Densest Point

In [19]:
df_agg['distance_to_densest'] = np.sqrt((df_agg['tsne_0'] - densest_point[0])**2 + (df_agg['tsne_1'] - densest_point[1])**2)

Convert Distances into Scores

In [20]:
decay_factor = 0.05  # This is just an example value; adjust as needed
df_agg['tsne_distance_points'] = 50 * np.exp(-decay_factor * df_agg['distance_to_densest'])


Proximity based Score on 'min_distance_to_battlefront'

In [21]:
df_agg['distance_points'] = 100 * np.exp(-decay_factor * df_agg['min_distance_to_battlefront'])

Hazard Score

In [22]:
# Combine tsne_distance_points with distance_points using the weights you provided to obtain the final hazard_score.
weight_distance = 0.7  # This is the weight for the distance to the battlefront
weight_tsne = 0.3     # This is the weight for the t-SNE derived score

df_agg['hazard_score'] = (df_agg['distance_points'] * weight_distance) + (df_agg['tsne_distance_points'] * weight_tsne)

# Normalize the hazard_score
min_hazard = df_agg['hazard_score'].min()
max_hazard = df_agg['hazard_score'].max()

# Apply Min-Max scaling to adjust scores between 0 and 100
df_agg['hazard_score'] = ((df_agg['hazard_score'] - min_hazard) / (max_hazard - min_hazard)) * 100
df_agg['hazard_score'] = df_agg['hazard_score'].round(0)


In [23]:
df_agg.loc[df_agg['min_distance_to_battlefront'] <= 30, 'hazard_score'] = 100


In [24]:
df_agg.loc[df_agg['hazard_score'] < 30, 'hazard_score'] = 30


In [25]:
df_agg.loc[df_agg['event_type'] == 'Battles', 'hazard_score'] = 100


#### Hazard Score: Domain-based Variant

In [851]:
# # Keep the distance points calculation as is
# df_agg['distance_points'] = 50 * np.exp(-decay_factor * df_agg['min_distance_to_battlefront'])

# # Adjust the event type points calculation based on our new approach
# def event_points(row):
#     # Check for 'Explosions/Remote violence' with civilian targeting
#     if row['event_type'] == 'Explosions/Remote violence' and row['civilian_targeting_encoded'] == 1:
#         return 15  # As per your earlier score for 'Explosions with civilians'

#     # Check for 'Explosions/Remote violence' without civilian targeting
#     elif row['event_type'] == 'Explosions/Remote violence' and row['civilian_targeting_encoded'] == 0:
#         return 10  # As per your earlier score for 'Explosions'

#     # Check for 'Battles'
#     elif row['event_type'] == 'Battles':
#         return 40  # As per your earlier score for 'Battles'
    
#     # Check for 'Violence against civilians'
#     elif row['event_type'] == 'Violence against civilians':
#         return 25  # Assigning 25 as an arbitrary score for 'Violence against civilians', adjust as needed

#     # Otherwise, assume 'Others'
#     else:
#         return 5  # As per your earlier score for 'Others'

# df_agg['event_type_points'] = df_agg.apply(event_points, axis=1)

# # Keep the frequency points and casualty points calculations as they are
# df_agg['frequency_points'] = 20 * (df_agg['num_events'] / df_agg['num_events'].max())
# df_agg['casualty_points'] = 10 * (df_agg['total_casualties'] / df_agg['total_casualties'].max())

# # Final hazard score calculation
# df_agg['hazard_score'] = df_agg['distance_points'] + df_agg['event_type_points'] + df_agg['frequency_points'] + df_agg['casualty_points']

#### Hazard Level diffusion

Logic:

- Identify high hazard locations
- Function to check whether there's a row with a matching admin1 column. If a match is found compute the distance between the coordinates, record the distance between them. Iterate through all locations within that admin1 border and whenever we have a shorter distance we replace the old one. Also, record the hazard score of the location which has the shortest distance to our location. 
- If our row's admin1 isn't matching with any of the locations we skip the element and move to the next.
- This location will be incremented by a decayed amount of hazard level from the hot zone, depending on how far away it is. Percentage inheritance up to testing. 

In [28]:
# Step 1: Extract high hazard locations
high_hazard_locs = df_agg[df_agg['hazard_score'] >= 80][['admin1','latitude', 'longitude', 'hazard_score']]

high_hazard_locs = high_hazard_locs.drop_duplicates(subset=['latitude', 'longitude'], keep=False)

high_hazard_locs.reset_index(drop=True)
# Step 2: Identify locations within 50km radius using row-wise function
def compute_distance(row, locations):
    if row['admin1'] in locations['admin1'].values:
        # find the index of the first occurrence
        idx = locations[locations['admin1']==row['admin1']].index[0]
        # return i, locations.loc[i]['admin1'], row['admin1']
        for i, r in locations.loc[idx:].iterrows():
            if r['admin1'] == row['admin1']:
                continue 
            else:
                return 'Yabajaba'
    else:
        return 'Missed'

# Step 2.5: test row-wise matching

df_agg['found'] = df_agg.apply(lambda x: compute_distance(x, high_hazard_locs) if x['hazard_score']<80 else None, axis=1)
# df_agg['distance_high_hazard'] = df_agg.combine(high_hazard_locs, lambda x, y: compute_distance(x,y))

In [310]:
high_hazard_locs.head()

Unnamed: 0,admin1,latitude,longitude,hazard_score
924,Dnipropetrovsk,47.5712,34.3964,100.0
1953,Donetsk,48.6678,38.1247,100.0
2274,Donetsk,48.9207,38.0425,100.0
2456,Donetsk,48.8315,38.0547,100.0
2649,Donetsk,48.7001,38.0265,100.0


In [29]:
df_agg['found'].value_counts()

found
Yabajaba    10609
Missed       1022
Name: count, dtype: int64

Formatting data to handle inconsistencies with naming and missing values

In [172]:
df_agg.loc[df_agg['admin2']=='Kyiv', 'admin3'] = 'Kyiv'

In [173]:
df_agg.loc[df_agg['location']=='Kherson', 'admin2'] = 'Khersonskyi'

In [174]:
df_agg = df_agg[~df_agg['admin3'].str.strip().eq('')]


In [182]:
df_agg.to_csv('Hazards_latest.csv', index=False)

In [181]:
df_agg[df_agg['hazard_score']<30]

Unnamed: 0,admin1,admin2,admin3,location,latitude,longitude,event_date,year,month,event_type,...,event_explosions,event_explosions_civilians,event_violence_civilians,tsne_0,tsne_1,distance_to_densest,tsne_distance_points,distance_points,hazard_score,scaled_hazard_score


In [104]:
df_agg[df_agg['location']=='Robotyne']

Unnamed: 0,admin1,admin2,admin3,location,latitude,longitude,event_date,year,month,event_type,...,event_explosions,event_explosions_civilians,event_violence_civilians,tsne_0,tsne_1,distance_to_densest,tsne_distance_points,distance_points,hazard_score,scaled_hazard_score
26141,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-03-19,2023,3,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26142,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-03-22,2023,3,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26143,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-04-12,2023,4,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26144,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-04-18,2023,4,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26145,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-06-01,2023,6,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,48.789381,49.152568,100.0
26146,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-06-08,2023,6,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26147,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-06-09,2023,6,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26148,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-06-15,2023,6,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26149,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-06-17,2023,6,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0
26150,Zaporizhia,Polohivskyi,Tokmatska,Robotyne,47.4478,35.837,2023-06-18,2023,6,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,48.789381,34.152595,100.0


In [845]:
df_agg['hazard_score'].min()

17.0