In [1]:
import pandas as pd
import numpy as np
import json 

Note: when cleaning up, import data as modules 

### Loading Data and Preprocessing

Load data

In [2]:
with open('../acled_data/acleddata_jan_aug.json') as f:
    data = json.load(f)

Convert to pandas df

In [3]:
df = pd.DataFrame(data)

##### Converting fatalities to numeric format

In [4]:
df["fatalities"] = pd.to_numeric(df["fatalities"], errors='coerce')


In [5]:
df["latitude"] = pd.to_numeric(df["latitude"], errors='coerce')
df["longitude"] = pd.to_numeric(df["longitude"], errors='coerce')

#### Wounded feature with SpaCy

In [6]:
#Extracting the number of wounded from the notes column and adding a new column 'wounded'
import spacy
from word2number import w2n

nlp = spacy.load('en_core_web_md')

def extract_wounded(text):
    doc = nlp(text)
    wounded_count = 0

    for token in doc:
        if token.text.lower() == 'wounded':
            for child in token.children:
                if child.pos_ == 'NUM':
                    try:
                        wounded_count += int(child.text)
                    except ValueError:
                        wounded_count += w2n.word_to_num(child.text)
            if wounded_count == 0:
                for ancestor in token.ancestors:
                    if ancestor.pos_ == 'NUM':
                        try:
                            wounded_count += int(ancestor.text)
                        except ValueError:
                            wounded_count += w2n.word_to_num(ancestor.text)
                        break

    return wounded_count

df['wounded'] = df['notes'].apply(extract_wounded)


In [7]:
# Step 1a: Parse event_date into a datetime format and extract month and year
df["event_date"] = pd.to_datetime(df["event_date"])
df["month"] = df["event_date"].dt.month
df["year"] = df["event_date"].dt.year


In [8]:
# Combining fatalities + wounded into casualties
df['casualties'] = df['fatalities']+df['wounded']

In [60]:
grouped_columns = ["admin1", "admin2", "admin3", "location", "latitude", "longitude", "event_date", "year", "month", "event_type", "civilian_targeting"]

# Aggregate data by location, date, and event_type
df_agg = df.groupby(grouped_columns).agg(
    num_events=pd.NamedAgg(column='event_type', aggfunc='size'),
    total_casualties=pd.NamedAgg(column='casualties', aggfunc='sum')
).reset_index()


#### Battlefront Proximity Feature

In [61]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Earth radius in kilometers
    earth_radius = 6371

    # Calculate the distance
    distance = earth_radius * c
    return distance


In [62]:
#Define coordinates of the the battlefront towns
bakhmut_coords = (48.5956, 37.9999)
soledar_coords = (48.6833, 38.0667)
avdiivka_coords = (48.1394, 37.7497)
vuhledar_coords = (48.7798, 37.2490)
robotyne_coords = (47.44992394238662, 35.83787190517212)
kupiansk_coords = (49.7160738622855, 37.596104878691285)

In [63]:
#apply the haversine function to the dataset to calculate 
#the distances to each town and find the minimum distance:
df_agg['distance_to_bakhmut'] = haversine(df_agg['latitude'], df_agg['longitude'], bakhmut_coords[0], bakhmut_coords[1])
df_agg['distance_to_soledar'] = haversine(df_agg['latitude'], df_agg['longitude'], soledar_coords[0], soledar_coords[1])
df_agg['distance_to_avdiivka'] = haversine(df_agg['latitude'], df_agg['longitude'], avdiivka_coords[0], avdiivka_coords[1])
df_agg['distance_to_vuhledar'] = haversine(df_agg['latitude'], df_agg['longitude'], vuhledar_coords[0], vuhledar_coords[1])
df_agg['distance_to_robotyne'] = haversine(df_agg['latitude'], df_agg['longitude'], robotyne_coords[0], robotyne_coords[1])
df_agg['distance_to_kupiansk'] = haversine(df_agg['latitude'], df_agg['longitude'], kupiansk_coords[0], kupiansk_coords[1])


df_agg['min_distance_to_battlefront'] = df_agg[['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar', 'distance_to_robotyne', 'distance_to_kupiansk']].min(axis=1)

# Drop the temporary distance columns
df_agg = df_agg.drop(columns=['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar', 'distance_to_robotyne', 'distance_to_kupiansk'])



#### One-Hot Encoding

In [64]:
# Encode civilian_targeting as a binary column
df_agg['civilian_targeting_encoded'] = df_agg['civilian_targeting'].apply(lambda x: 1 if x == 'Civilian targeting' else 0)

In [65]:
# One-hot encode event_type and incorporate civilian_targeting_encoded for 'Explosions/Remote violence'
df_agg['event_battles'] = (df_agg['event_type'] == 'Battles').astype(int)
df_agg['event_explosions'] = ((df_agg['event_type'] == 'Explosions/Remote violence') & (df_agg['civilian_targeting_encoded'] == 0)).astype(int)
df_agg['event_explosions_civilians'] = ((df_agg['event_type'] == 'Explosions/Remote violence') & (df_agg['civilian_targeting_encoded'] == 1)).astype(int)
df_agg['event_violence_civilians'] = (df_agg['event_type'] == 'Violence against civilians').astype(int)


### T-SNE Variant

Scaling

In [66]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE

# Selecting the columns we want to input into t-SNE
features = ['num_events', 'total_casualties', 'event_battles', 'event_explosions', 'event_explosions_civilians', 'event_violence_civilians']

# Normalize the features
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df_agg[features])

1-D t-SNE

In [None]:
# # Apply t-SNE. We use 1 component to obtain a single value for hazard.
# tsne = TSNE(n_components=1, random_state=42)
# df_agg['tsne_hazard'] = tsne.fit_transform(scaled_features)

2-D Embeddings from t-SNE

In [67]:
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(scaled_features)
df_agg['tsne_0'] = tsne_results[:, 0]
df_agg['tsne_1'] = tsne_results[:, 1]

Applying KDE

In [68]:
from scipy.stats import gaussian_kde

# KDE on t-SNE results
kde = gaussian_kde(np.vstack([df_agg['tsne_0'], df_agg['tsne_1']]))
density = kde(np.vstack([df_agg['tsne_0'], df_agg['tsne_1']]))

# Identify the densest point
densest_idx = np.argmax(density)
densest_point = (df_agg.iloc[densest_idx]['tsne_0'], df_agg.iloc[densest_idx]['tsne_1'])


Compute Densest Point

In [69]:
df_agg['distance_to_densest'] = np.sqrt((df_agg['tsne_0'] - densest_point[0])**2 + (df_agg['tsne_1'] - densest_point[1])**2)

Convert Distances into Scores

In [70]:
decay_factor = 0.05  # This is just an example value; adjust as needed
df_agg['tsne_distance_points'] = 50 * np.exp(-decay_factor * df_agg['distance_to_densest'])


Proximity based Score on 'min_distance_to_battlefront'

In [71]:
df_agg['distance_points'] = 100 * np.exp(-decay_factor * df_agg['min_distance_to_battlefront'])

Hazard Score

In [72]:
# Combine tsne_distance_points with distance_points using the weights you provided to obtain the final hazard_score.
weight_distance = 0.65  # This is the weight for the distance to the battlefront
weight_tsne = 0.35      # This is the weight for the t-SNE derived score

df_agg['hazard_score'] = (df_agg['distance_points'] * weight_distance) + (df_agg['tsne_distance_points'] * weight_tsne)

# Normalize the hazard_score
min_hazard = df_agg['hazard_score'].min()
max_hazard = df_agg['hazard_score'].max()

# Apply Min-Max scaling to adjust scores between 0 and 100
df_agg['scaled_hazard_score'] = ((df_agg['hazard_score'] - min_hazard) / (max_hazard - min_hazard)) * 100
df_agg['scaled_hazard_score'] = df_agg['scaled_hazard_score'].round(0)


Old TSNE - without KDE

In [49]:
decay_factor = 0.1  # This is just an example value; adjust as needed
df_agg['distance_points'] = 50 * np.exp(-decay_factor * df_agg['min_distance_to_battlefront'])

# Rescale both features
scaler = MinMaxScaler(feature_range=(0, 100))
df_agg[['distance_points', 'tsne_hazard']] = scaler.fit_transform(df_agg[['distance_points', 'tsne_hazard']])

weight_distance = 0.65
weight_tsne = 0.35 #0.3

# Calculate weighted hazard score
df_agg['hazard_score'] = (df_agg['distance_points'] * weight_distance) + (df_agg['tsne_hazard'] * weight_tsne)


In [50]:
# Calculate min and max of the original hazard score
min_hazard = df_agg['hazard_score'].min()
max_hazard = df_agg['hazard_score'].max()

# Apply Min-Max scaling to adjust scores between 0 and 100
df_agg['scaled_hazard_score'] = ((df_agg['hazard_score'] - min_hazard) / (max_hazard - min_hazard)) * 100

df_agg['scaled_hazard_score'] = df_agg['scaled_hazard_score'].round(0)

#### Hazard Score: Domain-based Variant

In [851]:
# Keep the distance points calculation as is
df_agg['distance_points'] = 50 * np.exp(-decay_factor * df_agg['min_distance_to_battlefront'])

# Adjust the event type points calculation based on our new approach
def event_points(row):
    # Check for 'Explosions/Remote violence' with civilian targeting
    if row['event_type'] == 'Explosions/Remote violence' and row['civilian_targeting_encoded'] == 1:
        return 15  # As per your earlier score for 'Explosions with civilians'

    # Check for 'Explosions/Remote violence' without civilian targeting
    elif row['event_type'] == 'Explosions/Remote violence' and row['civilian_targeting_encoded'] == 0:
        return 10  # As per your earlier score for 'Explosions'

    # Check for 'Battles'
    elif row['event_type'] == 'Battles':
        return 40  # As per your earlier score for 'Battles'
    
    # Check for 'Violence against civilians'
    elif row['event_type'] == 'Violence against civilians':
        return 25  # Assigning 25 as an arbitrary score for 'Violence against civilians', adjust as needed

    # Otherwise, assume 'Others'
    else:
        return 5  # As per your earlier score for 'Others'

df_agg['event_type_points'] = df_agg.apply(event_points, axis=1)

# Keep the frequency points and casualty points calculations as they are
df_agg['frequency_points'] = 20 * (df_agg['num_events'] / df_agg['num_events'].max())
df_agg['casualty_points'] = 10 * (df_agg['total_casualties'] / df_agg['total_casualties'].max())

# Final hazard score calculation
df_agg['hazard_score'] = df_agg['distance_points'] + df_agg['event_type_points'] + df_agg['frequency_points'] + df_agg['casualty_points']

In [852]:
df_agg['hazard_score'] = df_agg['hazard_score'].clip(0, 100)


In [853]:
df_agg['hazard_score'] = df_agg['hazard_score'].round(0)


#### Hazard Level diffusion

In [854]:
# Step 1: Extract high hazard locations
high_hazard_locs = df_agg[df_agg['hazard_score'] >= 80][['latitude', 'longitude', 'hazard_score']].values

# Step 2: Function to compute distances
def compute_distances(lat, lon, high_hazard_locs):
    lat_diff = np.radians(high_hazard_locs[:, 0] - lat)
    lon_diff = np.radians(high_hazard_locs[:, 1] - lon)
    
    a = np.sin(lat_diff / 2.0)**2 + \
        np.cos(np.radians(lat)) * np.cos(np.radians(high_hazard_locs[:, 0])) * \
        np.sin(lon_diff / 2.0)**2
    
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distances = 6371.01 * c  # Convert to kilometers
    return distances

# Filter df_agg to get locations with hazard_score < 60
low_hazard_df = df_agg[df_agg['hazard_score'] < 70].copy()

# Generate distances matrix for locations with hazard score < 60
low_hazard_df['distances'] = low_hazard_df.apply(lambda row: compute_distances(row['latitude'], row['longitude'], high_hazard_locs), axis=1)

# Step 3: Function to compute inherited hazard
def inherited_hazard(distances, hazard_scores):
    max_distance = 80  # in km

    # Calculate decayed values based on distance: linear decay from 0.5 at 0km to 0 at 40km
    decayed_weights = 0.2 * (1 - (distances / max_distance))

    # Ensure that distances beyond 40km don't contribute
    decayed_weights[distances > max_distance] = 0
    
    # Multiply the decayed weights by the hazard scores to get the inherited values
    inherited_values = decayed_weights * hazard_scores

    return sum(inherited_values)

# Compute inherited hazard and update the final hazard score for locations with hazard score < 60
low_hazard_df['inherited_hazard_level'] = low_hazard_df['distances'].apply(lambda d: inherited_hazard(d, high_hazard_locs[:, 2]))
low_hazard_df['hazard_score'] += low_hazard_df['inherited_hazard_level']
low_hazard_df['hazard_score'] = low_hazard_df['hazard_score'].clip(0, 100)

# Update the original df_agg with updated hazard scores for locations with hazard score < 60
df_agg.update(low_hazard_df)


Formatting data to handle inconsistencies with naming and missing values

In [855]:
df_agg.loc[df_agg['admin2']=='Kyiv', 'admin3'] = 'Kyiv'

In [856]:
df_agg.loc[df_agg['location']=='Kherson', 'admin2'] = 'Khersonskyi'

In [857]:
df_agg = df_agg[~df_agg['admin3'].str.strip().eq('')]


In [858]:
df_agg.to_csv('Hazards_latest.csv', index=False)

In [74]:
df_agg[df_agg['scaled_hazard_score']>30]

Unnamed: 0,admin1,admin2,admin3,location,latitude,longitude,event_date,year,month,event_type,...,event_explosions,event_explosions_civilians,event_violence_civilians,tsne_0,tsne_1,distance_to_densest,tsne_distance_points,distance_points,hazard_score,scaled_hazard_score
1066,Donetsk,Bakhmutskyi,Bakhmutska,Andriivka,48.5008,37.9680,2023-01-02,2023,1,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.000000,50.000000,58.275851,55.379303,67.0
1067,Donetsk,Bakhmutskyi,Bakhmutska,Andriivka,48.5008,37.9680,2023-05-29,2023,5,Battles,...,0,0,0,182.456253,6.856316,261.328979,0.000106,58.275851,37.879340,46.0
1068,Donetsk,Bakhmutskyi,Bakhmutska,Andriivka,48.5008,37.9680,2023-06-13,2023,6,Battles,...,0,0,0,182.456253,6.856316,261.328979,0.000106,58.275851,37.879340,46.0
1069,Donetsk,Bakhmutskyi,Bakhmutska,Andriivka,48.5008,37.9680,2023-07-08,2023,7,Battles,...,0,0,0,182.456253,6.856316,261.328979,0.000106,58.275851,37.879340,46.0
1070,Donetsk,Bakhmutskyi,Bakhmutska,Andriivka,48.5008,37.9680,2023-07-09,2023,7,Battles,...,0,0,0,182.456253,6.856316,261.328979,0.000106,58.275851,37.879340,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26846,Zaporizhia,Zaporizkyi,Tavriiska,Yurkivka,47.6545,35.6550,2023-02-01,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.000000,50.000000,26.491227,34.719297,42.0
26847,Zaporizhia,Zaporizkyi,Tavriiska,Yurkivka,47.6545,35.6550,2023-02-02,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.000000,50.000000,26.491227,34.719297,42.0
26848,Zaporizhia,Zaporizkyi,Tavriiska,Yurkivka,47.6545,35.6550,2023-03-13,2023,3,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.000000,50.000000,26.491227,34.719297,42.0
26849,Zaporizhia,Zaporizkyi,Tavriiska,Yurkivka,47.6545,35.6550,2023-05-03,2023,5,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.000000,50.000000,26.491227,34.719297,42.0


In [78]:
df_agg[df_agg['location']=='Mariupol']

Unnamed: 0,admin1,admin2,admin3,location,latitude,longitude,event_date,year,month,event_type,...,event_explosions,event_explosions_civilians,event_violence_civilians,tsne_0,tsne_1,distance_to_densest,tsne_distance_points,distance_points,hazard_score,scaled_hazard_score
7851,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-01-20,2023,1,Explosions/Remote violence,...,1,0,0,16.026562,-286.081512,295.733185,1.893239e-05,0.350733,0.227983,0.0
7852,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-01-24,2023,1,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,0.350733,17.727976,21.0
7853,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-02,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,0.350733,17.727976,21.0
7854,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-06,2023,2,Battles,...,0,0,0,184.300934,-21.210842,263.309448,9.578064e-05,0.350733,0.22801,0.0
7855,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-07,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,0.350733,17.727976,21.0
7856,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-21,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,0.350733,17.727976,21.0
7857,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-22,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,0.350733,17.727976,21.0
7858,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-23,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,0.350733,17.727976,21.0
7859,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-24,2023,2,Explosions/Remote violence,...,1,0,0,-78.562019,-5.883148,0.0,50.0,0.350733,17.727976,21.0
7860,Donetsk,Mariupolskyi,Mariupolska,Mariupol,47.1298,37.571,2023-02-25,2023,2,Explosions/Remote violence,...,1,0,0,120.005539,-302.297302,356.777832,8.946181e-07,0.350733,0.227977,0.0


In [845]:
df_agg['hazard_score'].min()

17.0