In [2]:
import pandas as pd
import numpy as np
import json 

Note: when cleaning up, import data as modules 

### Loading Data and Preprocessing

Load data

In [3]:
with open('../acled_data/acleddata_jan_july.json') as f:
    data = json.load(f)

Convert to pandas df

In [35]:
df = pd.DataFrame(data)

##### Converting fatalities to numeric format

In [36]:
df["fatalities"] = pd.to_numeric(df["fatalities"], errors='coerce')


In [37]:
df["latitude"] = pd.to_numeric(df["latitude"], errors='coerce')
df["longitude"] = pd.to_numeric(df["longitude"], errors='coerce')

Using spacy to extract data on wounded casualties from the 'notes' column and making a new feature - 'wounded' 

In [38]:
#Extracting the number of wounded from the notes column and adding a new column 'wounded'
import spacy
from word2number import w2n

nlp = spacy.load('en_core_web_md')

def extract_wounded(text):
    doc = nlp(text)
    wounded_count = 0

    for token in doc:
        if token.text.lower() == 'wounded':
            for child in token.children:
                if child.pos_ == 'NUM':
                    try:
                        wounded_count += int(child.text)
                    except ValueError:
                        wounded_count += w2n.word_to_num(child.text)
            if wounded_count == 0:
                for ancestor in token.ancestors:
                    if ancestor.pos_ == 'NUM':
                        try:
                            wounded_count += int(ancestor.text)
                        except ValueError:
                            wounded_count += w2n.word_to_num(ancestor.text)
                        break

    return wounded_count

df['wounded'] = df['notes'].apply(extract_wounded)


In [39]:
# Step 1a: Parse event_date into a datetime format and extract month and year
df["event_date"] = pd.to_datetime(df["event_date"])
df["month"] = df["event_date"].dt.month
df["year"] = df["event_date"].dt.year


In [40]:
#Combining fatalities + wounded into casualties
df['casualties'] = df['fatalities']+df['wounded']

In [41]:
# Step 1c: Aggregate data by location and month
# For example, we can sum fatalities and count the number of events for each location and month
grouped_columns = ["admin1", "admin2", "admin3", "location", "latitude", "longitude", "event_date", "year", "month"]
df_agg = df.groupby(grouped_columns).agg({"casualties": "sum", "event_type": "count"}).reset_index()
df_agg = df_agg.rename(columns={"event_type": "num_events"})


I thought it would be a good idea to create a feature that will represent the proximity of locations to known hot zones where armed clashes occur, which in turn increases the chances of artillery shelling, drones, and missiles strikes affecting said areas. For this batch, I will limit the pool of hot zones to 4 locations: Bakhmut, Soledar, Avdiivka, Vuhledar. Those locations are known as battlefronts where frequent heavy armed clashes between opposing forces occur therefore anything close to them is very likely to be affected. 

In order to create this feature, I will compute the distance between locations in the data set to the hot zones by calculationg the distance between two sets of latitude and longitude points by using the Haversine formula.

In [43]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Earth radius in kilometers
    earth_radius = 6371

    # Calculate the distance
    distance = earth_radius * c
    return distance


In [45]:
#Define coordinates of the the battlefront towns
bakhmut_coords = (48.5956, 37.9999)
soledar_coords = (48.6833, 38.0667)
avdiivka_coords = (48.1394, 37.7497)
vuhledar_coords = (48.7798, 37.2490)

In [46]:
#apply the haversine function to the dataset to calculate 
#the distances to each town and find the minimum distance:
df_agg['distance_to_bakhmut'] = haversine(df_agg['latitude'], df_agg['longitude'], bakhmut_coords[0], bakhmut_coords[1])
df_agg['distance_to_soledar'] = haversine(df_agg['latitude'], df_agg['longitude'], soledar_coords[0], soledar_coords[1])
df_agg['distance_to_avdiivka'] = haversine(df_agg['latitude'], df_agg['longitude'], avdiivka_coords[0], avdiivka_coords[1])
df_agg['distance_to_vuhledar'] = haversine(df_agg['latitude'], df_agg['longitude'], vuhledar_coords[0], vuhledar_coords[1])

df_agg['min_distance_to_battlefront'] = df_agg[['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar']].min(axis=1)

# Drop the temporary distance columns
df_agg = df_agg.drop(columns=['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar'])



Normalizing the data with StandardScaler

In [47]:
from sklearn.preprocessing import StandardScaler

In [48]:
# Normalize the variables
scaler = StandardScaler()
normalized_df_agg = scaler.fit_transform(df_agg[['casualties', 'num_events', 'min_distance_to_battlefront']])

Old hazard score feature

In [20]:
def calculate_hazard_score(row):
    # Set the distance threshold (in km)
    distance_threshold = 100
    # Set the event count threshold
    event_threshold = 10
    
    # Transform the distance variable using a nonlinear function
    transformed_distance = np.exp(-row['min_distance_to_battlefront'] / 10)
    
    # Normalize fatalities, num_events, and transformed_distance
    max_casualties = df_agg['casualties'].max()
    max_num_events = df_agg['num_events'].max()
    max_transformed_distance = df_agg['min_distance_to_battlefront'].apply(lambda x: np.exp(-x / 10)).max()
    
    normalized_casualties = row['casualties'] / max_casualties
    normalized_num_events = row['num_events'] / max_num_events
    normalized_transformed_distance = transformed_distance / max_transformed_distance
    
    # Define the weights for each variable
    casualties_weight = 10
    num_events_weight = 40
    transformed_distance_weight = 50
    
    # Set the core hazard score based on the thresholds
    core_hazard_score = 0
    if row['min_distance_to_battlefront'] <= distance_threshold:
        core_hazard_score += 60
        if row['num_events'] >= event_threshold:
            core_hazard_score += 20

    elif row['num_events'] >= event_threshold:
        core_hazard_score += 45
    
    elif row['num_events'] >= 5:
        core_hazard_score += 35
    
    elif row['num_events'] >= 1 and row['num_events']<5:
        core_hazard_score += 20

    if row['min_distance_to_battlefront'] <= 200 and row['min_distance_to_battlefront']>distance_threshold:
        core_hazard_score += 30
        
    
    
    # Calculate the hazard score based on the core score and the weighted normalized values
    hazard_score = (
        core_hazard_score +
        normalized_casualties * casualties_weight +
        normalized_num_events * num_events_weight +
        normalized_transformed_distance * transformed_distance_weight
    )
    
    # Ensure the hazard score does not exceed 100
    hazard_score = min(hazard_score, 100)
    
    return hazard_score


In [None]:
hazard_scores = {}
for index, row in df_agg.iterrows():
    location = row['location']
    hazard_score = calculate_hazard_score(row)
    hazard_scores[location] = calculate_hazard_score(row)
    df_agg.loc[index, 'hazard_score']=hazard_score

Hazard score feature PCA variant

In [24]:
from sklearn.decomposition import PCA

In [25]:
# Perform PCA
pca = PCA(n_components=1)
pca.fit(normalized_df_agg)

# Calculate the hazard scores using the first principal component
hazard_scores = pca.transform(normalized_df_agg)

# Scale the hazard scores to a range of 0 to 100
scaled_hazard_scores = (hazard_scores - hazard_scores.min()) / (hazard_scores.max() - hazard_scores.min()) * 100

# Add the scaled hazard scores to the original dataframe
df_agg['hazard_score_pca'] = scaled_hazard_scores

# Create a dictionary of hazard scores for each location
hazard_scores_pca = dict(zip(df_agg['location'], df_agg['hazard_score_pca']))

Trying out multiple PCA components

In [26]:
n_components = 2  # or any other number you want to use

# Normalize the variables
scaler = StandardScaler()
normalized_df_agg = scaler.fit_transform(df_agg[['casualties', 'num_events', 'min_distance_to_battlefront']])

# Perform PCA
pca = PCA(n_components=n_components)
pca.fit(normalized_df_agg)

# Calculate the hazard scores using the first principal component
hazard_scores = pca.transform(normalized_df_agg)

#Compute weighted hazard scores
weighted_hazard_scores = np.dot(hazard_scores, pca.explained_variance_ratio_)


# Scale the hazard scores to a range of 0 to 100
scaled_weighted_hazard_scores = (weighted_hazard_scores - weighted_hazard_scores.min()) / (weighted_hazard_scores.max() - weighted_hazard_scores.min()) * 100

# Add the scaled hazard scores to the original dataframe
df_agg['hazard_score_pca'] = scaled_weighted_hazard_scores

# Create a dictionary of hazard scores for each location
hazard_scores_pca = dict(zip(df_agg['location'], df_agg['hazard_score_pca']))

In [27]:
def pca_hazard_score(row):
    distance_threshold = 100
    event_threshold = 10

    core_hazard_score = 0
    
    # Apply adjustments based on distance to the battlefront
    if row['min_distance_to_battlefront'] <= distance_threshold:
        core_hazard_score += 60
        if row['num_events'] >= event_threshold:
            core_hazard_score += 20
    elif row['min_distance_to_battlefront'] <= 200:
        core_hazard_score += 30
    
    # Apply adjustments based on the number of events
    if row['num_events'] >= event_threshold:
        core_hazard_score += 45
    elif row['num_events'] >= 5:
        core_hazard_score += 35
    elif row['num_events'] >= 1:
        core_hazard_score += 20

    hazard_score = row['hazard_score_pca'] + core_hazard_score
    hazard_score = min(hazard_score, 100)
    
    return hazard_score

# Apply the adjustments to the PCA-based hazard scores
df_agg['pca_hazard_score'] = df_agg.apply(pca_hazard_score, axis=1)

df_agg = df_agg.drop('hazard_score_pca', axis=1)



In [None]:
# Check the explained variance ratio for the first principal component
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained variance ratio:", explained_variance_ratio)


Hazard score TSNE variant - currently selected model for hazard score modelling

In [49]:
from sklearn.manifold import TSNE

# Perform t-SNE
tsne = TSNE(n_components=1, random_state=42)
embedded_data = tsne.fit_transform(normalized_df_agg)

# Scale the t-SNE embeddings to a range of 0 to 100
scaled_embedded_data = (embedded_data - embedded_data.min()) / (embedded_data.max() - embedded_data.min()) * 100

df_agg['hazard_score_tsne'] = scaled_embedded_data


Correlations

In [35]:
# Calculate the correlation between the adjusted PCA hazard scores and the weighted sum hazard scores
correlation = df_agg['pca_hazard_score'].corr(df_agg['hazard_score'])
print("Correlation between PCA and weighted sum hazard scores:", correlation)


Correlation between PCA and weighted sum hazard scores: 0.9338408991790772


In [29]:
# Calculate the correlation between the adjusted PCA hazard scores and the tsne hazard scores
correlation = df_agg['pca_hazard_score'].corr(df_agg['hazard_score_tsne'])
print("Correlation between PCA and tsne hazard scores:", correlation)


Correlation between PCA and tsne hazard scores: 0.8287615694458589


In [37]:
# Calculate the correlation between the adjusted tsne hazard scores and the weighted sum hazard scores
correlation = df_agg['hazard_score_tsne'].corr(df_agg['hazard_score'])
print("Correlation between tsne and weighted sum hazard scores:", correlation)


Correlation between tsne and weighted sum hazard scores: 0.886348335529343


Formatting data to handle inconsistencies with naming and missing values

In [51]:
df_agg.loc[df_agg['admin2']=='Kyiv', 'admin3'] = 'Kyiv'

In [52]:
df_agg.loc[df_agg['location']=='Kherson', 'admin2'] = 'Khersonskyi'

In [53]:
df_agg = df_agg[~df_agg['admin3'].str.strip().eq('')]


In [55]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23281 entries, 0 to 23287
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   admin1                       23281 non-null  object        
 1   admin2                       23281 non-null  object        
 2   admin3                       23281 non-null  object        
 3   location                     23281 non-null  object        
 4   latitude                     23281 non-null  float64       
 5   longitude                    23281 non-null  float64       
 6   event_date                   23281 non-null  datetime64[ns]
 7   year                         23281 non-null  int32         
 8   month                        23281 non-null  int32         
 9   casualties                   23281 non-null  int64         
 10  num_events                   23281 non-null  int64         
 11  min_distance_to_battlefront  23281 non-null  f

In [56]:
df_agg.to_csv('Hazards_latest.csv', index=False)