In [78]:
import pandas as pd
import numpy as np

### Loading Data and Preprocessing

In [132]:
import json 
with open('../Data/acleddata.json') as f:
    data = json.load(f)

In [133]:
df = pd.DataFrame.from_dict(data['data'])

##### Converting fatalities to numeric format

In [134]:
df["fatalities"] = pd.to_numeric(df["fatalities"], errors='coerce')


In [135]:
df["latitude"] = pd.to_numeric(df["latitude"], errors='coerce')
df["longitude"] = pd.to_numeric(df["longitude"], errors='coerce')

In [136]:
print(df.describe())


          latitude    longitude  fatalities
count  5000.000000  5000.000000  5000.00000
mean     48.601524    36.180707     0.59240
std       1.450057     1.921444     4.18571
min      44.612100    23.382500     0.00000
25%      47.618925    34.657400     0.00000
50%      48.139350    36.840100     0.00000
75%      49.710600    37.749700     0.00000
max      52.337900    39.371200    97.00000


##### Preprocessing

In [137]:
# Step 1a: Parse event_date into a datetime format and extract month and year
df["event_date"] = pd.to_datetime(df["event_date"])
df["month"] = df["event_date"].dt.month
df["year"] = df["event_date"].dt.year

# Step 1b: Encode categorical variables
# For simplicity, we'll use one-hot encoding for all categorical variables.
# You can experiment with other encoding techniques as well.
categorical_columns = ["event_type", "sub_event_type", "actor1", "actor2", "admin1", "admin2", "admin3"]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [138]:
# Step 1c: Aggregate data by location and month
# For example, we can sum fatalities and count the number of events for each location and month
# Aggregate data by location and month
grouped_columns = ["location", "latitude", "longitude", "year", "month"]
df_agg = df.groupby(["location", "latitude", "longitude", "year", "month"])["fatalities"].sum().reset_index()
df_agg = df_agg.rename(columns={"event_date": "num_events"})

# Aggregate the one-hot encoded categorical columns
df_cat_agg = df[grouped_columns + [col for col in df.columns if any(c in col for c in categorical_columns)]].groupby(grouped_columns).sum().reset_index()

# Merge aggregated data with aggregated one-hot encoded categorical data
df_agg = df_agg.merge(df_cat_agg, on=grouped_columns, how='left')



In [86]:
# Save the preprocessed data to a new CSV file
df_agg.to_csv("preprocessed_data.csv", index=False)

## Feature Engineering

I thought it would be a good idea to create a feature that will represent the proximity of locations to known hot zones where armed clashes occur, which in turn increases the chances of artillery shelling, drones, and missiles strikes affecting said areas. For this batch, I will limit the pool of hot zones to 4 locations: Bakhmut, Soledar, Avdiivka, Vuhledar. Those locations are known as battlefronts where frequent heavy armed clashes between opposing forces occur therefore anything close to them is very likely to be affected. 

In order to create this feature, I will compute the distance between locations in the data set to the hot zones by calculationg the distance between two sets of latitude and longitude points by using the Haversine formula.

In [139]:
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # Earth radius in kilometers
    earth_radius = 6371

    # Calculate the distance
    distance = earth_radius * c
    return distance


In [140]:
#Define coordinates of the thee battlefront towns
bakhmut_coords = (48.5956, 37.9999)
soledar_coords = (48.6833, 38.0667)
avdiivka_coords = (48.1394, 37.7497)
vuhledar_coords = (48.7798, 37.2490)

In [141]:
#apply the haversine function to the dataset to calculate 
#the distances to each town and find the minimum distance:
df['distance_to_bakhmut'] = haversine(df['latitude'], df['longitude'], bakhmut_coords[0], bakhmut_coords[1])
df['distance_to_soledar'] = haversine(df['latitude'], df['longitude'], soledar_coords[0], soledar_coords[1])
df['distance_to_avdiivka'] = haversine(df['latitude'], df['longitude'], avdiivka_coords[0], avdiivka_coords[1])
df['distance_to_vuhledar'] = haversine(df['latitude'], df['longitude'], vuhledar_coords[0], vuhledar_coords[1])

df['min_distance_to_battlefront'] = df[['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar']].min(axis=1)

# Drop the temporary distance columns
df = df.drop(columns=['distance_to_bakhmut', 'distance_to_soledar', 'distance_to_avdiivka', 'distance_to_vuhledar'])


In [142]:
#Lets see the new column
df['min_distance_to_battlefront'].describe()

count    5000.000000
mean      163.722495
std       150.277014
min         0.000000
25%        28.723117
50%       123.241402
75%       288.093564
max      1010.995940
Name: min_distance_to_battlefront, dtype: float64

As we can observe, the farthest town is ~1011 km away from the battlefront and yet it still is in this dataset meaning that it was affected by remote violance such as missile or drone strikes, despite such large distance from the battlefront.

For the second feature I will aggregate the notes column from the original dataset and create a new column of aggregated notes in df_agg. In turn, I will then use an nlp model such as spaCy to convert the aggregated notes into embeddings which can be used as features for the predictive model. 

In [143]:
df_agg = df.groupby(['location', 'latitude', 'longitude', 'year', 'month'], as_index=False).agg({
    'fatalities': 'sum',
    'min_distance_to_battlefront': 'mean'
})


In [144]:
#Aggregating notes for each location
# Group by location and concatenate the notes separated by ' ||| '
notes_agg = df.groupby('location')['notes'].apply(lambda x: ' ||| '.join(x)).reset_index()

# Merge the aggregated notes with the aggregated DataFrame
df_agg = df_agg.merge(notes_agg, on='location', how='left')


Loading spaCy

In [None]:
import spacy
nlp = spacy.load('en_core_web_md')


In [95]:
def create_embeddings(text):
    return nlp(text).vector


In [145]:
df_agg['notes_embeddings'] = df_agg['notes'].apply(create_embeddings)
df_agg = df_agg.drop(columns=['notes'])

In [146]:
# Convert 'notes_embeddings' to individual columns
notes_embeddings_df = pd.DataFrame(df_agg['notes_embeddings'].to_list(), columns=[f'note_embedding_{i}' for i in range(len(df_agg['notes_embeddings'].iloc[0]))])
df_agg = pd.concat([df_agg.drop(columns=['notes_embeddings']), notes_embeddings_df], axis=1)


In [147]:
#Save the dataset to a new CSV file
df_agg.to_csv("preprocessed_data.csv", index=False)

## Model Selection and Training

1. Splitting the data into train and test sets

In [148]:
from sklearn.model_selection import train_test_split

location_counts = df_agg['location'].value_counts()
threshold = 5  # You can adjust this threshold value as needed.

# Create a new column 'location_grouped' and replace the least common locations with 'Other'
df_agg['location_grouped'] = df_agg['location'].apply(lambda x: x if location_counts[x] > threshold else 'Other')
df_agg = df_agg.drop(columns=['location'])

X = df_agg.drop(columns=['fatalities'])
y = df_agg['fatalities']

# Stratify based on 'location_grouped' and 'month'
stratify = df_agg[['location_grouped', 'month']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=stratify, random_state=42)


2. Feature Scaling: Scale the numerical features (e.g., distance to battlefront) using standardization

In [149]:
from sklearn.preprocessing import OneHotEncoder

# Instantiate the one-hot encoder
encoder = OneHotEncoder(sparse=False)

# Fit the encoder on the entire dataset
encoder.fit(df_agg[['location_grouped']])

# Transform the 'location_grouped' column in the training set
X_train_location_encoded = encoder.transform(X_train[['location_grouped']])
X_train_location_encoded_df = pd.DataFrame(X_train_location_encoded, columns=encoder.get_feature_names_out(['location_grouped']))

# Transform the 'location_grouped' column in the test set
X_test_location_encoded = encoder.transform(X_test[['location_grouped']])
X_test_location_encoded_df = pd.DataFrame(X_test_location_encoded, columns=encoder.get_feature_names_out(['location_grouped']))

# Drop the original 'location_grouped' column and add the encoded columns
X_train_encoded = pd.concat([X_train.drop(columns=['location_grouped']).reset_index(drop=True), X_train_location_encoded_df], axis=1)
X_test_encoded = pd.concat([X_test.drop(columns=['location_grouped']).reset_index(drop=True), X_test_location_encoded_df], axis=1)




In [150]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numerical_features = ['min_distance_to_battlefront']

X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])


3. Model Selection

In [153]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Fit the linear regression model on the encoded data
lin_reg = LinearRegression()
lin_reg.fit(X_train_encoded, y_train)

# Make predictions on the test set
y_pred = lin_reg.predict(X_test_encoded)

lin_reg_mse = mean_squared_error(y_test, y_pred)

# Random Forest
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_encoded, y_train)

y_pred = rf_reg.predict(X_test_encoded)
rf_reg_mse = mean_squared_error(y_test, y_pred)


4. Hyperparameter Tuning and Cross Validation

In [157]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 30, 50],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(rf_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_encoded, y_train)

best_rf_reg = grid_search.best_estimator_

y_pred = best_rf_reg.predict(X_test_encoded)
best_rf_reg_mse = mean_squared_error(y_test, y_pred)


KeyboardInterrupt: 