In [1]:
pwd

'/Users/peterhaglich/Dropbox/Work/IARPA/Mercury/peterhaglich/mercury-challenge/src/ExpressScore/notebooks'

In [2]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from dateutil.parser import parse
import datetime
import calendar
import json
import os
import re

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use("fivethirtyeight")

from collections import Counter

from geopy.distance import distance

In [4]:
# --- Scenario being turned into test fixtures ---
EVENT_TYPE = "Military Action"
EVT_ABBR = "MA"
COUNTRY = "Saudi Arabia"

# Scoring month: display label and the underscore form used in file names.
month_str = "May 2018"
month_path_str = month_str.replace(" ", "_")

# First and last day of the scoring month.
FIRST_DATE, LAST_DATE = parse("2018-05-01"), parse("2018-05-31")

# --- Directory layout ---
# All locations hang off a path relative to the kernel's working directory.
# NOTE(review): confirm that four levels up is the intended repository root
# for the directory this notebook is launched from.
MERC_CHALLENGE_HOME = os.path.abspath("../../../..")

MERC_HOME = os.path.join(MERC_CHALLENGE_HOME, "..", "mercury")
EXPRESS_SCORE_PATH = os.path.join(MERC_CHALLENGE_HOME, "src", "ExpressScore")
# Listing the directory doubles as a check that the path resolved.
express_score_contents = os.listdir(EXPRESS_SCORE_PATH)
print(express_score_contents)
ES_TEST_RESOURCE_PATH = os.path.join(
    EXPRESS_SCORE_PATH, "resources", "test", "sa_ma_may_2018"
)
DATA_HOME = os.path.join(MERC_HOME, "data")
WARN_PATH = os.path.join(DATA_HOME, "baserate_warnings", "MANSA")
month_warn_path = os.path.join(WARN_PATH, month_path_str)
MANSA_GSR_PATH = os.path.join(MERC_CHALLENGE_HOME, "data", "gsr", "ma_gsr")

['.DS_Store', '__init__.py', '__pycache__', 'main', 'notebooks', 'README.md', 'resources', 'test']


In [5]:
# Load the month's base-rate warning file and keep only warnings of the
# configured event type.
br_warn_filename = "Baserate_MANSA_{}.json".format(month_path_str)
br_warn_path = os.path.join(WARN_PATH, br_warn_filename)
with open(br_warn_path, "r", encoding="utf8") as f:
    # Keep the raw document under its own name so br_warn is only ever a list.
    br_warn_doc = json.load(f)
# Filter on the EVENT_TYPE constant so the event type is set in one place.
br_warn = [w for w in br_warn_doc["payload"] if w["Event_Type"] == EVENT_TYPE]
# Per-country tally as a quick check on what was loaded.
br_country_counts = Counter(w["Country"] for w in br_warn)
print(br_country_counts)

Counter({'Syria': 1685, 'Iraq': 529, 'Lebanon': 42, 'Egypt': 35, 'Saudi Arabia': 5, 'Yemen': 5})


In [6]:
# Load the month's GSR file and keep only events of the configured event type.
gsr_filename = "MA_{}.json".format(month_path_str)
gsr_path = os.path.join(MANSA_GSR_PATH, gsr_filename)
with open(gsr_path, "r", encoding="utf8") as f:
    gsr = json.load(f)
# Filter on the EVENT_TYPE constant so the event type is set in one place.
ma_gsr = [e for e in gsr if e["Event_Type"] == EVENT_TYPE]
# Per-country tally as a quick check on what was loaded.
gsr_country_counts = Counter(e["Country"] for e in ma_gsr)
print(gsr_country_counts)

Counter({'Syria': 884, 'Iraq': 439, 'Saudi Arabia': 13, 'Egypt': 10, 'Yemen': 6, 'Lebanon': 4})


In [7]:
# Pull out the first GSR event for the configured country to inspect its fields.
country_events = [evt for evt in ma_gsr if evt["Country"] == COUNTRY]
test_evt = country_events[0]
test_evt

{'Actor': 'Royal Saudi Military',
 'Approximate_Location': 'False',
 'City': 'Najrān',
 'Country': 'Saudi Arabia',
 'Earliest_Reported_Date': '2018-05-06',
 'Event_Date': '2018-05-06',
 'Event_ID': 'MN267278',
 'Event_Subtype': 'Conflict',
 'Event_Type': 'Military Action',
 'First_Reported_Link': 'https://www.elnashra.com/news/show/1207260/%D8%A7%D9%84%D8%AF%D9%81%D8%A7%D8%B9-%D8%A7%D9%84%D8%AC%D9%88%D9%8A-%D8%A7%D9%84%D8%B3%D8%B9%D9%88%D8%AF%D9%8A-%D9%8A%D8%AF%D9%85%D8%B1-%D8%B5%D8%A7%D8%B1%D9%88%D8%AE%D9%8A%D9%86-%D8%A8%D8%A7%D9%84%D9%8A%D8%B3%D8%AA%D9%8A%D9%8A%D9%86-%D8%A3%D8%B7%D9%84%D9%82%D9%87%D9%85',
 'GSS_Link': 'https://www.elnashra.com/news/show/1207260/%D8%A7%D9%84%D8%AF%D9%81%D8%A7%D8%B9-%D8%A7%D9%84%D8%AC%D9%88%D9%8A-%D8%A7%D9%84%D8%B3%D8%B9%D9%88%D8%AF%D9%8A-%D9%8A%D8%AF%D9%85%D8%B1-%D8%B5%D8%A7%D8%B1%D9%88%D8%AE%D9%8A%D9%86-%D8%A8%D8%A7%D9%84%D9%8A%D8%B3%D8%AA%D9%8A%D9%8A%D9%86-%D8%A3%D8%B7%D9%84%D9%82%D9%87%D9%85',
 'Latitude': 17.5,
 'Longitude': 44.1667,
 'News_Source

In [8]:
# Build the country-specific GSR fixture with short, sequential Event_IDs.
cc_gsr = [e for e in ma_gsr if e["Country"] == COUNTRY]
print(len(cc_gsr))
# Renumber on shallow copies so the dicts in ma_gsr keep their original IDs
# and re-running this cell (or earlier ones) gives the same result.
cc_gsr = [dict(e, Event_ID="MN{}".format(i)) for i, e in enumerate(cc_gsr)]
out_filename = "test_cc_gsr.json"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
# ensure_ascii=False emits raw non-ASCII characters, so the encoding is
# pinned to match the utf8 used when the inputs were read.
with open(out_path, "w", encoding="utf8") as f:
    json.dump(cc_gsr, f, ensure_ascii=False, indent=2)

13


In [9]:
# Build the country-specific warning fixture: warnings for COUNTRY dated
# within [FIRST_DATE, LAST_DATE], with the subtype label normalized,
# "Target" fields removed and short sequential Warning_IDs assigned.
cc_warn = []
for w in br_warn:
    if w["Country"] != COUNTRY:
        continue
    # Parse the date once and test both bounds together.
    if not (FIRST_DATE <= parse(w["Event_Date"]) <= LAST_DATE):
        continue
    # Work on a copy that omits every key containing "Target", so the dicts
    # in br_warn are left unmodified.
    w = {k: v for k, v in w.items() if "Target" not in k}
    # Rename the "Armed Conflict" subtype to "Conflict".
    if w["Event_Subtype"] == "Armed Conflict":
        w["Event_Subtype"] = "Conflict"
    w["Warning_ID"] = "BR_{}".format(len(cc_warn))
    cc_warn.append(w)
print(len(cc_warn))
out_filename = "test_cc_warnings.json"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
# ensure_ascii=False emits raw non-ASCII characters, so the encoding is
# pinned to match the utf8 used when the inputs were read.
with open(out_path, "w", encoding="utf8") as f:
    json.dump(cc_warn, f, ensure_ascii=False, indent=2)

5


In [10]:
# Display the first event in the country-filtered GSR list.
cc_gsr[0]

{'Actor': 'Royal Saudi Military',
 'Approximate_Location': 'False',
 'City': 'Najrān',
 'Country': 'Saudi Arabia',
 'Earliest_Reported_Date': '2018-05-06',
 'Event_Date': '2018-05-06',
 'Event_ID': 'MN0',
 'Event_Subtype': 'Conflict',
 'Event_Type': 'Military Action',
 'First_Reported_Link': 'https://www.elnashra.com/news/show/1207260/%D8%A7%D9%84%D8%AF%D9%81%D8%A7%D8%B9-%D8%A7%D9%84%D8%AC%D9%88%D9%8A-%D8%A7%D9%84%D8%B3%D8%B9%D9%88%D8%AF%D9%8A-%D9%8A%D8%AF%D9%85%D8%B1-%D8%B5%D8%A7%D8%B1%D9%88%D8%AE%D9%8A%D9%86-%D8%A8%D8%A7%D9%84%D9%8A%D8%B3%D8%AA%D9%8A%D9%8A%D9%86-%D8%A3%D8%B7%D9%84%D9%82%D9%87%D9%85',
 'GSS_Link': 'https://www.elnashra.com/news/show/1207260/%D8%A7%D9%84%D8%AF%D9%81%D8%A7%D8%B9-%D8%A7%D9%84%D8%AC%D9%88%D9%8A-%D8%A7%D9%84%D8%B3%D8%B9%D9%88%D8%AF%D9%8A-%D9%8A%D8%AF%D9%85%D8%B1-%D8%B5%D8%A7%D8%B1%D9%88%D8%AE%D9%8A%D9%86-%D8%A8%D8%A7%D9%84%D9%8A%D8%B3%D8%AA%D9%8A%D9%8A%D9%86-%D8%A3%D8%B7%D9%84%D9%82%D9%87%D9%85',
 'Latitude': 17.5,
 'Longitude': 44.1667,
 'News_Source': 'E

In [11]:
# Display the first warning in the country- and date-filtered warning list.
cc_warn[0]

{'City': 'Najrān',
 'Country': 'Saudi Arabia',
 'Actor': 'Yemeni Military',
 'timestamp': '2018-05-08T7:53:50.0',
 'Longitude': 44.1277,
 'Probability': 0.7851473626,
 'Event_Subtype': 'Conflict',
 'Event_Type': 'Military Action',
 'State': 'Najrān',
 'Latitude': 17.4933,
 'Event_Date': '2018-05-11'}

In [12]:
def dist(warn, evt):
    """Return the distance in kilometers between a warning and an event.

    Both arguments are mappings with "Latitude" and "Longitude" entries.
    The result is the ``.km`` value of ``distance`` applied to the two
    (latitude, longitude) pairs.
    """
    warn_point = (warn["Latitude"], warn["Longitude"])
    evt_point = (evt["Latitude"], evt["Longitude"])
    return distance(warn_point, evt_point).km


def dist_to_warn(w, gsr_list):
    """Return the distances (km) from warning ``w`` to each event in ``gsr_list``, in order."""
    distances = []
    for evt in gsr_list:
        distances.append(dist(w, evt))
    return distances
def date_diff(warn, evt):
    """Return the signed number of days from the event date to the warning date.

    Both arguments are mappings with an "Event_Date" string. The result is
    positive when the warning's date is later than the event's.
    """
    return (parse(warn["Event_Date"]) - parse(evt["Event_Date"])).days


def date_diff_to_warn(w, gsr_list):
    """Return the signed day differences between warning ``w`` and each event in ``gsr_list``, in order."""
    deltas = []
    for evt in gsr_list:
        deltas.append(date_diff(w, evt))
    return deltas
def es_match(warn, evt):
    """Return True when the warning and the event have equal "Event_Subtype" values."""
    return warn["Event_Subtype"] == evt["Event_Subtype"]


def es_match_to_warn(w, gsr_list):
    """Return a 0/1 list marking which events in ``gsr_list`` share warning ``w``'s subtype."""
    flags = []
    for evt in gsr_list:
        flags.append(int(es_match(w, evt)))
    return flags
def actor_match(warn, evt):
    """Return True when the warning's actor is one of the event's actors.

    The event's "Actor" string is split on ";" and the warning's "Actor" must
    equal one of the pieces exactly (no whitespace or case normalization).
    """
    event_actors = evt["Actor"].split(";")
    warning_actor = warn["Actor"]
    return warning_actor in event_actors


def actor_match_to_warn(w, gsr_list):
    """Return a 0/1 list marking which events in ``gsr_list`` list warning ``w``'s actor."""
    flags = []
    for evt in gsr_list:
        flags.append(int(actor_match(w, evt)))
    return flags
def ls(dist):
    """Map a distance to a score: 1 at distance 0, decreasing linearly to 0 at 100.

    No clamping is done here, so distances above 100 give negative values.
    Note that the parameter name shadows the module-level ``dist`` function
    inside this body.
    """
    fraction_of_limit = dist / 100.0
    return 1 - fraction_of_limit


# Element-wise wrapper so ``ls`` can be applied to a whole array.
ls_vfunc = np.vectorize(ls)
def ds(date_diff):
    """Map a day difference to a score: 1 at 0 days, decreasing linearly to 0 at 4 days.

    The sign of ``date_diff`` is ignored. No clamping is done here, so
    differences beyond 4 days give negative values. Note that the parameter
    name shadows the module-level ``date_diff`` function inside this body.
    """
    days_apart = abs(date_diff)
    return 1 - days_apart / 4.0


# Element-wise wrapper so ``ds`` can be applied to a whole array.
ds_vfunc = np.vectorize(ds)

In [13]:
# Spot-check every pairwise helper on the first warning: once against the
# first event alone, then against the full country event list.
first_warn, first_evt = cc_warn[0], cc_gsr[0]

# Date difference in days.
print(date_diff(first_warn, first_evt))
print(date_diff_to_warn(first_warn, cc_gsr))

# Distance in km.
print(dist(first_warn, first_evt))
print(dist_to_warn(first_warn, cc_gsr))

# Event subtype agreement.
print(es_match(first_warn, first_evt))
print(es_match_to_warn(first_warn, cc_gsr))

# Actor agreement.
print(actor_match(first_warn, first_evt))
print(actor_match_to_warn(first_warn, cc_gsr))

5
[5, 5, -5, 2, -8, -3, -11, 2, -13, -4, -14, -14, -15]
4.20770785632621
[4.20770785632621, 180.54753953718352, 180.54753953718352, 840.8493508610503, 172.63879261550903, 180.54753953718352, 180.54753953718352, 180.54753953718352, 180.54753953718352, 180.54753953718352, 180.54753953718352, 0.0, 189.6700424968016]
True
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
False
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


In [14]:
# Show the subtype labels of the first warning and the first two events side by side.
subtype_labels = [
    cc_warn[0]["Event_Subtype"],
    cc_gsr[0]["Event_Subtype"],
    cc_gsr[1]["Event_Subtype"],
]
print(*subtype_labels)

Conflict Conflict Conflict


In [15]:
# Distance matrix in km: one row per warning, one column per GSR event.
dist_array = np.array(
    [dist_to_warn(w, cc_gsr) for w in cc_warn]
).reshape(len(cc_warn), len(cc_gsr))

warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
dist_df = pd.DataFrame(dist_array, index=warning_ids, columns=event_ids)

dist_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,4.207708,180.54754,180.54754,840.849351,172.638793,180.54754,180.54754,180.54754,180.54754,180.54754,180.54754,0.0,189.670042
BR_1,1172.818382,1321.887677,1321.887677,393.000887,1184.503806,1321.887677,1321.887677,1321.887677,1321.887677,1321.887677,1321.887677,1175.537753,1206.712422
BR_2,184.674907,0.0,0.0,966.245601,157.334543,0.0,0.0,0.0,0.0,0.0,0.0,180.54754,146.968824
BR_3,628.534464,577.107005,577.107005,793.156627,459.787786,577.107005,577.107005,577.107005,577.107005,577.107005,577.107005,626.099315,451.651608
BR_4,4.207708,180.54754,180.54754,840.849351,172.638793,180.54754,180.54754,180.54754,180.54754,180.54754,180.54754,0.0,189.670042


In [16]:
# Save the warning-by-event distance matrix as a CSV test resource.
out_filename = "test_cc_dist_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
dist_df.to_csv(path_or_buf=out_path)

In [17]:
# Location score matrix: cap every distance at 100 before applying ls, so
# anything at or beyond the cap scores exactly 0.
max_dist_array = np.full((len(cc_warn), len(cc_gsr)), 100.0)
ls_dist_array = np.minimum(dist_array, max_dist_array)
ls_array = ls_vfunc(ls_dist_array)

warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
ls_df = pd.DataFrame(ls_array, index=warning_ids, columns=event_ids)

ls_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,0.957923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
BR_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_2,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
BR_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_4,0.957923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# Save the location score matrix as a CSV test resource.
out_filename = "test_ls_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
ls_df.to_csv(path_or_buf=out_path)

In [19]:
# Signed day-difference matrix: one row per warning, one column per GSR event.
date_diff_array = np.array(
    [date_diff_to_warn(w, cc_gsr) for w in cc_warn]
).reshape(len(cc_warn), len(cc_gsr))

warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
date_diff_df = pd.DataFrame(date_diff_array, index=warning_ids, columns=event_ids)

date_diff_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,5,5,-5,2,-8,-3,-11,2,-13,-4,-14,-14,-15
BR_1,12,12,2,9,-1,4,-4,9,-6,3,-7,-7,-8
BR_2,25,25,15,22,12,17,9,22,7,16,6,6,5
BR_3,18,18,8,15,5,10,2,15,0,9,-1,-1,-2
BR_4,25,25,15,22,12,17,9,22,7,16,6,6,5


In [20]:
# Save the signed day-difference matrix as a CSV test resource.
out_filename = "test_cc_date_diff_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
date_diff_df.to_csv(path_or_buf=out_path)

In [21]:
# Date score matrix: take absolute day differences, bound them to [0, 4],
# then apply ds so anything 4 or more days apart scores exactly 0.
# Note: date_diff_array is rebound to its absolute values from here on.
matrix_shape = (len(cc_warn), len(cc_gsr))
date_diff_array = np.absolute(date_diff_array)
max_dd_array = np.full(matrix_shape, 4.0)
min_dd_array = np.zeros(matrix_shape)
ds_dd_array = np.maximum(np.minimum(date_diff_array, max_dd_array), min_dd_array)

ds_array = ds_vfunc(ds_dd_array)

warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
ds_df = pd.DataFrame(ds_array, index=warning_ids, columns=event_ids)

ds_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,0.0,0.0,0.0,0.5,0.0,0.25,0.0,0.5,0.0,0.0,0.0,0.0,0.0
BR_1,0.0,0.0,0.5,0.0,0.75,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
BR_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_3,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,0.0,0.75,0.75,0.5
BR_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Save the date score matrix as a CSV test resource.
out_filename = "test_ds_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
ds_df.to_csv(path_or_buf=out_path)

In [23]:
# Event-subtype match matrix (0/1): one row per warning, one column per GSR event.
es_array = np.array(
    [es_match_to_warn(w, cc_gsr) for w in cc_warn]
).reshape(len(cc_warn), len(cc_gsr))

warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
es_df = pd.DataFrame(es_array, index=warning_ids, columns=event_ids)

es_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,1,1,1,1,1,1,1,1,1,1,1,1,1
BR_1,0,0,0,0,0,0,0,0,0,0,0,0,0
BR_2,1,1,1,1,1,1,1,1,1,1,1,1,1
BR_3,1,1,1,1,1,1,1,1,1,1,1,1,1
BR_4,1,1,1,1,1,1,1,1,1,1,1,1,1


In [24]:
# Save the event-subtype match matrix as a CSV test resource.
out_filename = "test_es_match_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
es_df.to_csv(path_or_buf=out_path)

In [25]:
# Actor match matrix (0/1): one row per warning, one column per GSR event.
acs_array = np.array(
    [actor_match_to_warn(w, cc_gsr) for w in cc_warn]
).reshape(len(cc_warn), len(cc_gsr))

warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
acs_df = pd.DataFrame(acs_array, index=warning_ids, columns=event_ids)

acs_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,0,0,0,0,0,1,0,0,0,0,0,0,0
BR_1,1,1,1,1,1,0,1,1,1,1,1,1,1
BR_2,1,1,1,1,1,0,1,1,1,1,1,1,1
BR_3,1,1,1,1,1,0,1,1,1,1,1,1,1
BR_4,1,1,1,1,1,0,1,1,1,1,1,1,1


In [26]:
# Save the actor match matrix as a CSV test resource.
out_filename = "test_actor_match_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
acs_df.to_csv(path_or_buf=out_path)

In [27]:
# Approximate-location matrix: the per-event flag repeated on every warning row.
# The GSR stores the flag as the strings "True"/"False". They are mapped
# explicitly here; the previous eval() would have executed whatever text the
# data file contained. Any other value now raises KeyError.
approx_flag_by_text = {"True": True, "False": False}
is_approx_list = [approx_flag_by_text[e["Approximate_Location"]] for e in cc_gsr]
# List repetition tiles the event flags once per warning before reshaping
# to (warnings, events).
is_approx_array = np.array(is_approx_list*len(cc_warn)).reshape(len(cc_warn), len(cc_gsr))
is_approx_df = pd.DataFrame(is_approx_array,
                       index = [w["Warning_ID"] for w in cc_warn],
                       columns = [e["Event_ID"] for e in cc_gsr])

is_approx_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,False,False,False,True,False,False,False,False,True,True,False,False,False
BR_1,False,False,False,True,False,False,False,False,True,True,False,False,False
BR_2,False,False,False,True,False,False,False,False,True,True,False,False,False
BR_3,False,False,False,True,False,False,False,False,True,True,False,False,False
BR_4,False,False,False,True,False,False,False,False,True,True,False,False,False


In [28]:
# Save the approximate-location matrix as a CSV test resource.
out_filename = "test_approx_location_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
is_approx_df.to_csv(path_or_buf=out_path)

In [29]:
# Raw quality score: element-wise sum of the location, date, event-subtype
# and actor components, added left to right into a new array each step.
score_components = (ls_array, ds_array, es_array, acs_array)
qs_mat = score_components[0]
for component in score_components[1:]:
    qs_mat = qs_mat + component
qs_mat

array([[1.95792292, 1.        , 1.        , 1.5       , 1.        ,
        2.25      , 1.        , 1.5       , 1.        , 1.        ,
        1.        , 2.        , 1.        ],
       [1.        , 1.        , 1.5       , 1.        , 1.75      ,
        0.        , 1.        , 1.        , 1.        , 1.25      ,
        1.        , 1.        , 1.        ],
       [2.        , 3.        , 3.        , 2.        , 2.        ,
        2.        , 3.        , 3.        , 3.        , 3.        ,
        3.        , 2.        , 2.        ],
       [2.        , 2.        , 2.        , 2.        , 2.        ,
        1.        , 2.5       , 2.        , 3.        , 2.        ,
        2.75      , 2.75      , 2.5       ],
       [2.95792292, 2.        , 2.        , 2.        , 2.        ,
        1.        , 2.        , 2.        , 2.        , 2.        ,
        2.        , 3.        , 2.        ]])

In [30]:
# A pair keeps its score only when both its location score and its date
# score are nonzero; every other entry of qs_mat is zeroed in place.
no_score_mask = (ls_array == 0) | (ds_array == 0)
qs_mat[no_score_mask] = 0

warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
qs_df = pd.DataFrame(qs_mat, index=warning_ids, columns=event_ids)

qs_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9,MN10,MN11,MN12
BR_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Save the quality score matrix as a CSV test resource.
out_filename = "test_qs_mat.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
qs_df.to_csv(path_or_buf=out_path)