In [6]:
import pickle
import re

In [7]:
with open("filtered_headlines.pickle", 'rb') as infile:
    dataset = pickle.load(infile)

print(len(dataset))

955


In [8]:
# TODO: "gunman", "SWAT", "ATF agent", "border patrol", "sheriff", any keyword followed immediately by comma

# hand-curated dictionaries of expected participant-referential phrases (PRP's)
POLICE_KWORDS = {r"(?<!police )officer", r"police(?! officer)", r"police officer", r"deput(?:y|ies)", \
                 r"(?:^|\W)cop(?:$|\W)", r"trooper", r"captain", r"(?:[a-z]\.?)*p\.?d\.?(?:$|\W)", r"marshal", \
                 r"(?:^|\W)d\.?e\.?a\.?(?:$|\W)", r"border patrol", r"\Ws\.?w\.?a\.?t\W"}
DECEAS_KWORDS = {"kidnapper", r"suspect(?:|s)(?:\W|$)", "person", r"(?:^|\W)(?:man|woman)(?:$|\W)", r"(?:^|\W)(?:men|women)(?:$|\W)", \
                 r"(?:^|\W)guy(?:$|\W)", r"(?:^|\W)boy(?:$|\W)", r"(?:^|\W)girl(?:$|\W)", r"[0-9]{1,2}(?:-| )year(?:-| )old", r"teen", \
                 r"family(?:-| )member", r"patient", r"inmate", r"guard", r"driver", r"passenger", \
                 r"veteran", r"killer", r"victim", r"father", r"mother", r"fugitive", \
                 r"hostage(?:-| )taker", r"dealer", r"rapper"}
COUNTERS = {r"[1-9]\sdead", r"[0-9]{1,2}\b", r"one", r"two", r"three", r"four", r"five", r"six",\
            r"seven", r"eight", r"nine", r"ten"}
PRONOUNS = {r"they", r"them", r"he", r"him", r"she", r"her"}
WEIRD = r"[^\w &:;,.\-—|/\"“”'‘’\s]"
DEMOGRAPHICS = [r"age", r"gender", r"race", r"state"]

# exceptions:
#     Ex-Alabama trooper shoots DA in face, is killed by police
#     Former police captain shot and killed by officer in northeast Fresno | The Fresno Bee


In [9]:
# small data correction
random_artifact = r"(?:\u00e2\u0080\u0098|\u00e2\u0080\u0099)"
for datapoint in dataset:
    if re.search(random_artifact, datapoint["headline"]):
        headline = re.sub(random_artifact, "", datapoint["headline"])
        datapoint["headline"] = headline

In [6]:
# add label fields to datapoints

from copy import deepcopy

labeled_dataset = deepcopy(dataset)

for d in labeled_dataset:
    
    h = d["headline"]
    terms = []
    
    for kw in DECEAS_KWORDS:
        if re.search(kw, h, re.IGNORECASE):
            terms.append(kw)
            
    assert(len(terms) <= 3) # at most 3 labels
    terms.extend([''] * (3 - len(terms))) # pad to length 3
    assert(len(terms) == 3)
    
    for kw in POLICE_KWORDS:
        if re.search(kw, h, re.IGNORECASE):
            terms.append(kw)
            
    assert(len(terms) <= 6) # at most 6 labels
    terms.extend([''] * (6 - len(terms))) # pad to length 6
    assert(len(terms) == 6)
    
    for i, t in enumerate(terms):
        d["term" + str(i)] = t


In [7]:
# write overall dataset to csv because analysis is easiest in R

with open("labeled_dataset.csv", 'w') as outf:
    outf.write(",".join(sorted(d.keys())) + "\n")
    for d in labeled_dataset:
        outf.write(",".join(['"' + b.replace('"', r'\"') + '"' for a,b in sorted(list(d.items()))]) + "\n")

In [None]:
################################################################################
# From here onwards is random, sprawling analysis in python, ignore it.        #
#                                                                              #
# This is just a relic from late night work getting quick and dirty stats for  #
# presentation I had the next morning.                                         #
################################################################################

In [6]:
# general count of all data demographics

from collections import Counter

age_demogr_count = Counter()
gender_demogr_count = Counter()
race_demogr_count = Counter()
state_demogr_count = Counter()
demogr_counts = [age_demogr_count, gender_demogr_count, race_demogr_count, state_demogr_count]

for demogr_count, DEMO in zip(demogr_counts, DEMOGRAPHICS):
    for d in dataset:
        demogr = d[DEMO]
        
        if demogr == '':
            demogr = "Unknown " + DEMO
            
        demogr_count.update([demogr])

for demogr_count, DEMO in zip(demogr_counts, DEMOGRAPHICS):
    print("\n" + DEMO + "\n--------")
    print("\n".join([a + (( int((15 - len(a))/8)+1 )*"\t") + str(b) for a, b in demogr_count.most_common()]))


age
--------
30 to 44	303
18 to 29	243
45 and up	203
Unknown age	196
Under 18	12

gender
--------
Male		754
Unknown gender	153
Female		50

race
--------
White		362
Unknown race	239
Black		207
Hispanic	124
Other race	25

state
--------
California	107
Texas		69
Florida		69
Arizona		43
Colorado	42
Georgia		41
Ohio		36
Tennessee	30
Nevada		30
Illinois	27
Utah		26
New Mexico	24
North Carolina	24
Missouri	23
Pennsylvania	23
Michigan	23
Arkansas	21
Washington	19
New York	19
Minnesota	19
Hawaii		18
New Jersey	18
Iowa		17
Virginia	17
Oklahoma	17
Kentucky	17
Oregon		15
Wisconsin	15
Louisiana	14
Maryland	13
Indiana		12
South Carolina	12
Idaho		11
Alabama		9
Alaska		7
Kansas		5
Mississippi	5
Maine		3
Montana		3
Massachusetts	3
South Dakota	2
New Hampshire	2
North Dakota	2
Connecticut	1
Rhode Island	1
Nebraska	1
Wyoming		1
D.C.		1


In [10]:
from collections import Counter

# make a dict with keys as PRP's and vals as sets of datapoints
deceas_PRP = {}
label_count = [0] * len(dataset)

# keys are labels and vals are counters
age_count = {}
gender_count = {}
race_count = {}
state_count = {}
demogr_counts = [age_count, gender_count, race_count, state_count]

# detect PRP's denoting the deceased
for i,d in enumerate(dataset):
    h = d["headline"]
    demogrs = [d["age"], d["gender"], d["race"], d["state"]]
    
    for kw in DECEAS_KWORDS:
        if re.search(kw, h, re.IGNORECASE):
            if kw not in deceas_PRP or not deceas_PRP[kw]:
                deceas_PRP[kw] = {h}
            else:
                deceas_PRP[kw].add(h)
            label_count[i] += 1
            
            for demogr_count, DEMO in zip(demogr_counts, DEMOGRAPHICS):
                demogr = d[DEMO]
                
                if kw not in demogr_count:
                    demogr_count[kw] = Counter([demogr])
                else:
                    demogr_count[kw].update([demogr])

unlabeled = []
overlabeled = []

unlabeled_age_count = Counter()
unlabeled_gender_count = Counter()
unlabeled_race_count = Counter()
unlabeled_state_count = Counter()
unlabeled_demogr_counts = [unlabeled_age_count, unlabeled_gender_count, \
                           unlabeled_race_count, unlabeled_state_count]

for i,d in enumerate(dataset):
    if label_count[i] == 0:
        unlabeled.append(d["headline"])
        
        for demogr_count, demogr in zip(unlabeled_demogr_counts, DEMOGRAPHICS):
            val = d[demogr]
            if val == '':
                val = "Unknown " + demogr
            demogr_count.update([val])

    elif label_count[i] > 1:
        overlabeled.append((label_count[i], d["headline"]))

#print("0  label:  " + str(len(unlabeled)))
#print("1  label:  " + str(len(label_count) - len(unlabeled) - len(overlabeled)))
#print("2+ labels: " + str(len(overlabeled))) # TODO hand correct

#print("\nlabel\t\t\t\t\tinstances\n-----\t\t\t\t\t---------")
#print("\n".join([a + ((int((31 - len(a))/8) + 2)*"\t") + str(len(b)) for a,b in deceas_PRP.items()]))

print("\n".join(unlabeled))
#print("\n".join([str(a) + ": " + b for a,b in overlabeled]))

#for demogr_count, demogr in zip(unlabeled_demogr_counts, DEMOGRAPHICS):
#    print("\n" + "unlabeled " + demogr + "\n--------")
#    print("\n".join(["\t" + a + (( int((15 - len(a))/8)+1 )*"\t") + str(b) for a, b in demogr_count.most_common()]))

#for demogr_count, demogr in zip(demogr_counts, DEMOGRAPHICS):
#    print("\n" + demogr + "\n--------")
#
#    for label, counter in demogr_count.items():
#        if '' in counter:
#            n = counter['']
#            del counter['']
#
#            key = "Unknown " + demogr
#            if key in counter:
#                counter[key] += n
#            else:
#                counter[key] = n
#
#        print(label)
#        print("\n".join(["\t" + a + (( int((15 - len(a))/8)+1 )*"\t") + str(b) for a, b in counter.most_common()]))

Officials to investigate use of deadly force by Webster County police on New Year's Eve
One killed in St. Joseph officer-involved shooting
Family speaks about loved one shot by Port Arthur police Friday
Maui police release body camera footage of deadly officer-involved shooting
Subject, officer in fatal Springfield officer-involved shooting ID'd
Chase leads to fatal officer-involved shooting in Lake Elsinore
Officer-Involved Shooting Reported In Lake Elsinore
Phoenix police involved in shooting near 36th Street and Oak
Probe ongoing in officer-involved shooting in Union City
Investigation underway after deadly officer-involved shooting in Oklahoma City
TBI Investigates Deadly MPD Officer-Involved Shooting
2 dead in separate Jacksonville police shootings Wednesday, 1 had BB gun
One dead after officer-involved shooting in Washington County
Dramatic video captures fatal officer-involved shooting Saturday in West Wendover
One dead after Fort Pierce police officer-involved shooting
The scen

In [11]:
# make a dict with keys as PRP's and vals as sets of datapoints

from collections import Counter

police_PRP = {}
label_count = [0] * len(dataset)

state_counts = {}

# detect PRP's denoting the police
for i,d in enumerate(dataset):
    h = d["headline"]
    state = d["state"]
    for kw in POLICE_KWORDS:
        if re.search(kw, h, re.IGNORECASE):
            if kw not in police_PRP or not police_PRP[kw]:
                police_PRP[kw] = {h}
            else:
                police_PRP[kw].add(h)
            label_count[i] += 1
            
            if kw not in state_counts:
                state_counts[kw] = Counter([state])
            else:
                state_counts[kw].update([state])
            
unlabeled = []
overlabeled = []

unlabeled_state_count = Counter()

for i,d in enumerate(dataset):
    if label_count[i] == 0:
        unlabeled.append(d["headline"])
        
        state = d["state"]
        if state == '':
            state = "Unknown state"
        unlabeled_state_count.update([state])
        
    elif label_count[i] > 1:
        overlabeled.append((label_count[i], d["headline"]))

#print("0  label:  " + str(len(unlabeled)))
#print("1  label:  " + str(len(label_count) - len(unlabeled) - len(overlabeled)))
#print("2+ labels: " + str(len(overlabeled))) # TODO hand correct

print("\n".join(unlabeled))
#print("\n".join([str(a) + ": " + b for a,b in overlabeled]))

#print("\nlabel\t\t\t\tinstances\n-----\t\t\t\t---------")
#print("\n".join([a + (int((31 - len(a))/8 + 1)*"\t") + str(len(b)) for a,b in police_PRP.items()]))

#print("\nunlabeled state\n--------")
#print("\n".join(["\t" + a + (( int((15 - len(a))/8)+1 )*"\t") + str(b) for a, b in unlabeled_state_count.most_common()]))

#for label, counter in state_counts.items():
#    if '' in counter:
#        n = counter['']
#        del counter['']
#
#        key = "Unknown state"
#        if key in counter:
#            counter[key] += n
#        else:
#            counter[key] = n
#
#    print("\n" + label + "\n--------")
#    print("\n".join(["\t" + a + (( int((15 - len(a))/8)+1 )*"\t") + str(b) for a, b in counter.most_common()]))


Shooting Victim in Carbon County Chase Dies; Four Arrested
Nye sheriff releases body cam footage of fatal shooting
Suspect killed in stolen car chase wanted for stealing show dogs
Suspect killed in South Salt Lake critical incident Saturday identified
Friends and family remember young man killed in Thanksgiving standoff
Hoover offers deepest sympathies after deadly Alabama mall shooting, vows transparency
4 shooting deaths in 5 days in Magnolia shocks Arkansas's small town
1 Dead, Multiple Wounded in Watts Shooting; Suspect in Custody
Details On Fatal Shooting Of Manahawkin Man Wanted In Kidnapping
Mantoloking fatal shooting: Suspect served probation for assault; K9 back home
Jesus “Chuy” Guzman Triple Murder Suspect Killed in Shootout
SWAT team fatally shoots man during Osceola County standoff, sheriff says
GBI identifies man shot, killed by cops outside Cobb apartment complex
'Don't let anybody forget it'
Cops: Man fatally shoots girlfriend’s grandma before being killed by Cobb SWAT


In [106]:
names = set()

# check for proper names
for d in dataset:
    if d["name"] == "An unidentified person":
        continue

    lastname = re.findall(r"\s(\w+)$", d["name"])
    
    if lastname:
        lastname = lastname[0]
        names.add(lastname)
        
        if re.search(lastname, d["headline"]):
            print(d["headline"])
            
            # only two hits, both false positives conflating county and surname
            
# TODO: for some reason my names are offset from their datapoints, but all the other
# demographics match up, have to fix this (should be able to join by all other features
# on fatal force csv and then migrate over name?)

# Less than 10 uses of victim name though:
# ----
# Family IDs Victim Of Deadly Shakopee Police Shooting As James Hanchett
# Jesus “Chuy” Guzman Triple Murder Suspect Killed in Shootout
# Dashcam video released of Halloween deputy-involved shooting that killed 17-year-old Jose Centeno Jr.
# Family IDs Hastings Police Shooting Victim As Keagan Johnson
# ‘ER’ Actress Vanessa Marquez Shot, Killed By Police After Brandishing BB Gun
# Woman Fatally Shot by Officers in South Pasadena ID’d as ‘ER’ Actress Vanessa Marquez
# As new details emerge, the Brett Luengo shot on I-90 is not the man loved ones knew

UPDATE: New details in Montgomery County deputy-involved shooting
Mother doesn't blame Montgomery County deputies for son's death
Person dead after officer-involved shooting in rural Webster County
Officials to investigate use of deadly force by Webster County police on New Year's Eve
Seattle police are reviewing video of fatal shooting of South King County father of two
Police ID man killed in officer-involved shooting on Green Street
Family speaks about loved one shot by Port Arthur police Friday
Lakeland officers shoot, kill driver of stolen car in crowded parking lot
Man suspected of killing his Warren grandmother dies in police scuffle
Chase leads to fatal officer-involved shooting in Lake Elsinore
Officer-Involved Shooting Reported In Lake Elsinore
2 dead in separate Jacksonville police shootings Wednesday, 1 had BB gun
Officers In Western Colorado Kill Stolen Vehicle Suspect
One dead after officer-involved shooting in Washington County
Dramatic video captures fatal officer-invol