In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy
import en_core_web_sm
import pandas as pd

In [2]:
# Baseline training vocabulary: maps each use-of-force rank label to the
# phrases that describe it. Every rank repeats the two generic "police"
# phrase groups and then adds phrases specific to that level of force.
ranked_reports = {
    "Rank 1 - Police Presence": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
    ],
    "Rank 2 - Empty-hand": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "pushed and shoved with shields",
        "grabs, holds and joint locks",
        "punch and kick",
    ],
    "Rank 3 - Blunt Force": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "rubber bullets",
        "riot rounds",
        "batons",
    ],
    "Rank 4 - Chemical & Electric": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "tear gas",
        "pepper spray",
        "flashbangs, stun grenade",
        "chemical sprays",
        # Spelling fixed ("tazor" -> "taser") so the phrase can actually
        # match the word as it appears in incident reports.
        "Conducted energy devices, CED or taser",
    ],
    "Rank 5 - Lethal Force": [
        "policeman, policewoman, law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "shoot and kill",
        "open fire",
        "deadly force",
        "fatal",
        "dies",
    ],
}

In [6]:
# Extended training vocabulary: same rank labels as `ranked_reports`, with
# additional phrases per rank. Two misspelled phrases were corrected
# ("tazor" -> "taser", "prosters" -> "protesters") so that they can match
# the words as they appear in incident descriptions.
ranked_reports1 = {
    "Rank 1 - Police Presence": [
        "policeman", "policewoman", "law enforcement",
        "police officer, cop, five-o, fuzz, DHS",
        "protester", "FPS", "officer",
        "Federal Protective Services",
    ],
    "Rank 2 - Empty-hand": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz, DHS",
        "pushed and shoved with shields", "officer",
        "grabs, holds and joint locks",
        "punch and kick", "thrown to the ground", "hit",
        "charge a protester", "tackle to the ground",
        "kneel on", "arrest", "protester",
        "FPS", "Federal Protective Services", "zip-ties",
        "police chase and attack", "kicking him",
        "threw him to the ground", "handcuff him",
        "kneeling on a protester", "pinning down",
        "tackle", "shoved to the ground", "violent",
        "officer shove",
    ],
    "Rank 3 - Blunt Force": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz", "DHS",
        "rubber bullets", "officer",
        "riot rounds",
        "batons", "blood", "hit", "arrest",
        "protester", "FPS",
        "Federal Protective Services",
        "strike with baton", "violent",
    ],
    "Rank 4 - Chemical & Electric": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz", "DHS",
        "tear gas", "officer",
        "pepper spray",
        "flashbangs", "stun grenade",
        "chemical sprays",
        "Conducted energy devices, CED or taser",
        "blood", "arrest", "protester", "FPS",
        "Federal Protective Services", "pepper balls",
        "using munitions on protesters", "struck by a round",
        "fire pepper balls and tear gas",
        "struck in chest by projectile", "violent",
        "munition", "firing a riot gun", "paintball gun",
        "shots are fired", "fire explosives",
        "fire impact munitions",
    ],
    "Rank 5 - Lethal Force": [
        "policeman", "policewoman", "law enforcement",
        "police officer", "cop", "five-o", "fuzz", "DHS",
        "shoot and kill", "protester",
        "open fire", "FPS", "officer",
        "Federal Protective Services",
        "deadly force", "fatal",
        "dies", "kill", "arrest", "violent",
        "shot and killed",
    ],
}

In [3]:
class TextMatcher:
    """ Generic NLP Text Matching Model

    Each key of the training dictionary is a label and each value is a list
    of phrases describing that label. The phrases of a label are joined into
    one document, all documents are vectorised with TF-IDF, and a
    1-nearest-neighbour index is fitted on the resulting vectors. Calling
    the instance with a text returns the label of the closest document.
    """

    # Label returned when the input shares no vocabulary with the training data.
    NO_MATCH = 'Rank 0 - No Police Presence'

    class Tokenizer:
        """ Standard SpaCy Tokenizer

        Produces the lemma of every token that is not a stop word,
        punctuation or whitespace.
        """
        # Loaded once, when the class body is executed, and shared by instances.
        nlp = en_core_web_sm.load()

        def __call__(self, text: str) -> list:
            return [
                token.lemma_ for token in self.nlp(text)
                # Whitespace tokens (e.g. newlines) carry no meaning and would
                # otherwise sit between words and break up n-grams.
                if not (token.is_stop or token.is_punct or token.is_space)
            ]

    def __init__(self, train_data: dict, ngram_range=(1, 3), max_features=8000):
        """ Model training on live data at init

        :param train_data: mapping of label -> list of phrases for that label
        :param ngram_range: n-gram range passed to the TF-IDF vectoriser
        :param max_features: vocabulary size cap passed to the TF-IDF vectoriser
        """
        self.lookup = {k: ' '.join(v) for k, v in train_data.items()}
        self.name_index = list(self.lookup.keys())
        self.tfidf = TfidfVectorizer(
            ngram_range=ngram_range,
            tokenizer=self.Tokenizer(),
            max_features=max_features,
        )
        # toarray() yields a plain ndarray; todense() yields numpy.matrix,
        # which scikit-learn has deprecated as estimator input.
        train_vectors = self.tfidf.fit_transform(self.lookup.values()).toarray()
        self.knn = NearestNeighbors(
            n_neighbors=1,
            n_jobs=-1,
        ).fit(train_vectors)
        # Distance obtained for an empty query. Kept as an attribute for
        # backward compatibility; __call__ no longer compares against it.
        self.baseline, _ = self._worker('')

    def _vectorize(self, user_input):
        """ Turn one input text into a dense (1, n_features) TF-IDF array """
        # Non-string input (e.g. a missing value coming from a DataFrame
        # column) is treated as empty text instead of failing in transform().
        if not isinstance(user_input, str):
            user_input = ''
        return self.tfidf.transform([user_input]).toarray()

    def _nearest(self, vec):
        """ Return (distance, index) of the training document closest to vec """
        distances, indices = self.knn.kneighbors(vec)
        return distances[0][0], indices[0][0]

    def _worker(self, user_input: str):
        """ Prediction worker method - internal only

        Returns a (distance, index) pair for the nearest training document.
        """
        return self._nearest(self._vectorize(user_input))

    def __call__(self, user_input: str) -> str:
        """ Callable object for making predictions

        Returns the label of the nearest training document, or NO_MATCH when
        the input contains no term from the fitted vocabulary.
        """
        vec = self._vectorize(user_input)
        # An all-zero vector means nothing in the input matched the
        # vocabulary. Checking this directly avoids relying on exact float
        # equality between the query distance and the baseline distance.
        if not vec.any():
            return self.NO_MATCH
        _, idx = self._nearest(vec)
        return self.name_index[int(idx)]

In [4]:
# Baseline matcher, fitted on the original phrase dictionary.
textmatcher = TextMatcher(train_data=ranked_reports)

In [7]:
# Second matcher, fitted on the extended phrase dictionary.
textmatcher1 = TextMatcher(train_data=ranked_reports1)

In [12]:
# Sample incident description used to compare the two matchers.
# The value starts and ends with a newline.
text = (
    "\n"
    "During a protest at which both pro-police and Black Lives Matter protesters "
    "were present near West 7th and Lincoln, members of the two protests appear "
    "to exchange words in an intersection. Police charge a protester, a 14 year "
    "old girl according to the poster, tackle her to the ground, kneel on her "
    "and arrest her."
    "\n"
)

In [13]:
# Prediction of the baseline matcher for the sample description.
textmatcher(user_input=text)

'Rank 1 - Police Presence'

In [14]:
# Prediction of the extended matcher for the same description.
textmatcher1(user_input=text)

'Rank 2 - Empty-hand'

In [17]:
# Load the incident reports; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../static/reddit_data.csv")

In [18]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,dates,added_on,links,case_id,city,state,lat,long,title,description,tags,force_rank
0,0,1,2020-05-30 00:00:00.000000,2021-02-26 20:45:37.445121,['https://www.youtube.com/watch?v=dPTr54DdTdY'...,or-eugene-2,Eugene,Oregon,44.049913,-123.097391,Peaceful protester tear gassed,Footage shows a small cluster of protesters ch...,"['less-lethal', 'protester', 'tear-gas']",Rank 4 - Chemical & Electric
1,1,2,2020-05-30 00:00:00.000000,2021-02-26 20:45:37.446934,['https://twitter.com/IwriteOK/status/12669077...,or-portland-1,Portland,Oregon,45.515586,-122.676994,Police violently break up peaceful protest,The Police try to break up the peaceful protes...,"['baton', 'beat', 'protester', 'strike']",Rank 1 - Police Presence
2,2,3,2020-05-30 00:00:00.000000,2021-02-26 20:45:37.447560,['https://www.youtube.com/watch?v=01oWE24O9Zw&...,or-portland-2,Portland,Oregon,45.515468,-122.676571,Officer pepper-sprays protester,An officer shoves and pepper-sprays a proteste...,"['less-lethal', 'pepper-spray', 'protester', '...",Rank 4 - Chemical & Electric
3,3,4,2020-05-31 00:00:00.000000,2021-02-26 20:45:37.448138,['https://www.youtube.com/watch?v=R7fqWI41vQY'...,or-eugene-3,Eugene,Oregon,44.044728,-123.079238,Reporter shot with tear gas canister,Footage shows police giving curfew dispersal o...,"['journalist', 'less-lethal', 'rubber-bullet',...",Rank 4 - Chemical & Electric
4,4,5,2020-06-01 00:00:00.000000,2021-02-26 20:45:37.448824,['https://www.reddit.com/r/Eugene/comments/gur...,or-eugene-1,Eugene,Oregon,0.0,0.0,Officer shoots projectile from moving vehicle,An officer shoots a projectile out of a moving...,"['less-lethal', 'projectile', 'protester', 'sh...",Rank 5 - Lethal Force


In [22]:
# Label every description with the baseline matcher's prediction.
# The matcher instance is itself callable, so no wrapper is needed.
df["force_rank1"] = df["description"].apply(textmatcher)

In [24]:
# Distribution of the labels predicted by the baseline matcher.
df["force_rank1"].value_counts()

Rank 4 - Chemical & Electric    486
Rank 1 - Police Presence        245
Rank 2 - Empty-hand             233
Rank 3 - Blunt Force            200
Rank 5 - Lethal Force           113
Rank 0 - No Police Presence      19
Name: force_rank1, dtype: int64

In [25]:
# Distribution of the labels already stored in the data set, for comparison.
df["force_rank"].value_counts()

Rank 4 - Chemical & Electric    387
Rank 1 - Police Presence        359
Rank 2 - Empty-hand             205
Rank 5 - Lethal Force           160
Rank 3 - Blunt Force            106
Rank 0 - No Police Presence      79
Name: force_rank, dtype: int64

In [36]:
# Descriptions that the baseline matcher labelled as Rank 1.
rank1_mask = df["force_rank1"] == "Rank 1 - Police Presence"
df.loc[rank1_mask, "description"]

9       During a protest at which both pro-police and Black Lives Matter protesters were present near West 7th and Lincoln, members of the two protests appear to exchange words in an intersection. Police charge a protester, a 14 year old girl according to the poster, tackle her to the ground, kneel on her and arrest her.                                                                                                                                                                                                                   
21      Body cam footage from a police officer shows police approaching person on a bike, believed to be a protester leaving the protest. The two officers pull the protester from the bike to make an arrest, throwing the bike aside. An officer then tells passersby at a convenience store to go home.                                                                                                                                                                        

In [34]:
# Show full cell contents instead of truncating long text columns.
# None means "no limit"; the former -1 sentinel has been deprecated since
# pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)

  pd.set_option('display.max_colwidth', -1)
