In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import spacy
import en_core_web_sm
import pandas as pd

In [4]:
# Phrases that appear at the start of every rank's list.
_POLICE_TERMS = (
    "policeman, policewoman, law enforcement",
    "police officer, cop, five-o, fuzz, DHS",
)

# Phrases that are specific to each rank (appended after the shared terms).
_RANK_SPECIFIC_TERMS = {
    "Rank 1 - Police Presence": (),
    "Rank 2 - Empty-hand": (
        "pushed and shoved with shields",
        "grabs, holds and joint locks",
        "punch and kick",
    ),
    "Rank 3 - Blunt Force": (
        "rubber bullets",
        "riot rounds",
        "batons",
    ),
    "Rank 4 - Chemical & Electric": (
        "tear gas",
        "pepper spray",
        "flashbangs, stun grenade",
        "chemical sprays",
        "Conducted energy devices, CED or tazor",
    ),
    "Rank 5 - Lethal Force": (
        "shoot and kill",
        "open fire",
        "deadly force",
        "fatal",
        "dies",
    ),
}

# Rank label -> list of example phrases (shared police terms first, then the
# rank-specific ones). Insertion order follows the rank number.
ranked_reports = {
    rank: [*_POLICE_TERMS, *extra_terms]
    for rank, extra_terms in _RANK_SPECIFIC_TERMS.items()
}

In [14]:
class TextMatcher:
    """Nearest-neighbour text matcher over TF-IDF vectors.

    Each key of the training dict is a label; the phrases listed under it are
    joined into a single document, vectorised with TF-IDF and indexed with a
    1-nearest-neighbour search. Calling the instance with a string returns
    the label of the closest document, or a fixed fallback label when the
    string shares no vocabulary with the training data.
    """

    class Tokenizer:
        """Callable spaCy tokenizer: lemmas of tokens that are neither stop words nor punctuation."""
        # Loaded once, when this class body executes, and shared by all instances.
        nlp = en_core_web_sm.load()

        def __call__(self, text: str) -> list:
            return [
                token.lemma_ for token in self.nlp(text)
                if not token.is_stop and not token.is_punct
            ]

    def __init__(self, train_data: dict, ngram_range=(1, 3), max_features=8000):
        """Fit the vectoriser and the neighbour index on ``train_data``.

        Args:
            train_data: mapping of label -> iterable of example phrases.
            ngram_range: n-gram range forwarded to ``TfidfVectorizer``.
            max_features: vocabulary cap forwarded to ``TfidfVectorizer``.
        """
        # One document per label: all of its phrases joined by spaces.
        self.lookup = {k: ' '.join(v) for k, v in train_data.items()}
        # Row i of the fitted matrix corresponds to name_index[i].
        self.name_index = list(self.lookup.keys())
        self.tfidf = TfidfVectorizer(
            ngram_range=ngram_range,
            tokenizer=self.Tokenizer(),
            max_features=max_features,
        )
        # .toarray() gives a plain ndarray; .todense() gives np.matrix, which
        # recent scikit-learn releases refuse in input validation.
        train_vectors = self.tfidf.fit_transform(self.lookup.values()).toarray()
        self.knn = NearestNeighbors(
            n_neighbors=1,
            n_jobs=-1,
        ).fit(train_vectors)
        # Nearest-neighbour distance of the empty string. No longer used for
        # the no-match decision, but kept so existing readers of the
        # attribute still find it.
        self.baseline, _ = self._worker('')

    def _vectorize(self, user_input: str):
        """Transform one string into a 1-row sparse TF-IDF matrix."""
        return self.tfidf.transform([user_input])

    def _nearest(self, vec):
        """Return ``(distance, row_index)`` of the single nearest training document."""
        dist, idx = self.knn.kneighbors(vec.toarray())
        return dist[0][0], idx[0][0]

    def _worker(self, user_input: str):
        """Prediction worker method - internal only.

        Returns:
            ``(distance, row_index)`` for the nearest training document.
        """
        return self._nearest(self._vectorize(user_input))

    def __call__(self, user_input: str) -> str:
        """Return the label of the training document closest to ``user_input``.

        Returns ``'Rank 0 - No Police Presence'`` when the input produces no
        non-zero TF-IDF feature, i.e. none of its tokens/n-grams are in the
        fitted vocabulary.
        """
        vec = self._vectorize(user_input)
        # Test for "nothing matched" directly instead of comparing a float
        # distance with the baseline: a genuine match can land at exactly the
        # baseline distance and would otherwise be reported as Rank 0.
        if vec.nnz == 0:
            return 'Rank 0 - No Police Presence'
        _, idx = self._nearest(vec)
        return self.name_index[int(idx)]

In [15]:
# Fit the matcher on the rank -> phrases mapping; training happens inside __init__.
textmatcher = TextMatcher(ranked_reports)

In [18]:
# Load the tweets from a CSV file; the path is relative to the working directory.
df = pd.read_csv("combined_tweets.csv")

In [20]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ids,text,reddit
0,1266136557871869952,Police in NYC made several arrests during a pr...,1
1,1266159669262893057,Calls for justice for George Floyd. Protesters...,1
2,1266555286678048770,NYPD just casually slamming a dude with a car ...,1
3,1266540710188195843,Update: Got her permission with a fuck yeah. T...,1
4,1266529475757510656,NYPD officer just called a female protester a ...,1


In [28]:
# Pick the tweet stored under row label 4 and show the predicted rank next to it.
text = df.loc[4, 'text']

print(f"{textmatcher(text)} {text}")

Rank 1 - Police Presence NYPD officer just called a female protester a “stupid fucking bitch” and threw her to the ground
