In [None]:
#This notebook is currently used to explore Reddit and 846PoliceBrutality data

In [None]:
#install praw, the python reddit API wrapper, 
# documentation located here: https://praw.readthedocs.io/en/latest/

!pip install praw

In [None]:
# required imports

import json
import os
import re

import pandas as pd
import praw
import requests
import urllib3
from bs4 import BeautifulSoup

In [None]:
# Reddit API credentials are read from environment variables (for example,
# exported from a vars.env file before the kernel starts) so that no secret is
# stored in the notebook. A missing variable raises KeyError right here.

reddit = praw.Reddit(client_id = os.environ["REDDIT_CLIENT_ID"],
                     client_secret = os.environ["REDDIT_SECRET"],
                     user_agent = os.environ["REDDIT_USER_AGENT"],
                     username = os.environ["REDDIT_USERNAME"]
                     )

In [None]:
# Get the 1,000 newest posts from 2020PoliceBrutality
# removed subreddit column, currently only using one

# Handle on the subreddit; `.new()` (rather than `.hot()`) lists the newest
# submissions first.
df_2020PB = reddit.subreddit('2020PoliceBrutality')

# One row per submission, in the same order as the DataFrame columns below.
submission_rows = [
    [submission.id, submission.title, submission.url, submission.selftext, submission.created_utc]
    for submission in df_2020PB.new(limit=1000)
]
posts = pd.DataFrame(submission_rows, columns=['id', 'title', 'url', 'body', 'created'])

In [None]:
posts.head()

In [None]:
# Shape of the fetched posts: 973 rows x 5 columns in the recorded run
# (the row count changes as the subreddit changes)

posts.shape

(973, 5)

In [None]:
# current example of post body. Most of the reddit posts have a title, a link 
# to the video or social media post, and no body. However, some of the posts have 
# the body included

posts['body'][33]

'It really looks like most of the cops in videos on this sub are either abusing steroids or stimulants. \n\nI have no idea whether they’re being drug tested at present, but no reasonable person would object to randomly testing them.'

## How does 2020PB compare with 846PoliceBrutality API?

2020PB does not contain as much text in the body of each post as we had hoped for training an NLP model. 
846 has more detailed location data. 

In [None]:
# Fetch every incident, including its evidence, from the 846 Police Brutality API.
url="https://api.846policebrutality.com/api/incidents?include=evidence"
http = urllib3.PoolManager()
response = http.request('GET', url)

# Fail loudly on an error response instead of trying to parse an error page.
if response.status != 200:
    raise RuntimeError(f"846 API request failed with HTTP status {response.status}")

# The endpoint returns JSON, so decode the raw body directly. Routing it through
# an HTML parser first can alter the payload (tag-like text stripped, entities
# decoded) before json.loads ever sees it.
json_846 = json.loads(response.data.decode("utf-8"))

In [None]:
# Retrieve the incident records stored under the 'data' key of the API response
incidents = json_846['data']

# Look at the first incident to inspect the structure of a record
incidents[0]

In [None]:
# Create dataframe from the 846 API incident data
df_846 = pd.DataFrame(incidents)

# Change data type for 'date' column to datetime type.
# `infer_datetime_format` is intentionally not passed: it is deprecated in
# pandas 2.x, where consistent format inference is already the default.
df_846['date'] = pd.to_datetime(df_846['date'])

# Drop empty columns
df_846 = df_846.drop(columns=['data','description'])

# Check the top 5 rows of the new dataframe
df_846.head()

Unnamed: 0,id,pb_id,state,city,date,title,links,tags,geocoding,evidence
0,bf46b270-1fb8-11eb-b018-5fba53929c9e,wa-seattle-75,Washington,Seattle,2020-11-04 08:00:00,Police arrest bystander in vehicle,[https://twitter.com/onelung_/status/132420651...,"[arrest, bike, bystander, property-destruction]","{'lat': '47.6062095', 'long': '-122.3320708'}",[{'id': 'bf49ace0-1fb8-11eb-8c23-65cd1770af9d'...
1,f4ae69c0-1f9b-11eb-9041-4f6fe2453397,ca-losangeles-63,California,Los Angeles,2020-11-04 08:00:00,Police arrest journalists,[https://twitter.com/desertborder/status/13241...,"[arrest, journalist]","{'lat': '34.0522342', 'long': '-118.2436849'}",[{'id': 'f4b0d780-1f9b-11eb-bedc-4de8e62a0d92'...
2,bf4ee070-1fb8-11eb-bae4-95bdb3987ef2,wa-seattle-76,Washington,Seattle,2020-11-04 08:00:00,Police assault and hospitalize protester,[https://twitter.com/MarcusKulik/status/132425...,"[arrest, protester, punch, tackle]","{'lat': '47.6062095', 'long': '-122.3320708'}",[{'id': 'bf513560-1fb8-11eb-a946-b127f7240486'...
3,e42c6b70-1fb7-11eb-a63f-73653f7163f8,co-denver-29,Colorado,Denver,2020-11-04 08:00:00,Police charge and arrest protesters,[https://twitter.com/__ovas__/status/132421617...,"[arrest, beat, grab, protester, tackle]","{'lat': '39.7392358', 'long': '-104.9902510'}",[{'id': 'e42fbce0-1fb7-11eb-84a7-f9e93658d7b4'...
4,e434ef50-1fb7-11eb-a390-f9bb396d9aae,co-denver-30,Colorado,Denver,2020-11-04 08:00:00,Officer shoots protester in head,[https://twitter.com/__ovas__/status/132421321...,"[less-lethal, pepper-ball, projectile, protest...","{'lat': '39.7392358', 'long': '-104.9902510'}",[{'id': 'e4372b50-1fb7-11eb-8d7f-07d5187b7001'...


In [None]:
df_846.shape

(1259, 10)

Experiment with Model to match current News headlines to 846 tags and headlines

In [None]:
# Download spaCy's small English pipeline (the model name passed to spacy.load in this notebook)
!python -m spacy download en_core_web_sm

In [None]:
import spacy
from spacy.tokenizer import Tokenizer

In [None]:
# Load the small English spaCy pipeline; the tokenize function in this notebook relies on it
nlp = spacy.load("en_core_web_sm")

# Instantiating tokenizer
# NOTE(review): `tokenizer` does not appear to be referenced anywhere else in
# this notebook — confirm whether it is still needed.
tokenizer = Tokenizer(nlp.vocab)

In [None]:
def tokenize(doc):
    """Return the lemmas of `doc`, skipping stop words, punctuation and pronouns.

    The text is processed with the module-level spaCy pipeline `nlp`; the lemma
    of every token that passes the filter is collected in document order.
    """
    return [
        tok.lemma_
        for tok in nlp(doc)
        if not tok.is_stop and not tok.is_punct and tok.pos_ != 'PRON'
    ]

In [None]:
#need to concatenate tags and title, converting tag to list first

def listToString(s):
    """Join the strings in `s` into a single space-separated string.

    Every element is included, and an empty iterable yields "". A space is
    placed between neighbouring elements so that each one stays a separate
    word when the result is tokenized later.

    The previous implementation returned from inside its loop, so only the
    first element was ever kept (and None was returned for an empty input).
    """
    return " ".join(s)

In [None]:
# Collapse each row's list of tags into one string. Values that are already
# strings are left untouched, so re-running this cell does not process the
# characters of an already-converted value.
df_846['tags'] = df_846['tags'].apply(
    lambda tags: tags if isinstance(tags, str) else listToString(tags)
)

In [None]:
# Combine each incident's title and tag string into one text field, separated by a space
df_846['concat'] = df_846['title'] + " " + df_846['tags']

In [None]:
# Apply tokenize to the combined text, storing each row's list of lemmas
df_846['tokens'] = df_846['concat'].apply(tokenize)

In [None]:
df_846['tokens'].head(10)

0         [Police, arrest, bystander, vehicle, arrest]
1                 [Police, arrest, journalist, arrest]
2    [Police, assault, hospitalize, protester, arrest]
3          [Police, charge, arrest, protester, arrest]
4            [officer, shoot, protester, head, lethal]
5    [police, kettle, assault, arrest, protester, a...
6         [Police, assault, arrest, protester, arrest]
7      [officer, charge, protester, press, journalist]
8    [officer, shove, strike, arrest, protester, ar...
9                   [officer, violent, arrest, arrest]
Name: tokens, dtype: object

In [None]:
# Vector Representation
# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams; terms appearing in more than 97% of the documents or in
# fewer than 3 documents are dropped. Tokens come from the `tokenize` function.
tdf = TfidfVectorizer(ngram_range=(1,2),
                      max_df=.97,
                      min_df=3,
                      tokenizer=tokenize)

# Fit and Transforming text
tfidf_matrix = tdf.fit_transform(df_846['concat'])

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever the installed version provides.
if hasattr(tdf, "get_feature_names_out"):
    feature_names = tdf.get_feature_names_out()
else:
    feature_names = tdf.get_feature_names()

# Making a dataframe of the TF-IDF weights, one column per feature.
# toarray() yields a plain ndarray rather than the np.matrix todense() returns.
vect = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

vect.head()

Unnamed: 0,abuse,abuse power,active,activist,agent,agent deploy,agent fire,agent shoot,aid,air,air journalist,alleged,allegedly,angeles,angeles law,apartment,apparent,apparent cause,apparent justification,apparent reason,area,arm,arrest,arrest abuse,arrest arrest,arrest bystander,arrest federal,arrest journalist,arrest man,arrest peaceful,arrest pepper,arrest police,arrest protest,arrest protester,arrest reporter,arrest woman,arrestee,assault,assault arrest,assault protester,...,unarmed man,unarmed protester,unlawful,unmarked,use,use bike,use excessive,use flashbang,use lrad,use pepper,use tear,van,vandalism,vehicle,veteran,violation,violation arrest,violence,violent,violent arrest,violently,violently arrest,violently push,walk,walk away,walk home,warn,water,water bottle,way,way crowd,wheelchair,white,window,woman,woman arrest,woman ground,woman shoot,wound,yell
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.385606,0.0,0.0,0.588585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413466,0.0,0.0,0.0,0.0,0.545472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.459151,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.373557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35042,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Using scikit-learn's NearestNeighbors on the TF-IDF feature dataframe
from sklearn.neighbors import NearestNeighbors

# Configure the search to return 10 neighbors per query using the 'kd_tree' algorithm
nn = NearestNeighbors(n_neighbors=10, algorithm='kd_tree')
nn.fit(vect)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [None]:
# Testing with fake headline
fk_headline = "Police use tear gas during protest"

# Vectorize with the already-fitted TF-IDF model. toarray() returns a plain 2-D
# ndarray; the np.matrix produced by todense() is rejected by recent
# scikit-learn releases.
fk_headline_vector = tdf.transform([fk_headline]).toarray()

In [None]:
# Query the fitted model with the fake headline vector. The result is a pair of
# arrays; the second one is used in this notebook as row positions into df_846
# (the first presumably holds the matching distances — confirm).
neighbors = nn.kneighbors(fk_headline_vector)
neighbors

(array([[0.66561049, 0.77912434, 0.8115066 , 0.85690006, 0.89203524,
         0.92105122, 0.95439244, 0.99659684, 1.00241477, 1.01642204]]),
 array([[1119,  908, 1076,  996, 1219,  311,   20,  790, 1182,  194]]))

In [None]:
# Print the 846 incident titles found at the matched row positions
for row_positions in neighbors[1]:
    print(df_846['title'].iloc[row_positions])

1119                    Police use tear gas on protesters
908     Police use tear gas on protesters peacefully b...
1076                    Police use tear gas to clear road
996     Police use tear gas and flashbangs against pea...
1219    Police use tear gas and riot rounds against pr...
311     Police use tear gas and impact munitions again...
20      Police use tear gas and stun grenades against ...
790     Iowa State Police use tear gas & flashbangs ag...
1182    Police use tear gas & rubber bullets on protes...
194     Police and National Guard use tear gas and LRA...
Name: title, dtype: object


## NewsAPI

Considering using a news API to bring in additional sources of data

In [None]:
#articles with the keyword police from NewsAPI
#everything is limited to the past 24 months

# The API key is read from the NEWS_API_KEY environment variable so that the
# secret never appears in the notebook; a missing variable raises KeyError.
news_api_key = os.environ["NEWS_API_KEY"]

# NOTE(review): `q` is supplied four times — confirm which value the API
# actually honours, or combine the terms into a single `q` expression.
url = ('https://newsapi.org/v2/everything?'
       'q=police&'
       'q=officer&'
       'q=us&'
       'q=america&'
       #location can be use with top headlines, but not 'everything'
       #'country=us&'
       f'apiKey={news_api_key}')

http = urllib3.PoolManager()
response = http.request('GET', url)

# Kept because the next cell reads the response text from `soup2`.
soup2 = BeautifulSoup(response.data, "html.parser")

In [None]:
# Decode the JSON body straight from the raw bytes of the NewsAPI response.
# Passing it through an HTML parser first strips anything tag-like and decodes
# HTML entities, which can alter article text or even break the JSON.
newsAPI = json.loads(response.data.decode("utf-8"))
newsAPI

In [None]:
# The article records are stored under the 'articles' key of the response
reports = newsAPI['articles']
# Inspect a single article (index 7) as an example record
reports[7]

{'author': 'Ed Shanahan',
 'content': 'It was their decision to not file any charges, she said.\r\nThe Buffalo Police Department did not respond to a request for comment. Neither did a spokesman for Mr. Brown, who, according to local media … [+1854 chars]',
 'description': 'The white New York State judge invoked his friendship with the mayor and ties to the police, raising questions about whether his status and race were factors.',
 'publishedAt': '2020-10-16T00:52:17Z',
 'source': {'id': None, 'name': 'New York Times'},
 'title': 'N.Y. Judge Mark Grisanti Shoves Buffalo Police Officer',
 'url': 'https://www.nytimes.com/2020/10/15/nyregion/judge-grisanti-buffalo-police.html',
 'urlToImage': 'https://static01.nyt.com/images/2020/10/15/us/politics/judge-promo/Screen-Shot-2020-10-15-at-3-facebookJumbo-v2.png'}

In [None]:
# Build a dataframe from the list of article records
df_newsAPI = pd.DataFrame(reports)

In [None]:
# Convert the 'publishedAt' timestamp strings (e.g. '2020-10-16T00:52:17Z') to datetimes
df_newsAPI['publishedAt'] = pd.to_datetime(df_newsAPI['publishedAt'])
df_newsAPI.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': None, 'name': 'Lifehacker.com'}",Elizabeth Yuko,What's Happening in Nigeria?,After a summer of ongoing protests over police...,https://lifehacker.com/whats-happening-in-nige...,https://i.kinja-img.com/gawker-media/image/upl...,2020-10-26 17:00:00+00:00,After a summer of ongoing protests over police...
1,"{'id': None, 'name': 'New York Times'}",Johnny Diaz,Two Police Officers Are Shot in Houston,Chief Art Acevedo of the Houston police said a...,https://www.nytimes.com/2020/10/20/us/houston-...,https://static01.nyt.com/images/2020/10/20/mul...,2020-10-20 16:07:59+00:00,Two police officers were struck by gunfire at ...
2,"{'id': 'bbc-news', 'name': 'BBC News'}",https://www.facebook.com/bbcnews,"Several injured in Vienna shooting, police say",One attacker is dead and another on the run as...,https://www.bbc.co.uk/news/world-europe-54786952,https://ichef.bbci.co.uk/news/1024/branded_new...,2020-11-02 20:47:09+00:00,image captionA large-scale police operation is...
3,"{'id': None, 'name': 'New York Times'}",Michael Levenson,Waukegan Police Killing of Black Man Sets Off ...,"The police in Waukegan, Ill., said Marcellis S...",https://www.nytimes.com/2020/10/22/us/waukegan...,https://static01.nyt.com/images/2020/10/22/mul...,2020-10-23 00:19:02+00:00,The shooting came five months after George Flo...
4,"{'id': 'bbc-news', 'name': 'BBC News'}",,Covid: Protesters clash with police in Italy,"Gyms, swimming pools, cinemas and theatres are...",https://www.bbc.co.uk/news/av/world-europe-547...,https://ichef.bbci.co.uk/images/ic/400xn/p08wk...,2020-10-27 14:44:02+00:00,Protests broke out across Italy on Monday over...


In [None]:
df_newsAPI.shape

(20, 9)

In [None]:
df_newsAPI['description'][7]

'The white New York State judge invoked his friendship with the mayor and ties to the police, raising questions about whether his status and race were factors.'

In [None]:
df_newsAPI['content'][7]

'It was their decision to not file any charges, she said.\r\nThe Buffalo Police Department did not respond to a request for comment. Neither did a spokesman for Mr. Brown, who, according to local media … [+1854 chars]'

In [None]:
df_newsAPI['title'][7]

'N.Y. Judge Mark Grisanti Shoves Buffalo Police Officer'

In [None]:
# Build one text field per article from title, description and content.
# Missing values are replaced with "" first; otherwise a single missing field
# would turn the whole concatenated value into NaN.
df_newsAPI['concat'] = (
    df_newsAPI['title'].fillna("") + " "
    + df_newsAPI['description'].fillna("") + " "
    + df_newsAPI['content'].fillna("")
)

In [None]:
df_newsAPI['concat'][7]

'N.Y. Judge Mark Grisanti Shoves Buffalo Police Officer The white New York State judge invoked his friendship with the mayor and ties to the police, raising questions about whether his status and race were factors. It was their decision to not file any charges, she said.\r\nThe Buffalo Police Department did not respond to a request for comment. Neither did a spokesman for Mr. Brown, who, according to local media … [+1854 chars]'

## Can we compare new headlines to the 846 headlines?

In [None]:
# Title, description and content text from a single NewsAPI article

newsAPI_headline = df_newsAPI['concat'][7]

# Vectorize with the TF-IDF model fitted on the 846 data. toarray() returns a
# plain 2-D ndarray; the np.matrix produced by todense() is rejected by recent
# scikit-learn releases.
df_newsAPI_vector = tdf.transform([newsAPI_headline]).toarray()

In [None]:
# Query the fitted model with the vector of the single news article. The result
# is a pair of arrays; the second one is used in this notebook as row positions
# into df_846 (the first presumably holds the matching distances — confirm).

neighbors2 = nn.kneighbors(df_newsAPI_vector)
neighbors2

(array([[1.1269077 , 1.14104592, 1.17851569, 1.1823841 , 1.19284981,
         1.20108041, 1.20108041, 1.20624614, 1.22555252, 1.23280131]]),
 array([[ 891,  308,   91,  772,  624,  144, 1247, 1146,  919,  701]]))

In [None]:
# Print the 846 incident titles that sit at the matched row positions

for row_positions in neighbors2[1]:
    print(df_846['title'].iloc[row_positions])

891              Officer incites violence on social media
308                    Police pepper spray media personel
91            Police arrest Kentucky State Representative
772               White-shirted officers shove protesters
624     Woman struck by white supremacist while police...
144                       Police officer shoves protester
1247                     Police officer shoves protesters
1146                      Police officer shoves protester
919               Police violence outside the White House
701     Police officer stops abrubtly so protester bum...
Name: title, dtype: object
