## Install modules

In [1]:
!pip install praw
!pip3 install newspaper3k

Collecting praw
[?25l  Downloading https://files.pythonhosted.org/packages/2c/15/4bcc44271afce0316c73cd2ed35f951f1363a07d4d5d5440ae5eb2baad78/praw-7.1.0-py3-none-any.whl (152kB)
[K     |██▏                             | 10kB 18.0MB/s eta 0:00:01[K     |████▎                           | 20kB 1.7MB/s eta 0:00:01[K     |██████▌                         | 30kB 2.3MB/s eta 0:00:01[K     |████████▋                       | 40kB 2.6MB/s eta 0:00:01[K     |██████████▊                     | 51kB 2.0MB/s eta 0:00:01[K     |█████████████                   | 61kB 2.3MB/s eta 0:00:01[K     |███████████████                 | 71kB 2.5MB/s eta 0:00:01[K     |█████████████████▎              | 81kB 2.7MB/s eta 0:00:01[K     |███████████████████▍            | 92kB 2.9MB/s eta 0:00:01[K     |█████████████████████▌          | 102kB 2.8MB/s eta 0:00:01[K     |███████████████████████▊        | 112kB 2.8MB/s eta 0:00:01[K     |█████████████████████████▉      | 122kB 2.8MB/s eta 0:00:01

## Import libraries and grab Reddit data

In [55]:
import praw
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from newspaper import Article
import nltk
nltk.download('punkt')
# Reddit credentials: the password is stored in .env
# PRAW setup goes here -- it must define the `reddit` client used by the cells below

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
# Pull the 100 hottest posts from r/news as of right now; filtering for police
# use of force happens later. `reddit` must already be defined (see the
# "PRAW setup goes here" placeholder in the import cell).
# other possible subreddits: publicfreakout, allcopnodonut
hot_posts = reddit.subreddit("news").hot(limit=100)
data = [[post.id, post.title, post.url] for post in hot_posts]

# We'll need a way to get coordinates for a given post, before we include that in df
col_names = ['id', 'title', 'url']
df = pd.DataFrame(data, columns=col_names)

df.head()

Unnamed: 0,id,title,url
0,ilsnn1,Rochester police officer tells activist she's ...,https://www.wxxinews.org/post/rochester-police...
1,iltx3x,Drone drops hundreds of bags of cannabis in Te...,https://www.jpost.com/israel-news/drone-drops-...
2,ill2g8,U.S. court: Mass surveillance program exposed ...,https://www.reuters.com/article/us-usa-nsa-spy...
3,ilqqqf,US gives first-ever OK for small commercial nu...,https://apnews.com/910766c07afd96fbe2bd875e160...
4,ilg8xj,COVID-19 has killed more police officers this ...,https://www.seattletimes.com/nation-world/covi...


## Deprecated tag based filtering system

In [63]:
# Build a corpus of keywords for snagging specific reddit entries, starting from
# the location records published by the 2020PB/police-brutality repository.
locations_url = 'https://raw.githubusercontent.com/2020PB/police-brutality/data_build/all-locations-v2.json'
raw_locs = pd.read_json(locations_url)
# flatten the records held under 'data' into columns, then drop the two fields
# that are not used in this notebook
all_locs = pd.json_normalize(raw_locs['data']).drop(columns=['edit_at', 'id'])
def cleanlinks(json):
    """Return the 'url' value of every link record in ``json``, in order.

    ``json`` is an iterable of mappings that each have a 'url' key; an empty
    iterable yields an empty list.
    """
    return [entry['url'] for entry in json]
# Reduce each record's link dicts to plain URLs and parse the date strings.
all_locs['links'] = all_locs['links'].apply(cleanlinks)
all_locs['date'] = pd.to_datetime(all_locs['date'],format='%Y-%m-%d')
all_tags = all_locs['tags'].copy()

# Collect every distinct tag used by any record. set.update() already ignores
# duplicates, so no membership test is needed before adding.
tags = set()
for taglist in all_tags:
  tags.update(taglist)
print(tags)

# manually remove needless tags
needless_tags = {
  '', 'medic', 'bike', 'non-protest', 'pregnant', 'lgbtq+',
  'racial-profiling', 'legal-observer', 'tear-gas-canister', 'politician',
  'incitement', 'homeless', 'shoot', 'strike', 'elderly', 'vehicle',
  'inhumane-treatment', 'journalist', 'throw', 'explosive', 'threaten',
  'horse', 'shove', 'child', 'shield', 'dog', 'knee', 'protester', 'gun',
  'conceal', 'bystander', 'grab', 'push', 'zip-tie', 'spray', 'drive',
  'person-with-disability', 'celebrity', 'projectile', 'beat',
}
# in-place difference: same effect as calling tags.discard() once per entry
tags -= needless_tags

# Replace the hyphens in the remaining tags with spaces
# (e.g. 'knee-on-neck' -> 'knee on neck').
newtags = {tag.replace('-', ' ') for tag in tags}
tags = newtags
print(tags)

{'', 'knee-on-neck', 'baton', 'tear-gas-canister', 'property-destruction', 'throw', 'beat', 'journalist', 'lgbtq+', 'shield', 'foam-bullet', 'bean-bag', 'tase', 'death', 'lrad', 'elderly', 'knee', 'zip-tie', 'wooden-bullet', 'tear-gas', 'politician', 'kick', 'bystander', 'shove', 'incitement', 'non-protest', 'spray', 'tackle', 'conceal', 'pregnant', 'grab', 'horse', 'punch', 'protester', 'inhumane-treatment', 'push', 'celebrity', 'abuse-of-power', 'pepper-spray', 'legal-observer', 'body-cam', 'rubber-bullet', 'live-round', 'child', 'marking-round', 'drive', 'vehicle', 'homeless', 'sexual-assault', 'person-with-disability', 'hide-badge', 'taser', 'headlock', 'gas', 'gun', 'choke', 'strike', 'bike', 'threaten', 'arrest', 'medic', 'pepper-ball', 'explosive', 'dog', 'shoot', 'projectile', 'racial-profiling', 'mace', 'stun-grenade'}
{'sexual assault', 'baton', 'knee on neck', 'foam bullet', 'death', 'stun grenade', 'tase', 'lrad', 'kick', 'tackle', 'abuse of power', 'punch', 'body cam', 'be

In [None]:
# get the url of the reddit post
for url in df['url']:
  # get the HTML from the url; skip pages that cannot be fetched. Catching
  # requests' own base exception (rather than a bare except) keeps Ctrl-C /
  # kernel interrupts working during this slow loop.
  try:
    r = requests.get(url, timeout=10)
  except requests.RequestException:
    continue
  # Name the parser explicitly: lxml is the one BeautifulSoup auto-selects when
  # it is available (it is installed alongside newspaper3k, which this notebook
  # already imports), so this keeps the same parser and silences the
  # "no parser was explicitly specified" warning.
  soup = BeautifulSoup(r.text, 'lxml')
  # Normalise every <meta> element to lowercase words separated by single
  # spaces, padded with a space on each side so whole words and whole phrases
  # can be located with a plain substring test.
  meta_texts = []
  for meta in soup.find_all('meta'):
    words = re.sub(r'[\W_]+', ' ', str(meta).lower()).split()
    meta_texts.append(' ' + ' '.join(words) + ' ')
  # Match tags as phrases. Many entries in `tags` contain spaces (e.g.
  # 'pepper spray'), so comparing them against single tokens could never match.
  # Single-word tags still match exactly when that word appears in a <meta>.
  tags_final = {
    tag for tag in tags
    if any((' ' + tag + ' ') in text for text in meta_texts)
  }
  if tags_final:
    # some matches found, print title and matched tags
    # (fall back to the URL when the page has no <title> element)
    title = soup.title.text.strip() if soup.title is not None else url
    print(title, tags_final)

Deputy involved in dog bite arrest ordered strip searches on women {'arrest'}
Gunfire in a Tallahassee parking lot leads to arrest of armed couple, complaints of racism {'arrest'}


## Old `requests` based text grabbing function

In [None]:
# Test grabbing the contents of the articles themselves to improve spaCy NLP
# get the url of the reddit post
content_list = []
df_snip = df['url']
for id_url in df_snip:
  # get the HTML from the url
  try:
    r = requests.get(id_url, timeout=10)
  except requests.RequestException:
    # Record an empty string so content_list keeps one entry per row of df.
    # Only request failures are caught, so a kernel interrupt still stops the loop.
    content_list.append('')
    continue
  # Name the parser explicitly: lxml is the one BeautifulSoup auto-selects when
  # it is available (it is installed alongside newspaper3k, which this notebook
  # already imports), so this keeps the same parser and silences the
  # "no parser was explicitly specified" warning.
  soup = BeautifulSoup(r.text, 'lxml')
  # get text from website: join all paragraph text, then squash every run of
  # characters other than letters, digits, '.', ',' and "'" into one space
  output_text = " ".join([x.text for x in soup.find_all('p')])
  output_text = re.sub("[^a-zA-Z0-9.,']+", ' ', output_text).strip()
  content_list.append(output_text)

print(df.shape, len(content_list))
df['text'] = content_list
# Drop rows with no recovered text. .copy() makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[df['text'] != ''].copy()
df.head()

## New `newspaper3k` based text extraction system

In [57]:
# set up future columns (one entry per URL, in df row order)
content_list = []
date_list = []
df_snip = df['url']
# go through each URL and use newspaper3k to extract data
for id_url in df_snip:
  # use newspaper3k to extract text
  article = Article(id_url)
  try:
    # A failed download normally surfaces as an error from parse(); download()
    # sits inside the try as well so that any failure it raises directly is
    # handled the same way. `except Exception` (not a bare except) leaves
    # KeyboardInterrupt alone so this long loop can still be interrupted.
    article.download()
    article.parse()
  except Exception:
    # add null values to show no connection
    content_list.append(None)
    date_list.append(None)
    continue
  content_list.append(article.text)
  # this will be null if newspaper3k can't find it
  date_list.append(article.publish_date)
df['text'] = content_list
df['date'] = date_list

In [58]:
# show results: the frame's (rows, columns) shape, then a preview of its first
# rows, which now include the `text` and `date` columns
print(df.shape)
df.head()

(100, 5)


Unnamed: 0,id,title,url,text,date
0,ilsnn1,Rochester police officer tells activist she's ...,https://www.wxxinews.org/post/rochester-police...,News of the death of Daniel Prude after he was...,
1,iltx3x,Drone drops hundreds of bags of cannabis in Te...,https://www.jpost.com/israel-news/drone-drops-...,A drone drops hundreds of bags of cannabis in ...,
2,ill2g8,U.S. court: Mass surveillance program exposed ...,https://www.reuters.com/article/us-usa-nsa-spy...,FILE PHOTO: Edward Snowden gestures as he spea...,2020-09-03 05:21:24+00:00
3,ilqqqf,US gives first-ever OK for small commercial nu...,https://apnews.com/910766c07afd96fbe2bd875e160...,"BOISE, Idaho (AP) — U.S. officials have for th...",2020-09-02 21:53:55+00:00
4,ilg8xj,COVID-19 has killed more police officers this ...,https://www.seattletimes.com/nation-world/covi...,,


In [59]:
# show losses: number of missing values in each column
df.isna().sum()

id        0
title     0
url       0
text      5
date     26
dtype: int64

In [60]:
# Remove all entries with missing data
df = df.dropna()
print(df.shape)
# renumber the remaining rows from 0; drop=True discards the old index instead
# of keeping it as an extra 'index' column
df = df.reset_index(drop=True)
df.head()

(74, 5)


Unnamed: 0,id,title,url,text,date
0,ill2g8,U.S. court: Mass surveillance program exposed ...,https://www.reuters.com/article/us-usa-nsa-spy...,FILE PHOTO: Edward Snowden gestures as he spea...,2020-09-03 05:21:24+00:00
1,ilqqqf,US gives first-ever OK for small commercial nu...,https://apnews.com/910766c07afd96fbe2bd875e160...,"BOISE, Idaho (AP) — U.S. officials have for th...",2020-09-02 21:53:55+00:00
2,ilrt2x,Investigation into Chinese shipments leads to ...,https://www.masslive.com/news/2020/09/investig...,An investigation into items shipped from China...,2020-09-02 15:23:11.546000+00:00
3,iletck,Boston police officers allegedly committed ove...,https://www.boston.com/news/crime/2020/09/02/b...,Nine current and former Boston Police officers...,2020-09-02 00:00:00
4,ilsl4z,Zimbabwe: Chinese Invade Hwange National Park ...,https://allafrica.com/stories/202009030350.html,Two Chinese companies are reportedly exploring...,2020-09-03 09:25:53+00:00


## Gather location information

In [61]:
# Load the city/state lookup table from a CSV file in the working directory
locs_df = pd.read_csv('cities_states.csv')
def lowerify(text):
  """Return ``text`` converted to lowercase via its ``lower()`` method."""
  lowered = text.lower()
  return lowered
# Drop the two columns that are not needed, then lowercase both name columns
locs_df = locs_df.drop(columns=['Unnamed: 0','country'])
for name_col in ('city_ascii', 'admin_name'):
  locs_df[name_col] = locs_df[name_col].apply(lowerify)
locs_df.head()

Unnamed: 0,city_ascii,admin_name,lat,lng
0,new york,new york,40.6943,-73.9249
1,los angeles,california,34.1139,-118.4068
2,chicago,illinois,41.8373,-87.6862
3,miami,florida,25.7839,-80.2102
4,dallas,texas,32.7936,-96.7662


## Old `collections` based approach

In [44]:
from collections import Counter
import re


def normalize_name(name):
  """Lowercase `name` and collapse each run of non-word characters to one space.

  This is the same clean-up applied to the article text below, so a place name
  such as 'st. george' is looked up in the form it takes there ('st george').
  """
  return ' '.join(re.sub(r'[\W]+', ' ', name).lower().split())


# get list of states
states_list = list(locs_df.admin_name.unique())
# for each state, map their respective cities
states_map = {
  state: locs_df[locs_df['admin_name'] == state]['city_ascii'].to_list()
  for state in states_list
}

# get a list of tokens from the text
test_tokens = re.sub(r'[\W]+',' ',df['text'][3]).lower().split()

# Longest place name, measured in words, that may need to be looked up.
all_names = set(locs_df['admin_name']) | set(locs_df['city_ascii'])
max_words = max((len(normalize_name(name).split()) for name in all_names), default=1)

# Put every run of 1..max_words consecutive tokens into a Counter. Counting
# single tokens only would make multi-word names ('new york', 'salt lake
# city') impossible to find; single-word counts are the same as before.
c = Counter(
  ' '.join(test_tokens[start:start + size])
  for size in range(1, max_words + 1)
  for start in range(len(test_tokens) - size + 1)
)

# Check, for each state, which ones are mentioned at least once
state_counts = {}
for state in states_list:
  hits = c[normalize_name(state)]
  if hits > 0:
    state_counts[state] = hits

print(test_tokens[:20])
print(state_counts)

# Take the most-mentioned state and count mentions of its cities. When no state
# was found at all there is nothing to pick, so skip the lookup rather than
# letting max() raise ValueError on an empty dict.
city_counts = {}
if state_counts:
  city_max = max(state_counts, key=state_counts.get)
  for city in states_map[city_max]:
    hits = c[normalize_name(city)]
    if hits > 0:
      city_counts[city] = hits
  print(states_map[city_max])
else:
  city_max = None
print(city_counts)

['boise', 'idaho', 'ap', 'u', 's', 'officials', 'have', 'for', 'the', 'first', 'time', 'approved', 'a', 'design', 'for', 'a', 'small', 'commercial', 'nuclear', 'reactor']
{'utah': 5, 'idaho': 5}
['salt lake city', 'ogden', 'provo', 'west valley city', 'st. george', 'west jordan', 'logan', 'orem', 'taylorsville', 'kearns', 'midvale', 'white city', 'sandy', 'layton', 'south jordan', 'lehi', 'millcreek', 'murray', 'draper', 'bountiful', 'riverton', 'spanish fork', 'herriman', 'pleasant grove', 'roy', 'cedar city', 'tooele', 'cottonwood heights', 'springville', 'eagle mountain', 'kaysville', 'clearfield', 'holladay', 'saratoga springs', 'american fork', 'syracuse', 'magna', 'washington', 'south salt lake', 'farmington', 'heber', 'clinton', 'north salt lake', 'hurricane', 'payson', 'vernal', 'north ogden', 'brigham city', 'highland', 'centerville', 'south ogden', 'park city', 'west haven', 'bluffdale', 'price', 'santaquin', 'smithfield', 'woods cross', 'tremonton', 'lindon', 'north logan', 

## New `spacy` based location extraction method

In [69]:
import spacy


# prep spacy: load the small English pipeline
nlp = spacy.load('en_core_web_sm')
# for each article, run NLP on its text and keep only the entities labelled
# 'GPE', stored as (entity text, label) pairs
tokens_list = [
  [(ent.text, ent.label_) for ent in nlp(text).ents if ent.label_ == 'GPE']
  for text in df['text']
]

df['tokens'] = tokens_list

In [74]:
# What are the results? Inspect the GPE entities extracted for the article at index 2
df['tokens'][2]

[('China', 'GPE'),
 ('Massachusetts', 'GPE'),
 ('the United States', 'GPE'),
 ('China', 'GPE'),
 ('China', 'GPE'),
 ('Massachusetts', 'GPE'),
 ('Massachusetts', 'GPE'),
 ('Syracuse', 'GPE'),
 ('New York', 'GPE'),
 ('U.S.', 'GPE'),
 ('Massachusetts', 'GPE'),
 ('U.S.', 'GPE'),
 ('China', 'GPE'),
 ('Springfield', 'GPE'),
 ('Massachusetts', 'GPE'),
 ('U.S.', 'GPE'),
 ('China', 'GPE'),
 ('China', 'GPE'),
 ('Swansea', 'GPE'),
 ('East Bridgewater', 'GPE'),
 ('China', 'GPE'),
 ('Wrentham', 'GPE'),
 ('the U.S. Mail', 'GPE'),
 ('Winthrop', 'GPE'),
 ('Grasso', 'GPE'),
 ('Pagliuca', 'GPE'),
 ('Suffolk County', 'GPE'),
 ('Worcester', 'GPE'),
 ('China', 'GPE'),
 ('marijuana', 'GPE')]