In [1]:
#This notebook was made to explore Twitter data and compare current tweets 
#filtered for #police to keywords found in the 846 Police Brutality data

In [2]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [3]:
# required imports

from bs4 import BeautifulSoup
import json
import matplotlib.pyplot as plt
import pandas as pd
import re
import requests
import spacy
from spacy.tokenizer import Tokenizer
import urllib3

##Load and Tokenize 846 data

In [4]:
url="https://api.846policebrutality.com/api/incidents?include=evidence"
http = urllib3.PoolManager()
response = http.request('GET', url)
soup = BeautifulSoup(response.data, "html.parser")

json_846 = json.loads(soup.text)



In [5]:
#Retrieve data from json_846 data key
incidents = json_846['data']

In [6]:
# Create dataframe from the 846 API incident data
df_846 = pd.DataFrame(incidents)

# Change data type for 'date' column to datetime type
df_846['date'] = pd.to_datetime(df_846['date'], infer_datetime_format=True)

# Drop empty columns, description is also empty but needed for current format
# Evidence also isn't needed, as well as id (pub_id needs to be renamed to id)
df_846 = df_846.drop(columns=['data', 'evidence', 'id'])

# Check the top 5 rows of the new dataframe
df_846.head()

Unnamed: 0,pb_id,state,city,date,title,description,links,tags,geocoding
0,or-portland-409,Oregon,Portland,2020-11-08 08:00:00,DHS agents arrest & tear gas protesters over t...,,[https://twitter.com/AdamnCostelloTV/status/13...,"[arrest, journalist, less-lethal, pepper-ball,...","{'lat': '45.5051064', 'long': '-122.6750261'}"
1,or-portland-404,Oregon,Portland,2020-11-04 08:00:00,Officers charged protesters & press,,[https://twitter.com/Cascadianphotog/status/13...,"[journalist, protester, push, shove]","{'lat': '45.5051064', 'long': '-122.6750261'}"
2,or-portland-405,Oregon,Portland,2020-11-04 08:00:00,"Officers shove, strike, and arrest protesters",,[https://twitter.com/ByMikeBaker/status/132420...,"[arrest, baton, protester, push, shove, strike]","{'lat': '45.5051064', 'long': '-122.6750261'}"
3,or-portland-406,Oregon,Portland,2020-11-04 08:00:00,Officers make violent arrests,,[https://twitter.com/TheRealCoryElia/status/13...,"[arrest, protester, push, shove]","{'lat': '45.5051064', 'long': '-122.6750261'}"
4,or-portland-407,Oregon,Portland,2020-11-04 08:00:00,Protester tackled and arrested,,[https://mobile.twitter.com/jovannithe1st/stat...,"[arrest, protester, tackle]","{'lat': '45.5051064', 'long': '-122.6750261'}"


##Use Spacy to tokenize 846 data

In [7]:
#instantiate nlp model
nlp = spacy.load("en_core_web_sm")

#Instantiaiting tokenizer
tokenizer = Tokenizer(nlp.vocab)

In [8]:
#lemmatisation function

def tokenize(doc):
    """ Function tokenzing data and returning a list"""
    lemmas = []
    doc = nlp(doc)
    for token in doc: 
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)
    return lemmas

In [9]:
#need to concatenate tags and title, converting tag to list first

def listToString(s):
  str1 = ""
  for elem in s:
    str1 += elem
    return str1

In [10]:
df_846['tags'] = df_846['tags'].apply(listToString)

In [11]:
df_846['concat'] = df_846['title'] + " " + df_846['tags']

In [12]:
#tokenize column containing both tags and title

df_846['tokens'] = df_846['concat'].apply(tokenize)

In [13]:
#tokens from posts containing examples of police use of force
#Because 846 contains instances of police use of force, these 
#tokens can be used to find similar posts on Twitter or other
#social media

print(df_846.shape)
df_846['tokens'].head(10)

(1261, 11)


0    [DHS, agent, arrest, tear, gas, protester, thr...
1      [officer, charge, protester, press, journalist]
2    [officer, shove, strike, arrest, protester, ar...
3                   [officer, violent, arrest, arrest]
4                  [Protester, tackle, arrest, arrest]
5    [Member, Vice, film, crew, harass, assault, ba...
6            [officer, shoot, protester, head, lethal]
7                 [Police, arrest, journalist, arrest]
8          [Police, charge, arrest, protester, arrest]
9    [Police, assault, hospitalize, protester, arrest]
Name: tokens, dtype: object

In [14]:
# Vector Representation
# Feature extraction

from sklearn.feature_extraction.text import TfidfVectorizer

tdf = TfidfVectorizer(ngram_range=(1,2),
                    max_df=.97,
                     min_df=3,
                    tokenizer=tokenize)
# Fit and Transforming text
vect = tdf.fit_transform(df_846['concat'])
# Making a dataframe of feature names
vect = pd.DataFrame(vect.todense(), columns=tdf.get_feature_names())

vect.head()

Unnamed: 0,abuse,abuse power,active,activist,agent,agent arrest,agent deploy,agent fire,agent shoot,aid,air,air journalist,alleged,allegedly,angeles,angeles law,apartment,apparent,apparent cause,apparent justification,apparent reason,area,arm,arrest,arrest abuse,arrest arrest,arrest bystander,arrest federal,arrest journalist,arrest man,arrest peaceful,arrest pepper,arrest police,arrest protest,arrest protester,arrest reporter,arrest tear,arrest woman,arrestee,assault,...,unarmed man,unarmed protester,unlawful,unmarked,use,use bike,use excessive,use flashbang,use lrad,use pepper,use tear,van,vandalism,vehicle,veteran,violation,violation arrest,violence,violent,violent arrest,violently,violently arrest,violently push,walk,walk away,walk home,warn,water,water bottle,way,way crowd,wheelchair,white,window,woman,woman arrest,woman ground,woman shoot,wound,yell
0,0.0,0.0,0.0,0.0,0.223359,0.316059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.206665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316059,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.251188,0.273182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.358595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.337083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.388436,0.0,0.366777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.387237,0.39604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.393333,0.0,0.371401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##Twitter EDA

Including tweets is challenging because we need to sift through a lot of data to find relevant posts. These tweets were scraped using 'police' as a filter, and the filter could be more customized. 

We need to compare tweet data with the 846 data and remove duplicates. One way to do this is by comparing links, however not every tweet includes a link. Another idea is to compare geo location but not every tweet includes a cooordinates, and more than one event can take place at the same coordinates. 

In [15]:
#freshly scraped tweets

tweet_df = pd.read_csv('tweets.csv')

In [16]:
tweet_df.shape

(1572, 12)

In [17]:
tweet_df.head()

Unnamed: 0,id,user_description,user_location,coordinates,text,geo,user_name,user_created,id_str,created,source,language
0,1,"Latest News, World News, USA News, Technology ...",USA,,Police Chief Who Backed Costs In opposition to...,,LatestN72062740,2020-08-05T09:02:41,1328916595457069057,2020-11-18T04:23:01,https://latestnews.in.net,en
1,2,,"California, USA",,ABSOLUTELY MADDENING!,,RedLola2017,2018-08-17T19:57:20,1328916601047904259,2020-11-18T04:23:02,,en
2,3,"Justin Trudeau is an imbecile, national disgra...",C eh N eh D eh,,@NitraQueen @BrianPallister Economic suicide a...,,scannon_lou,2020-02-23T04:31:57,1328916605150007298,2020-11-18T04:23:03,,en
3,4,I block for logical fallacies. #HumansAreAnimals,"Miami Lakes, FL",,@OScottHartman @allymayn Walter Cronkite looke...,,Streamheat,2009-03-21T16:28:54,1328916607570243584,2020-11-18T04:23:03,,en
4,5,,,,"Shame on you, TH government and who involved w...",,Zwartkat1,2019-11-08T05:49:49,1328916608434073600,2020-11-18T04:23:04,,en


In [18]:
#filter out non-english posts. This can also be done 
# by adjusting the twitter scraper

tweet_df = tweet_df[tweet_df['language'] == 'en']

In [22]:
tweet_df.head()

Unnamed: 0,id,user_description,user_location,coordinates,text,geo,user_name,user_created,id_str,created,source,language
0,1,"Latest News, World News, USA News, Technology ...",USA,,Police Chief Who Backed Costs In opposition to...,,LatestN72062740,2020-08-05T09:02:41,1328916595457069057,2020-11-18T04:23:01,https://latestnews.in.net,en
1,2,,"California, USA",,ABSOLUTELY MADDENING!,,RedLola2017,2018-08-17T19:57:20,1328916601047904259,2020-11-18T04:23:02,,en
2,3,"Justin Trudeau is an imbecile, national disgra...",C eh N eh D eh,,@NitraQueen @BrianPallister Economic suicide a...,,scannon_lou,2020-02-23T04:31:57,1328916605150007298,2020-11-18T04:23:03,,en
3,4,I block for logical fallacies. #HumansAreAnimals,"Miami Lakes, FL",,@OScottHartman @allymayn Walter Cronkite looke...,,Streamheat,2009-03-21T16:28:54,1328916607570243584,2020-11-18T04:23:03,,en
4,5,,,,"Shame on you, TH government and who involved w...",,Zwartkat1,2019-11-08T05:49:49,1328916608434073600,2020-11-18T04:23:04,,en


In [23]:
#tokenize tweets, may be used to find similar headlines

tweet_df['tokens'] = tweet_df['text'].apply(tokenize)

In [24]:
tweet_df['tokens'].head() 

0    [police, Chief, back, cost, opposition, Lawmak...
1                              [ABSOLUTELY, maddening]
2    [@nitraqueen, @brianpallister, economic, suici...
3    [@oscotthartman, @allymayn, Walter, Cronkite, ...
4    [shame, th, government, involve, violent, peop...
Name: tokens, dtype: object

In [25]:
tweet_df.head()

Unnamed: 0,id,user_description,user_location,coordinates,text,geo,user_name,user_created,id_str,created,source,language,tokens
0,1,"Latest News, World News, USA News, Technology ...",USA,,Police Chief Who Backed Costs In opposition to...,,LatestN72062740,2020-08-05T09:02:41,1328916595457069057,2020-11-18T04:23:01,https://latestnews.in.net,en,"[police, Chief, back, cost, opposition, Lawmak..."
1,2,,"California, USA",,ABSOLUTELY MADDENING!,,RedLola2017,2018-08-17T19:57:20,1328916601047904259,2020-11-18T04:23:02,,en,"[ABSOLUTELY, maddening]"
2,3,"Justin Trudeau is an imbecile, national disgra...",C eh N eh D eh,,@NitraQueen @BrianPallister Economic suicide a...,,scannon_lou,2020-02-23T04:31:57,1328916605150007298,2020-11-18T04:23:03,,en,"[@nitraqueen, @brianpallister, economic, suici..."
3,4,I block for logical fallacies. #HumansAreAnimals,"Miami Lakes, FL",,@OScottHartman @allymayn Walter Cronkite looke...,,Streamheat,2009-03-21T16:28:54,1328916607570243584,2020-11-18T04:23:03,,en,"[@oscotthartman, @allymayn, Walter, Cronkite, ..."
4,5,,,,"Shame on you, TH government and who involved w...",,Zwartkat1,2019-11-08T05:49:49,1328916608434073600,2020-11-18T04:23:04,,en,"[shame, th, government, involve, violent, peop..."


##Drop duplicate links

We are using the 846 column to check for duplicates in the twitter dataframe

In [26]:
#pull out new df with links only from 846 to compare with Twitter

df_src = df_846['links']

In [27]:
df_src.head()

0    [https://twitter.com/AdamnCostelloTV/status/13...
1    [https://twitter.com/Cascadianphotog/status/13...
2    [https://twitter.com/ByMikeBaker/status/132420...
3    [https://twitter.com/TheRealCoryElia/status/13...
4    [https://mobile.twitter.com/jovannithe1st/stat...
Name: links, dtype: object

In [28]:
df_src = df_src.explode('src').drop_duplicates()

In [29]:
df_src.shape

(3428,)

In [30]:
df_src.head()

0    https://twitter.com/AdamnCostelloTV/status/132...
1    https://twitter.com/BoopTroopEugene/status/132...
2    https://twitter.com/PDocumentarians/status/132...
3    https://twitter.com/Cascadianphotog/status/132...
4    https://twitter.com/ByMikeBaker/status/1324187...
Name: links, dtype: object

In [31]:
#add list as column
tweet_df['src846'] = df_src

In [91]:
link_drop_df = tweet_df.drop(columns=['user_description','user_location',
                                      'coordinates','text', 'geo', 'user_name',
                                      'user_created', 'language', 'tokens',
                                      'created', 'id_str'])

In [92]:
link_drop_df = link_drop_df.drop_duplicates(['source', 'src846'], keep=False)

In [93]:
link_drop_df = link_drop_df.drop(columns='src846')

In [97]:
df = pd.merge(tweet_df, link_drop_df, how='inner')

In [100]:
df = df.drop(columns='src846')

In [101]:
df.head()

Unnamed: 0,id,user_description,user_location,coordinates,text,geo,user_name,user_created,id_str,created,source,language,tokens
0,1,"Latest News, World News, USA News, Technology ...",USA,,Police Chief Who Backed Costs In opposition to...,,LatestN72062740,2020-08-05T09:02:41,1328916595457069057,2020-11-18T04:23:01,https://latestnews.in.net,en,"[police, Chief, back, cost, opposition, Lawmak..."
1,2,,"California, USA",,ABSOLUTELY MADDENING!,,RedLola2017,2018-08-17T19:57:20,1328916601047904259,2020-11-18T04:23:02,,en,"[ABSOLUTELY, maddening]"
2,3,"Justin Trudeau is an imbecile, national disgra...",C eh N eh D eh,,@NitraQueen @BrianPallister Economic suicide a...,,scannon_lou,2020-02-23T04:31:57,1328916605150007298,2020-11-18T04:23:03,,en,"[@nitraqueen, @brianpallister, economic, suici..."
3,4,I block for logical fallacies. #HumansAreAnimals,"Miami Lakes, FL",,@OScottHartman @allymayn Walter Cronkite looke...,,Streamheat,2009-03-21T16:28:54,1328916607570243584,2020-11-18T04:23:03,,en,"[@oscotthartman, @allymayn, Walter, Cronkite, ..."
4,5,,,,"Shame on you, TH government and who involved w...",,Zwartkat1,2019-11-08T05:49:49,1328916608434073600,2020-11-18T04:23:04,,en,"[shame, th, government, involve, violent, peop..."


In [105]:
#geo and coordinates are very similar and contain a lot of nans
#could possibly filter for posts with both geo or coordinates,
#and also a link before using a classification model to determine
#use of force. 

print(df['geo'].value_counts().sum())
print(df['coordinates'].value_counts().sum())

11
11


In [110]:
df_geo = df[df['geo'].notna()]

In [130]:
df_geo.head()

Unnamed: 0,id,user_description,user_location,coordinates,text,geo,user_name,user_created,id_str,created,source,language,tokens
171,203,Unofficial automated posting of calls for serv...,"Portland, Oregon","{""type"": ""Point"", ""coordinates"": [-122.651502,...",SUSPICIOUS - PRIORITY at NE BROADWAY / NE 14TH...,"{""type"": ""Point"", ""coordinates"": [45.53504, -1...",pdxpolicelog,2013-07-18T07:49:09,1328917353707442177,2020-11-18T04:26:01,http://www.civicapps.org/datasets/911-dispatch...,en,"[SUSPICIOUS, PRIORITY, NE, BROADWAY, NE, 14th,..."
426,516,Unofficial automated posting of calls for serv...,"Portland, Oregon","{""type"": ""Point"", ""coordinates"": [-122.679284,...","SUSPICIOUS SUBJ, VEH, OR CIRCUMSTANCE at 700 S...","{""type"": ""Point"", ""coordinates"": [45.518886, -...",pdxpolicelog,2013-07-18T07:49:09,1328918361737027586,2020-11-18T04:30:02,http://www.civicapps.org/datasets/911-dispatch...,en,"[SUSPICIOUS, SUBJ, VEH, CIRCUMSTANCE, 700, SW,..."
428,519,"Unofficial automated posting of east county, e...","Multnomah County, Oregon","{""type"": ""Point"", ""coordinates"": [-122.392737,...","SHOTS FIRED at 1300 NE BARNES CT, GRSM [Gresha...","{""type"": ""Point"", ""coordinates"": [45.506797, -...",pdxsherrifflog,2013-07-19T22:54:21,1328918366052962307,2020-11-18T04:30:03,http://www.civicapps.org/datasets/911-dispatch...,en,"[SHOTS, FIRED, 1300, NE, BARNES, CT, grsm, Gre..."
429,521,Unofficial automated posting of calls for serv...,"Portland, Oregon","{""type"": ""Point"", ""coordinates"": [-122.724265,...",ACCIDENT - HIT AND RUN - COLD at 5300 N LOMBAR...,"{""type"": ""Point"", ""coordinates"": [45.582874, -...",pdxpolicelog,2013-07-18T07:49:09,1328918368271826945,2020-11-18T04:30:03,http://www.civicapps.org/datasets/911-dispatch...,en,"[ACCIDENT, HIT, run, cold, 5300, N, LOMBARD, S..."
710,863,Unofficial automated posting of calls for serv...,"Portland, Oregon","{""type"": ""Point"", ""coordinates"": [-122.616951,...","THREAT - COLD at 400 SE 44TH AVE, PORT [Portla...","{""type"": ""Point"", ""coordinates"": [45.520117, -...",pdxpolicelog,2013-07-18T07:49:09,1328919369544417280,2020-11-18T04:34:02,http://www.civicapps.org/datasets/911-dispatch...,en,"[threat, cold, 400, SE, 44th, AVE, port, Portl..."
