**Labs28 notebook for creating the merged dataset.**

In [None]:
# Install newspaper3k for article parser
! pip3 install newspaper3k



In [None]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from collections import Counter
from newspaper import Article
import json
import re
import requests
import spacy
from spacy.tokenizer import Tokenizer
import urllib3

nlp = spacy.load('en_core_web_sm')

## First dataframe created from the github r/Police Brutality 2020 page.

In [None]:
# Raw GitHub URL of the aggregated 2020PB incident JSON.
# Paste it into a browser to inspect the latest published build.
all_locs = 'https://raw.githubusercontent.com/2020PB/police-brutality/data_build/all-locations-v2.json'

# Read the JSON document into a dataframe; the code below reads its 'data' and 'updated_at' columns
df_gitjson = pd.read_json(all_locs)

# Expand the records in the 'data' column into their own flat dataframe
df_2020PB = pd.json_normalize(data=df_gitjson['data'])
df_2020PB['updated_at'] = df_gitjson['updated_at']

# Keep the first 'updated_at' value; the (commented-out) CSV export uses it in the filename
last_updated = df_gitjson['updated_at'].iloc[0]

### Preprocess df_2020PB

# Build the cleaned frame in one chain and rebind the name, instead of mutating
# in place. The explicit .copy() makes the result an independent frame, so the
# later column assignments on df_2020PB do not raise SettingWithCopyWarning.
df_2020PB = (
    df_2020PB
    # Rename columns
    .rename(columns={'name': 'title'})
    # Drop irrelevant columns
    .drop(columns=['edit_at', 'date_text'])
    # Reorder column headers (any column not listed here is dropped as well)
    .loc[:, ['date', 'links', 'id', 'city', 'state', 'geolocation', 'title', 'tags', 'description']]
    .copy()
)

# Update the "date" column to timestamps
df_2020PB['date'] = pd.to_datetime(df_2020PB['date'], format='%Y-%m-%d')


# Write function to create hyperlinks for the 'links' columns
def cleanlinks(json):
    """
    Pull the URL out of every link record.

    :json: iterable of mappings, each with a 'url' key
           (note: the parameter name shadows the `json` module inside this function)
    :return: list of the 'url' values, in the original order
    """
    return [entry['url'] for entry in json]


# Replace each row's list of link records with a plain list of URL strings.
# Not re-runnable: once this has run the list elements are strings, so calling
# cleanlinks on them again fails at link['url'].
df_2020PB['links'] = df_2020PB['links'].apply(cleanlinks)

# Ensure that dataframe was created correctly
df_2020PB

Unnamed: 0,date,links,id,city,state,geolocation,title,tags,description
0,2020-05-31,[https://www.wyff4.com/article/tear-gas-rubber...,nc-asheville-2,Asheville,North Carolina,"35.5909699, -82.5737781",Police fire tear gas and rubber bullets at pro...,"[child, less-lethal, protester, rubber-bullet,...",According to eyewitness accounts and news repo...
1,2020-06-02,[https://twitter.com/AngelaMWilhelm/status/126...,nc-asheville-1,Asheville,North Carolina,"35.5948849, -82.5523578",Police surround approved medical station and d...,"[medic, property-destruction]","Police destroy supplies, including food and wa..."
2,NaT,[https://twitter.com/greg_doucette/status/1270...,nc-asheville-3,Asheville,North Carolina,"35.5903468, -82.574270",Man shot in head with pepper ball,"[less-lethal, pepper-ball, protester, shoot]",A man was shot in the head at close range with...
3,2020-08-09,"[https://ashevilleblade.com/?p=3904, https://t...",nc-asheville-4,Asheville,North Carolina,"35.5950462, -82.5564847",Reporter for The Blade arrested at protest,"[arrest, inhumane-treatment, journalist, zip-tie]",A journalist working for The Asheville Blade w...
4,2020-06-02,"[https://www.instagram.com/tv/CA9UOKAj7MC/, ht...",nc-charlotte-1,Charlotte,North Carolina,"35.2254977, -80.8452494",Law enforcement close on protesters from both ...,"[less-lethal, pepper-ball, protester, stun-gre...",Law enforcement close on protesters from both ...
...,...,...,...,...,...,...,...,...,...
1249,2020-11-04,[https://mobile.twitter.com/jovannithe1st/stat...,or-portland-407,Portland,Oregon,"45.5216957, -122.6801068",Protester tackled and arrested,"[arrest, protester, tackle]",Members of the Oregon State Police charged an ...
1250,2020-11-04,[https://twitter.com/misstessowen/status/13242...,or-portland-408,Portland,Oregon,"45.5183617, -122.6815986",Member of Vice film crew harassed and assaulted,"[baton, grab, journalist, push, shove, strike,...",A VICE film crew was harassed by members of th...
1251,2020-11-08,[https://twitter.com/AdamnCostelloTV/status/13...,or-portland-409,Portland,Oregon,"45.4927916, -122.6726079",DHS agents arrest & tear gas protesters over t...,"[arrest, journalist, less-lethal, pepper-ball,...","Shortly after midnight on November 8th, DHS ag..."
1252,2020-06-04,[https://twitter.com/greg_doucette/status/1268...,or-salem-1,Salem,Oregon,,"Before firing tear gas, Salem PD warns white a...","[inhumane-treatment, less-lethal, tear-gas]",A police officer is seen warning white armed m...


In [None]:
# Extract the incident data from the 846 API
# https://incidents.846policebrutality.com/

# Open this URL in a browser to inspect the current 846 API payload
url = "https://api.846policebrutality.com/api/incidents"
http = urllib3.PoolManager()
response = http.request('GET', url)

# Fail loudly on an error response instead of trying to parse an error page as JSON
if response.status != 200:
    raise RuntimeError(f'846 API request failed with HTTP status {response.status}')

# The endpoint returns JSON, so decode the body directly. Passing it through an
# HTML parser first can alter the payload before json.loads sees it (character
# entities are unescaped and anything that looks like a tag is stripped).
json_846 = json.loads(response.data.decode('utf-8'))

# Check length of the json_846 file
# print(len(json_846['data']))
# json_846  # Uncomment to see the json_846 object

# Create a dataframe from the incident records stored under the 'data' key
df_846 = pd.DataFrame(json_846['data'])

### Preprocessing

# Build the cleaned frame in one chain and rebind the name rather than mutating in place.
df_846 = (
    df_846
    # Change data type for 'date' column to datetime type.
    # `infer_datetime_format` is omitted: it is deprecated in pandas 2.x, where
    # format inference is already the default behaviour.
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    # Drop irrelevant columns
    .drop(columns=['data', 'pb_id'])
    # Rename columns so they line up with df_2020PB
    .rename(columns={'geocoding': 'geolocation'})
    # Reorder columns
    .loc[:, ['date', 'links', 'id', 'city', 'state', 'geolocation', 'title', 'tags']]
)

# Check the dataframe
df_846



Unnamed: 0,date,links,id,city,state,geolocation,title,tags
0,2020-11-08 08:00:00,[https://twitter.com/AdamnCostelloTV/status/13...,4433bf00-23a8-11eb-aecf-cd6a37490b61,Portland,Oregon,"{'lat': '45.5051064', 'long': '-122.6750261'}",DHS agents arrest & tear gas protesters over t...,"[arrest, journalist, less-lethal, pepper-ball,..."
1,2020-11-04 08:00:00,[https://twitter.com/Cascadianphotog/status/13...,3b900fa0-1fb7-11eb-9f22-1d1e378b7123,Portland,Oregon,"{'lat': '45.5051064', 'long': '-122.6750261'}",Officers charged protesters & press,"[journalist, protester, push, shove]"
2,2020-11-04 08:00:00,[https://twitter.com/ByMikeBaker/status/132420...,3b99a900-1fb7-11eb-958f-817385610795,Portland,Oregon,"{'lat': '45.5051064', 'long': '-122.6750261'}","Officers shove, strike, and arrest protesters","[arrest, baton, protester, push, shove, strike]"
3,2020-11-04 08:00:00,[https://twitter.com/TheRealCoryElia/status/13...,3ba283a0-1fb7-11eb-9b56-5f23b4c8f138,Portland,Oregon,"{'lat': '45.5051064', 'long': '-122.6750261'}",Officers make violent arrests,"[arrest, protester, push, shove]"
4,2020-11-04 08:00:00,[https://mobile.twitter.com/jovannithe1st/stat...,3baf70a0-1fb7-11eb-a425-3d2bf03262cb,Portland,Oregon,"{'lat': '45.5051064', 'long': '-122.6750261'}",Protester tackled and arrested,"[arrest, protester, tackle]"
...,...,...,...,...,...,...,...,...
1256,2020-05-26 07:00:00,[https://www.facebook.com/damicedsota.thespiri...,653ee2b0-156b-11eb-b242-f9e25f6eb571,Minneapolis,Minnesota,"{'lat': '44.9777530', 'long': '-93.2650108'}",Man has his gun confiscated in an open carry s...,"[abuse-of-power, arrest, protester]"
1257,2020-05-26 07:00:00,[https://www.facebook.com/1462345700/posts/102...,654403b0-156b-11eb-a46e-fb762f48fab1,Minneapolis,Minnesota,"{'lat': '44.9777530', 'long': '-93.2650108'}",Police shoot flashbang grenades into crowd,"[less-lethal, protester, rubber-bullet, stun-g..."
1258,1900-01-01 08:00:00,[https://twitter.com/greg_doucette/status/1270...,60b50970-156b-11eb-a40c-4d194d71e998,Asheville,North Carolina,"{'lat': '35.5950581', 'long': '-82.5514869'}",Man shot in head with pepper ball,"[less-lethal, pepper-ball, protester, shoot]"
1259,1900-01-01 08:00:00,[https://twitter.com/DomSkyeRN/status/12701854...,80dcd230-156b-11eb-8917-a141e15545d1,Houston,Texas,"{'lat': '29.7604267', 'long': '-95.3698028'}",Officer pushs protester that is filming,"[arrest, property-destruction, protester, push..."


### What is the difference between the information received in the first dataframe and the second? Are there any duplicate links relaying the same information?


In [None]:
# The 846 API data already comes in date order (the displayed frame runs newest to oldest),
# so the row labelled 0 holds the most recent incident date.
df_846['date'][0]

Timestamp('2020-11-08 08:00:00')

# Create the merged dataframe from the r/2020PB data and the 846 API

In [None]:
# Report the (rows, columns) shape of each source and the row count expected after stacking them
print(f'df_2020PB shape: {df_2020PB.shape}')
print(f'df_846: {df_846.shape}')
print(f'There will be a total of {df_2020PB.shape[0] + df_846.shape[0]} rows.')

df_2020PB shape: (1254, 9)
df_846: (1261, 8)
There will be a total of 2515 rows.


In [None]:
# Stack the two datasets and drop rows whose id appears more than once
frames = [df_2020PB, df_846]
merged_dfs = pd.concat(frames)
# The index still carries each source's own row labels here; it is reset
# further down, once the out-of-range dates have been removed.
print(f'There are currently {merged_dfs.shape[0]}.')

# drop_duplicates returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves merged_dfs untouched.
# NOTE(review): in the frames displayed above the two sources use different id
# schemes (slugs such as 'nc-asheville-2' vs. UUIDs), so id-based de-duplication
# may still match nothing across sources — confirm whether another key is needed.
merged_dfs = merged_dfs.drop_duplicates(subset=["id"])
print(f'Now, there are currently {merged_dfs.shape[0]} after dropping duplicate ids.')

There are currently 2515.
Now, there are currently 2515 after dropping duplicate ids.


In [None]:
# Sort by date
merged_dfs = merged_dfs.sort_values(by='date')

# Replace missing descriptions with the string "None".
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column is chained assignment, which may act on a temporary copy
# and leave merged_dfs unchanged.
merged_dfs['description'] = merged_dfs['description'].fillna("None")

# Missing geolocations are mapped as empty strings in one source and as NaN in
# the other; normalise both to the string "None".
# (np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.)
merged_dfs['geolocation'] = (
    merged_dfs['geolocation'].replace({"": np.nan}).fillna("None")
)

# Remove outliers: keep only incidents dated within the year 2020.
# A half-open range covers the whole year, including every timestamp on
# 30 and 31 December; rows with a missing date (NaT) compare False and are dropped.
in_2020 = (merged_dfs['date'] >= '2020-01-01') & (merged_dfs['date'] < '2021-01-01')
merged_dfs = merged_dfs.loc[in_2020]

# Reset index; drop=True discards the old labels instead of adding an 'index' column
merged_dfs = merged_dfs.reset_index(drop=True)

# Create a latitude (lat) and longitude (lon) column.
# Create function to create lat and long from geolocation column
def splitGeolocation(item):
    """
    Parse a single geolocation value into its latitude and longitude.

    :item: one geolocation entry: a "lat, long" string, a dict with 'lat'
            and 'long' keys, or a missing-value placeholder (the string
            'None' or any other type)
    :return: one-element list holding the latitude as a float, or the
            string "None" when no coordinates are available
    :return: one-element list holding the longitude as a float, or the
            string "None" when no coordinates are available
    """
    # Comma-separated string form, e.g. "35.59, -82.57"
    if isinstance(item, str) and item != 'None':
        parts = item.split(',')
        return [float(parts[0])], [float(parts[1])]

    # Dictionary form, e.g. {'lat': '45.50', 'long': '-122.67'}
    if type(item) == dict:
        return [float(item['lat'])], [float(item['long'])]

    # Anything else is treated as a null value
    return ["None"], ["None"]


# Parse every geolocation once, then split the (lat, lon) pairs into two columns.
# splitGeolocation returns one-element lists, hence the [0].
parsed_geo = [splitGeolocation(geo) for geo in merged_dfs['geolocation']]
merged_dfs['lat'] = [lat_list[0] for lat_list, _ in parsed_geo]
merged_dfs['long'] = [lon_list[0] for _, lon_list in parsed_geo]

# Drop the geolocation column
merged_dfs = merged_dfs.drop(columns=['geolocation'])

# Keep only the output columns, in their final order
merged_dfs = merged_dfs[['date', 'links', 'id', 'city', 'state', 'lat', 'long',
                         'title', 'description', 'tags']]

### **[X] Decide what format the geolocation column needs to be. Should the current column hold only dicts, or should two new columns be created, one for lat and one for lon, stored as ints?**

We decided to create two columns holding the latitude and longitude as floats,
inserting the placeholder string "None" (not NaN) where no value exists. The
geolocation column was dropped since it wasn't being used on the front-end to
populate any real data.

In [None]:
# Display the merged frame after cleaning and the lat/long split
merged_dfs

Unnamed: 0,date,links,id,city,state,lat,long,title,description,tags
0,2020-05-26 00:00:00,[https://www.facebook.com/damicedsota.thespiri...,mn-minneapolis-28,Minneapolis,Minnesota,44.9413,-93.2626,Man has his gun confiscated in an open carry s...,Man encounters police arresting people open ca...,"[abuse-of-power, arrest, protester]"
1,2020-05-26 00:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",mn-minneapolis-21,Minneapolis,Minnesota,44.9479,-93.2349,Police hit press in neck and head with wooden ...,A group of cops start to approach a group of p...,"[baton, beat, journalist, strike]"
2,2020-05-26 00:00:00,[https://www.facebook.com/1462345700/posts/102...,mn-minneapolis-14,Minneapolis,Minnesota,44.9481,-93.237,Police shoot flashbang grenades into crowd,Police on the rooftop of the 3rd precinct fire...,"[less-lethal, protester, rubber-bullet, stun-g..."
3,2020-05-26 07:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",6537b170-156b-11eb-9a05-43449b932225,Minneapolis,Minnesota,44.9778,-93.265,Police hit press in neck and head with wooden ...,,"[baton, beat, journalist, strike]"
4,2020-05-26 07:00:00,[https://www.facebook.com/damicedsota.thespiri...,653ee2b0-156b-11eb-b242-f9e25f6eb571,Minneapolis,Minnesota,44.9778,-93.265,Man has his gun confiscated in an open carry s...,,"[abuse-of-power, arrest, protester]"
...,...,...,...,...,...,...,...,...,...,...
2504,2020-11-04 08:00:00,[https://twitter.com/MarcusKulik/status/132425...,bf4ee070-1fb8-11eb-bae4-95bdb3987ef2,Seattle,Washington,47.6062,-122.332,Police assault and hospitalize protester,,"[arrest, protester, punch, tackle]"
2505,2020-11-04 08:00:00,[https://twitter.com/onelung_/status/132420651...,bf46b270-1fb8-11eb-b018-5fba53929c9e,Seattle,Washington,47.6062,-122.332,Police arrest bystander in vehicle,,"[arrest, bike, bystander, property-destruction]"
2506,2020-11-04 08:00:00,[https://mobile.twitter.com/jovannithe1st/stat...,3baf70a0-1fb7-11eb-a425-3d2bf03262cb,Portland,Oregon,45.5051,-122.675,Protester tackled and arrested,,"[arrest, protester, tackle]"
2507,2020-11-08 00:00:00,[https://twitter.com/AdamnCostelloTV/status/13...,or-portland-409,Portland,Oregon,45.4928,-122.673,DHS agents arrest & tear gas protesters over t...,"Shortly after midnight on November 8th, DHS ag...","[arrest, journalist, less-lethal, pepper-ball,..."


## Natural Language Pre-Processing and Analytics

In [None]:
def remove_list(col):
    """
    Flatten each row's list of items into one space-separated string.

    Repeated items within a row are kept only once, in first-seen order.
    Items are compared as whole strings, so a tag that merely appears inside
    an earlier one (e.g. 'gas' after 'tear-gas', 'tase' after 'taser') is
    kept rather than silently dropped.

    :col: iterable of rows, each row an iterable of items (e.g. a column of
            tag lists)
    :return: list with one string per row; every item is preceded by a single
            space (so non-empty strings start with a space) and an empty row
            yields ""
    """
    flattened = []
    for row in col:
        # dict.fromkeys de-duplicates while preserving insertion order
        unique_items = dict.fromkeys(str(item) for item in row)
        flattened.append("".join(" " + item for item in unique_items))
    return flattened

# Flatten each row's tag list into a single space-joined string, stored in a new 'words' column
merged_dfs['words'] = remove_list(merged_dfs['tags'])

In [None]:
from spacy.tokenizer import Tokenizer

# Load the small English spaCy pipeline; its default stop-word set is extended below
nlp = spacy.load("en_core_web_sm")
# Tokenizer built on the pipeline's vocabulary
# NOTE(review): `tokenizer` is not referenced again in the cells shown here — confirm it is still needed
tokenizer = Tokenizer(nlp.vocab)
# Custom stop words: tags that do not describe police use of force
stop_words = [
            "celebrity",
            "child",
            "ederly",  # NOTE(review): misspelling; the correct 'elderly' is also listed at the end
            "lgbtq+",
            "homeless",
            "journalist",
            "non-protest",
            "person-with-disability",
            "medic",
            "politician",
            "pregnant",
            "property-desctruction",  # NOTE(review): misspelled, so the 'property-destruction' tag is NOT filtered out; it is listed under UNCATEGORIZED further down — confirm which is intended
            " ",  # can never match: remove_stops splits keywords on whitespace before comparing
            "bystander",
            "protester",
            "legal-observer",
            "hide-badge",
            'body-cam',
            "conceal",
            'elderly'
            ]
# Combine spaCy's default stop words with the custom list; union() returns a new set
stop = nlp.Defaults.stop_words.union(stop_words)

In [None]:
from tqdm import tqdm

# Register tqdm with pandas; this provides the `progress_apply` method used below
tqdm.pandas()


def remove_stops(_list_, stop_words=None):
    """
    Strip stop words out of a list of keywords.

    Each keyword is split on whitespace, words found in the stop-word
    collection are removed, and the remaining words are re-joined with single
    spaces. Keywords that end up empty are dropped from the result.

    :_list_: iterable of keyword strings (e.g. one row of tags)
    :stop_words: optional collection of words to remove; when omitted, the
            notebook-level `stop` set is used, which keeps existing
            single-argument calls (such as `progress_apply(remove_stops)`)
            working unchanged
    :return: list of the cleaned, non-empty keywords in their original order
    """
    if stop_words is None:
        # Resolved at call time so the notebook-level set is picked up
        stop_words = stop

    keywords = []
    for keyword in _list_:
        phrase = ' '.join(
            word for word in keyword.split() if word not in stop_words
        )
        if phrase:
            keywords.append(phrase)

    return keywords


# Strip stop words (and tags that aren't indicative of police use of force)
# from every row's tags, then swap the cleaned list in as the new 'tags'
# column and discard the intermediate 'words' column.
merged_dfs = (
    merged_dfs
    .assign(cleaned_tags=merged_dfs['tags'].progress_apply(remove_stops))
    .drop(columns=['words', 'tags'])
    .rename(columns={'cleaned_tags': 'tags'})
)
merged_dfs


100%|██████████| 2509/2509 [00:00<00:00, 155104.18it/s]


Unnamed: 0,date,links,id,city,state,lat,long,title,description,tags
0,2020-05-26 00:00:00,[https://www.facebook.com/damicedsota.thespiri...,mn-minneapolis-28,Minneapolis,Minnesota,44.9413,-93.2626,Man has his gun confiscated in an open carry s...,Man encounters police arresting people open ca...,"[abuse-of-power, arrest]"
1,2020-05-26 00:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",mn-minneapolis-21,Minneapolis,Minnesota,44.9479,-93.2349,Police hit press in neck and head with wooden ...,A group of cops start to approach a group of p...,"[baton, beat, strike]"
2,2020-05-26 00:00:00,[https://www.facebook.com/1462345700/posts/102...,mn-minneapolis-14,Minneapolis,Minnesota,44.9481,-93.237,Police shoot flashbang grenades into crowd,Police on the rooftop of the 3rd precinct fire...,"[less-lethal, rubber-bullet, stun-grenade, tea..."
3,2020-05-26 07:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",6537b170-156b-11eb-9a05-43449b932225,Minneapolis,Minnesota,44.9778,-93.265,Police hit press in neck and head with wooden ...,,"[baton, beat, strike]"
4,2020-05-26 07:00:00,[https://www.facebook.com/damicedsota.thespiri...,653ee2b0-156b-11eb-b242-f9e25f6eb571,Minneapolis,Minnesota,44.9778,-93.265,Man has his gun confiscated in an open carry s...,,"[abuse-of-power, arrest]"
...,...,...,...,...,...,...,...,...,...,...
2504,2020-11-04 08:00:00,[https://twitter.com/MarcusKulik/status/132425...,bf4ee070-1fb8-11eb-bae4-95bdb3987ef2,Seattle,Washington,47.6062,-122.332,Police assault and hospitalize protester,,"[arrest, punch, tackle]"
2505,2020-11-04 08:00:00,[https://twitter.com/onelung_/status/132420651...,bf46b270-1fb8-11eb-b018-5fba53929c9e,Seattle,Washington,47.6062,-122.332,Police arrest bystander in vehicle,,"[arrest, bike, property-destruction]"
2506,2020-11-04 08:00:00,[https://mobile.twitter.com/jovannithe1st/stat...,3baf70a0-1fb7-11eb-a425-3d2bf03262cb,Portland,Oregon,45.5051,-122.675,Protester tackled and arrested,,"[arrest, tackle]"
2507,2020-11-08 00:00:00,[https://twitter.com/AdamnCostelloTV/status/13...,or-portland-409,Portland,Oregon,45.4928,-122.673,DHS agents arrest & tear gas protesters over t...,"Shortly after midnight on November 8th, DHS ag...","[arrest, less-lethal, pepper-ball, projectile,..."


In [None]:
# Analyzing tokens
# Object from Base Python
from collections import Counter

# `Counter` accepts an iterable up front, but it can also be instantiated empty and updated as we go.
word_counts = Counter()

# Tally the tags of every incident, one row at a time
for incident_tags in merged_dfs['tags']:
    word_counts.update(incident_tags)

# Show up to the 75 most common tags
word_counts.most_common(75)  # All of the words

[('less-lethal', 1371),
 ('arrest', 754),
 ('shove', 678),
 ('shoot', 611),
 ('tear-gas', 542),
 ('pepper-spray', 450),
 ('spray', 430),
 ('projectile', 399),
 ('push', 398),
 ('strike', 303),
 ('baton', 301),
 ('tackle', 227),
 ('beat', 196),
 ('property-destruction', 184),
 ('rubber-bullet', 182),
 ('stun-grenade', 178),
 ('abuse-of-power', 174),
 ('pepper-ball', 134),
 ('knee', 128),
 ('tear-gas-canister', 106),
 ('grab', 102),
 ('threaten', 98),
 ('punch', 94),
 ('throw', 88),
 ('bike', 68),
 ('vehicle', 68),
 ('inhumane-treatment', 62),
 ('zip-tie', 62),
 ('shield', 48),
 ('knee-on-neck', 40),
 ('kick', 34),
 ('explosive', 32),
 ('lrad', 30),
 ('mace', 30),
 ('drive', 26),
 ('bean-bag', 20),
 ('gas', 18),
 ('choke', 18),
 ('foam-bullet', 14),
 ('taser', 14),
 ('tase', 12),
 ('horse', 10),
 ('gun', 10),
 ('racial-profiling', 10),
 ('incitement', 6),
 ('sexual-assault', 6),
 ('wooden-bullet', 4),
 ('marking-round', 4),
 ('death', 4),
 ('live-round', 4),
 ('sponge-round', 2),
 ('pain

In [None]:
# NOTE: all categories follow the National Institute of Justice (NIJ)
# use-of-force continuum definitions.
# For more information, visit https://nij.ojp.gov/topics/articles/use-force-continuum
VERBALIZATION = ['threaten', 'incitement']
EMPTY_HAND_SOFT = ['arrest', 'grab', 'zip-tie', ]
EMPTY_HAND_HARD = ['shove', 'push', 'strike', 'tackle', 'beat', 'knee', 'punch',
                   'throw', 'knee-on-neck', 'kick', 'choke', 'dog', 'headlock']
# The tag counts above list 'foam-bullet' and 'marking-round' (singular), so
# the singular spellings are included; the plural spellings are kept in case
# other data uses them.
LESS_LETHAL_METHODS = ['less-lethal', 'tear-gas', 'pepper-spray', 'baton',
                       'projectile', 'stun-grenade', 'pepper-ball',
                       'tear-gas-canister', 'explosive', 'mace', 'lrad',
                       'bean-bag', 'gas', 'foam-bullet', 'foam-bullets',
                       'taser', 'tase', 'wooden-bullet', 'rubber-bullet',
                       'marking-round', 'marking-rounds', 'paintball']
# NOTE(review): 'throw' appears in both EMPTY_HAND_HARD and LETHAL_FORCE — confirm which is intended.
LETHAL_FORCE = ['shoot', 'throw', 'gun', 'death', 'live-round', ]
# NOTE(review): the 'sponge-round' tag in the counts above is not assigned to any category — confirm with team.
UNCATEGORIZED = ['property-destruction', 'abuse-of-power', 'bike',
                 'inhumane-treatment', 'shield', 'vehicle', 'drive', 'horse',
                 'racial-profiling', 'spray', 'sexual-assault', ]
# UNCATEGORIZED are Potential Stop Words. Need to talk to team.

In [None]:
# Add one indicator column per use-of-force category, initialised to 0.
# Seeding them with integers (rather than copies of the 'date' column) avoids
# datetime placeholder values ending up mixed with the 0/1 flags written later,
# which is what turned these columns into object dtype.
for force_column in ['Verbalization', 'Empty_Hand_Soft', 'Empty_Hand_Hard',
                     'Less_Lethal_Methods', 'Lethal_Force', 'Uncategorized']:
    merged_dfs[force_column] = 0


merged_dfs  # Indicator columns exist but are all 0 until the continuum function runs

Unnamed: 0,date,links,id,city,state,lat,long,title,description,tags,Verbalization,Empty_Hand_Soft,Empty_Hand_Hard,Less_Lethal_Methods,Lethal_Force,Uncategorized
0,2020-05-26 00:00:00,[https://www.facebook.com/damicedsota.thespiri...,mn-minneapolis-28,Minneapolis,Minnesota,44.9413,-93.2626,Man has his gun confiscated in an open carry s...,Man encounters police arresting people open ca...,"[abuse-of-power, arrest]",2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00
1,2020-05-26 00:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",mn-minneapolis-21,Minneapolis,Minnesota,44.9479,-93.2349,Police hit press in neck and head with wooden ...,A group of cops start to approach a group of p...,"[baton, beat, strike]",2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00
2,2020-05-26 00:00:00,[https://www.facebook.com/1462345700/posts/102...,mn-minneapolis-14,Minneapolis,Minnesota,44.9481,-93.237,Police shoot flashbang grenades into crowd,Police on the rooftop of the 3rd precinct fire...,"[less-lethal, rubber-bullet, stun-grenade, tea...",2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00,2020-05-26 00:00:00
3,2020-05-26 07:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",6537b170-156b-11eb-9a05-43449b932225,Minneapolis,Minnesota,44.9778,-93.265,Police hit press in neck and head with wooden ...,,"[baton, beat, strike]",2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00
4,2020-05-26 07:00:00,[https://www.facebook.com/damicedsota.thespiri...,653ee2b0-156b-11eb-b242-f9e25f6eb571,Minneapolis,Minnesota,44.9778,-93.265,Man has his gun confiscated in an open carry s...,,"[abuse-of-power, arrest]",2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00,2020-05-26 07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2504,2020-11-04 08:00:00,[https://twitter.com/MarcusKulik/status/132425...,bf4ee070-1fb8-11eb-bae4-95bdb3987ef2,Seattle,Washington,47.6062,-122.332,Police assault and hospitalize protester,,"[arrest, punch, tackle]",2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00
2505,2020-11-04 08:00:00,[https://twitter.com/onelung_/status/132420651...,bf46b270-1fb8-11eb-b018-5fba53929c9e,Seattle,Washington,47.6062,-122.332,Police arrest bystander in vehicle,,"[arrest, bike, property-destruction]",2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00
2506,2020-11-04 08:00:00,[https://mobile.twitter.com/jovannithe1st/stat...,3baf70a0-1fb7-11eb-a425-3d2bf03262cb,Portland,Oregon,45.5051,-122.675,Protester tackled and arrested,,"[arrest, tackle]",2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00,2020-11-04 08:00:00
2507,2020-11-08 00:00:00,[https://twitter.com/AdamnCostelloTV/status/13...,or-portland-409,Portland,Oregon,45.4928,-122.673,DHS agents arrest & tear gas protesters over t...,"Shortly after midnight on November 8th, DHS ag...","[arrest, less-lethal, pepper-ball, projectile,...",2020-11-08 00:00:00,2020-11-08 00:00:00,2020-11-08 00:00:00,2020-11-08 00:00:00,2020-11-08 00:00:00,2020-11-08 00:00:00


In [None]:
def Searchfortarget(list, targetl):
    """
    Report whether any target occurs in a list of category terms.

    Every target is checked. Returning from inside the loop on the first
    target would mean only a row's first tag is ever examined.

    :list: collection of category terms to search in (the name shadows the
            builtin but is kept because it is part of the signature)
    :targetl: iterable of targets to look for (e.g. one row of tags)
    :return: 1 if at least one target is in `list`, otherwise 0 (including
            when `targetl` is empty)
    """
    return int(any(target in list for target in targetl))


def UseofForceContinuumtest(col):
    """
    Fill the six use-of-force indicator columns of the global `merged_dfs`.

    For every row of `col`, each category's term list is searched with
    `Searchfortarget` and the resulting flag is stored in that category's
    column. Each column is built as a full list and assigned in one step,
    because writing cell by cell through `merged_dfs[name].iloc[i] = ...` is
    chained assignment: it raises SettingWithCopyWarning and may write to a
    temporary copy instead of `merged_dfs`.

    :col: iterable of tag lists, one per row of `merged_dfs` and in the same
            order (pandas raises ValueError if the lengths differ)
    :return: None; `merged_dfs` is modified as a side effect
    """
    category_terms = {
        'Verbalization': VERBALIZATION,
        'Empty_Hand_Soft': EMPTY_HAND_SOFT,
        'Empty_Hand_Hard': EMPTY_HAND_HARD,
        'Less_Lethal_Methods': LESS_LETHAL_METHODS,
        'Lethal_Force': LETHAL_FORCE,
        'Uncategorized': UNCATEGORIZED,
    }

    # Materialise once so `col` is only iterated a single time
    rows = [row for row in col]

    for column_name, terms in category_terms.items():
        merged_dfs[column_name] = [Searchfortarget(terms, row) for row in rows]


"""Alternatively, this (below) is what is happening under the hood"""
# def UseofForceContinuum(col):

#     for i, row in enumerate(col):
#         # print("--------------")
#         # print(row, i)

#         merged_dfs['Verbalization'].iloc[i] = Searchfortarget(VERBALIZATION, row)
#         merged_dfs['Empty_Hand_Soft'].iloc[i] = Searchfortarget(EMPTY_HAND_SOFT, row)
#         merged_dfs['Empty_Hand_Hard'].iloc[i] = Searchfortarget(EMPTY_HAND_HARD, row)
#         merged_dfs['Less_Lethal_Methods'].iloc[i] = Searchfortarget(LESS_LETHAL_METHODS, row)
#         merged_dfs['Lethal_Force'].iloc[i] = Searchfortarget(LETHAL_FORCE, row)
#         merged_dfs['Uncategorized'].iloc[i] = Searchfortarget(UNCATEGORIZED, row)
#         # return merged_dfs

# UseofForceContinuum(merged_dfs['cleaned_words'])

# Fill the use-of-force indicator columns from the cleaned tags (now stored in the 'tags' column)
UseofForceContinuumtest(merged_dfs['tags'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


## The newly added columns are objects instead of integers.*****

In [None]:
# Save the data in one .csv file for all sources.

In [None]:
# Create a copy of the data so later work does not modify merged_dfs
cleaned_df = merged_dfs.copy()
cleaned_df
# Save the data in one pipe-delimited .csv file for all sources;
# the filename embeds `last_updated` from the 2020PB feed.
# cleaned_df.to_csv(f'Labs28_AllSources_Data{last_updated}.csv', sep="|",index=False)  # Uncomment to save.

#* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 

#Proceed to Labs28_D_Duplicate_LinkExperiment.ipynb

#* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 

In [None]:
# Display the final frame, including the six use-of-force indicator columns
cleaned_df

Unnamed: 0,date,links,id,city,state,lat,long,title,description,tags,Verbalization,Empty_Hand_Soft,Empty_Hand_Hard,Less_Lethal_Methods,Lethal_Force,Uncategorized
0,2020-05-26 00:00:00,[https://www.facebook.com/damicedsota.thespiri...,mn-minneapolis-28,Minneapolis,Minnesota,44.9413,-93.2626,Man has his gun confiscated in an open carry s...,Man encounters police arresting people open ca...,"[abuse-of-power, arrest]",0,0,0,0,0,1
1,2020-05-26 00:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",mn-minneapolis-21,Minneapolis,Minnesota,44.9479,-93.2349,Police hit press in neck and head with wooden ...,A group of cops start to approach a group of p...,"[baton, beat, strike]",0,0,0,1,0,0
2,2020-05-26 00:00:00,[https://www.facebook.com/1462345700/posts/102...,mn-minneapolis-14,Minneapolis,Minnesota,44.9481,-93.237,Police shoot flashbang grenades into crowd,Police on the rooftop of the 3rd precinct fire...,"[less-lethal, rubber-bullet, stun-grenade, tea...",0,0,0,1,0,0
3,2020-05-26 07:00:00,"[https://youtu.be/XAa5xb6JitI?t=5982, https://...",6537b170-156b-11eb-9a05-43449b932225,Minneapolis,Minnesota,44.9778,-93.265,Police hit press in neck and head with wooden ...,,"[baton, beat, strike]",0,0,0,1,0,0
4,2020-05-26 07:00:00,[https://www.facebook.com/damicedsota.thespiri...,653ee2b0-156b-11eb-b242-f9e25f6eb571,Minneapolis,Minnesota,44.9778,-93.265,Man has his gun confiscated in an open carry s...,,"[abuse-of-power, arrest]",0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2504,2020-11-04 08:00:00,[https://twitter.com/MarcusKulik/status/132425...,bf4ee070-1fb8-11eb-bae4-95bdb3987ef2,Seattle,Washington,47.6062,-122.332,Police assault and hospitalize protester,,"[arrest, punch, tackle]",0,1,0,0,0,0
2505,2020-11-04 08:00:00,[https://twitter.com/onelung_/status/132420651...,bf46b270-1fb8-11eb-b018-5fba53929c9e,Seattle,Washington,47.6062,-122.332,Police arrest bystander in vehicle,,"[arrest, bike, property-destruction]",0,1,0,0,0,0
2506,2020-11-04 08:00:00,[https://mobile.twitter.com/jovannithe1st/stat...,3baf70a0-1fb7-11eb-a425-3d2bf03262cb,Portland,Oregon,45.5051,-122.675,Protester tackled and arrested,,"[arrest, tackle]",0,1,0,0,0,0
2507,2020-11-08 00:00:00,[https://twitter.com/AdamnCostelloTV/status/13...,or-portland-409,Portland,Oregon,45.4928,-122.673,DHS agents arrest & tear gas protesters over t...,"Shortly after midnight on November 8th, DHS ag...","[arrest, less-lethal, pepper-ball, projectile,...",0,1,0,0,0,0
