## Police Violence Data 

Notebook for adding geolocation data to the Police Brutality 2020 (PB2020) database, followed by some basic data exploration and visualization.

In [1]:
# imports
import sys
import pandas as pd
import numpy as np
import re
from google.colab import files

ModuleNotFoundError: No module named 'google'

### PB 2020 Data

[police violence incident repo](https://github.com/2020PB/police-brutality)


In [2]:
# police brutality data
# Source: the all-locations CSV on the data_build branch of the PB2020 repo.
PB_DATA_URL = 'https://raw.githubusercontent.com/2020PB/police-brutality/data_build/all-locations.csv'
pb_csv = pd.read_csv(PB_DATA_URL)
pb_df = pd.DataFrame(pb_csv)

In [3]:
# quick look at the raw PB2020 table: (rows, columns), then per-column
# non-null counts and dtypes
print(pb_df.shape)
pb_df.info()

(1116, 29)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1116 entries, 0 to 1115
Data columns (total 29 columns):
state        1116 non-null object
edit_at      1116 non-null object
city         1115 non-null object
name         1116 non-null object
date         1112 non-null object
date_text    1116 non-null object
id           1116 non-null object
Link 1       1116 non-null object
Link 2       651 non-null object
Link 3       376 non-null object
Link 4       224 non-null object
Link 5       140 non-null object
Link 6       97 non-null object
Link 7       62 non-null object
Link 8       51 non-null object
Link 9       33 non-null object
Link 10      28 non-null object
Link 11      24 non-null object
Link 12      19 non-null object
Link 13      14 non-null object
Link 14      12 non-null object
Link 15      10 non-null object
Link 16      8 non-null object
Link 17      7 non-null object
Link 18      7 non-null object
Link 19      7 non-null object
Link 20      6 non-null object
Lin

In [4]:
def clean_pb2020(df):
    '''
    function that takes PB2020 api data
    processes it according to our project needs
    outputs a cleaned df

    The input frame is left untouched; all work happens on a new frame.

    Processing steps
    ----------------
    - rename 'state' -> 'STATE_NAME', 'city' -> 'CITY', 'name' -> 'text'
    - drop the redundant 'date_text' column
    - normalise known bad city/state spellings and the matching 'id' slugs
    - drop rows without a city or a date, parse 'date' as datetime
    - add an upper-case 'state_code' column taken from the 'id' prefix
    - lower-case 'text', remove apostrophes, and replace every character
      outside a-z / A-Z with a space in both 'text' and 'CITY'

    Parameters
    ----------
    df : pandas.DataFrame
        Raw PB2020 table with at least the columns 'state', 'city', 'name',
        'date' and 'id'. An already-cleaned frame is accepted as well.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    '''
    # rename columns (location columns to loc standard and description column
    # to 'text'); no inplace=True, so the caller's frame keeps its columns
    df = df.rename(columns={'state': 'STATE_NAME',
                            'city': 'CITY', 'name': 'text'})
    # drop redundant date column (errors='ignore' keeps a re-run from failing)
    df = df.drop(columns='date_text', errors='ignore')

    # change substandard city and state names
    # regex= is always spelled out: the pandas default changed from True to
    # False in 2.0, which silently altered what these calls do
    df['CITY'] = df['CITY'].str.replace('DC', 'Washington', regex=False)
    df['STATE_NAME'] = df['STATE_NAME'].str.replace(
        'Washington DC', 'District of Columbia', case=False, regex=True)
    df['CITY'] = df['CITY'].str.replace(
        'Hungtington Beach', 'Huntington Beach', regex=False)

    # fix id to match city name
    id_fixes = {
        '-dc': '-washington',
        '-hungtingtonbeach': '-huntingtonbeach',
        '-costa-mesa': '-costamesa',
        '-newyorkcity': '-newyork',
    }
    df['id'] = df['id'].replace(id_fixes, regex=True)

    # drop NaNs (copy so the column assignments below act on an independent frame)
    df = df.dropna(subset=['CITY', 'date']).copy()
    # put date column in datetime (infer_datetime_format is deprecated and
    # no longer needed)
    df['date'] = pd.to_datetime(df['date'])

    # remove leading and trailing whitespace from columns
    df['CITY'] = df['CITY'].str.strip()
    df['STATE_NAME'] = df['STATE_NAME'].str.strip()

    # create column for state abbreviations: id prefix, as an upper-case str
    df['state_code'] = (
        df['id'].str.split('-').str[0].astype(str).str.upper()
    )

    # put description column into str and convert text to lowercase
    text = df['text'].astype(str).str.lower()
    # regex for nlp:
    # remove apostrophes first so contractions stay one token (don't -> dont)
    text = text.str.replace("'", '', regex=False)
    # replace anything that isn't in a-z / A-Z with a space
    df['text'] = text.str.replace(r'[^a-zA-Z]', ' ', regex=True)
    # NOTE(review): this also turns punctuation inside city names ('.', '-')
    # into spaces, which affects matching against the location table -
    # confirm that is intended
    df['CITY'] = df['CITY'].str.replace(r'[^a-zA-Z]', ' ', regex=True)

    return df


In [5]:
# clean the PB2020 table; pb_df is rebound to the frame returned by clean_pb2020
pb_df = clean_pb2020(pb_df)

In [6]:
# shape after cleaning, then the number of missing values in each column
print(pb_df.shape)
pb_df.isnull().sum()

(1111, 29)


STATE_NAME       0
edit_at          0
CITY             0
text             0
date             0
id               0
Link 1           0
Link 2         461
Link 3         735
Link 4         887
Link 5         971
Link 6        1014
Link 7        1049
Link 8        1060
Link 9        1078
Link 10       1083
Link 11       1087
Link 12       1092
Link 13       1097
Link 14       1099
Link 15       1101
Link 16       1103
Link 17       1104
Link 18       1104
Link 19       1104
Link 20       1105
Link 21       1108
Link 22       1110
state_code       0
dtype: int64

### Location Data

[US geolocations repo](https://github.com/kelvins/US-Cities-Database)

In [7]:
# location data
# NOTE(review): the recorded run of this cell failed with
# "HTTP Error 404: Not Found" - confirm the file's current branch/path in the
# upstream repository before relying on this URL.
LOC_DATA_URL = 'https://raw.githubusercontent.com/kelvins/US-Cities-Database/master/csv/us_cities.csv'
loc_csv = pd.read_csv(LOC_DATA_URL)
loc_df = pd.DataFrame(loc_csv)

HTTPError: HTTP Error 404: Not Found

In [8]:
# quick look at the raw location table: (rows, columns), then per-column
# non-null counts and dtypes
print(loc_df.shape)
loc_df.info()

NameError: name 'loc_df' is not defined

In [9]:
# number of missing values in each column of the location table
loc_df.isna().sum()

NameError: name 'loc_df' is not defined

In [10]:
# dtype of each location column
loc_df.dtypes

NameError: name 'loc_df' is not defined

In [11]:
def clean_loc(df):
    '''
    Clean the US cities location table so it can be merged with the
    incident data.

    The input frame is left untouched; a new frame is returned.

    Steps:
    - drop the redundant 'ID' column (skipped if it is already gone)
    - add the cities this project needs that the source table lacks
      (Ferguson, MO and Washington, DC)
    - keep one row per (STATE_CODE, CITY) pair; the first occurrence wins,
      so a row already present in the source table takes precedence over
      the manually added one

    Parameters
    ----------
    df : pandas.DataFrame
        Location table with the columns STATE_CODE, STATE_NAME, CITY, COUNTY,
        LATITUDE, LONGITUDE and, optionally, ID.

    Returns
    -------
    pandas.DataFrame
        De-duplicated location table with a fresh 0..n-1 index.
    '''
    # drop redundant id column in loc_df
    df = df.drop(columns='ID', errors='ignore')

    # add missing cities
    extra_cities = pd.DataFrame([
        {'STATE_CODE': 'MO', 'STATE_NAME': 'Missouri', 'CITY': 'Ferguson',
         'COUNTY': 'St. Louis', 'LATITUDE': 38.744167, 'LONGITUDE': -90.305278},
        # COUNTY used to be a copy-paste of Ferguson's 'St. Louis'
        {'STATE_CODE': 'DC', 'STATE_NAME': 'District of Columbia',
         'CITY': 'Washington', 'COUNTY': 'District of Columbia',
         'LATITUDE': 38.912217, 'LONGITUDE': -77.017691},
    ])
    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    df = pd.concat([df, extra_cities], ignore_index=True)

    # drop rows with the same city and state but different counties.
    # Done after the manual rows are added so that they can never introduce
    # a second row for a city (which would duplicate incidents in a merge)
    # and so that running the function twice gives the same result.
    df = df.drop_duplicates(subset=['STATE_CODE', 'CITY'], keep='first')
    return df.reset_index(drop=True)

In [12]:
# clean the location table; loc_df is rebound to the frame returned by clean_loc
loc_df = clean_loc(loc_df)

NameError: name 'loc_df' is not defined

In [13]:
# shape and first rows of the cleaned location table
print(loc_df.shape)
loc_df.head()

NameError: name 'loc_df' is not defined

In [14]:
# NOTE(review): identical to the previous cell - looks like a leftover that
# can be removed
print(loc_df.shape)
loc_df.head()

NameError: name 'loc_df' is not defined

### Merge and Save

In [15]:
# merge our two dfs
# Join on the shared location keys explicitly instead of relying on merge()
# picking up every column name the two frames happen to have in common.
# how='inner' keeps only incidents whose (STATE_NAME, CITY) pair exists in
# loc_df; use how='left' instead to keep unmatched incidents (their location
# columns will then be NaN).
incident_df = pb_df.merge(loc_df, how='inner', on=['STATE_NAME', 'CITY'])

NameError: name 'loc_df' is not defined

In [16]:
# shape and last rows of the merged incident table
print(incident_df.shape)
incident_df.tail()

NameError: name 'incident_df' is not defined

In [17]:
# double check for NaNs: number of missing values in each column of the
# merged table
incident_df.isnull().sum()

NameError: name 'incident_df' is not defined

In [18]:
# saving the dataframe
incident_df.to_csv('pv_incidents.csv')
# files.download comes from google.colab, which only exists on Colab.
# Anywhere else the CSV is simply left in the working directory instead of
# the cell failing after the file has already been written.
if 'google.colab' in sys.modules:
    files.download('pv_incidents.csv')

NameError: name 'incident_df' is not defined

### Data Exploration

In [19]:
import pandas_profiling

ModuleNotFoundError: No module named 'pandas_profiling'

In [20]:
pip uninstall -y pandas_profiling

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

Collecting https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
  Downloading https://github.com/pandas-profiling/pandas-profiling/archive/master.zip (34.6 MB)
[K     |████████████████████████████████| 34.6 MB 3.8 MB/s 
Collecting pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3
  Downloading pandas-1.1.2-cp37-cp37m-macosx_10_9_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 918 kB/s 
[?25hCollecting matplotlib>=3.2.0
  Downloading matplotlib-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 14.5 MB/s 
[?25hCollecting confuse>=1.0.0
  Downloading confuse-1.3.0-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.7 MB/s 
[?25hCollecting jinja2>=2.11.1
  Downloading Jinja2-2.11.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 16.3 MB/s 
[?25hCollecting visions[type_image_path]==0.5.0
  Downloading visions-0.5.0-py3-none-any.whl (64 kB)

In [22]:
# Check Pandas Profiling version
# `from pandas_profiling import ProfileReport` does not bind the name
# `pandas_profiling`, so import the package itself here rather than relying
# on an earlier cell having imported it successfully.
import pandas_profiling
from pandas_profiling import ProfileReport

pandas_profiling.__version__

ValueError: variadic positional parameters cannot have default values

In [23]:
# build a pandas-profiling report for the merged incident table and show it
# inside the notebook
profile = ProfileReport(incident_df)
profile.to_notebook_iframe()

NameError: name 'ProfileReport' is not defined