# Preparing the data for analysis

You'll be working with a dataset of traffic stops by police officers that was collected by the Stanford Open Policing Project.

You'll be focusing on data from the state of Rhode Island.

## Stanford Open Policing Project dataset 

In [1]:
import pandas as pd

# Location of the police.csv extract.
# NOTE(review): this is an absolute path and the directory name starts with a
# space (' 23. ...'); kept unchanged because the saved output shows the read
# succeeding -- confirm before relocating the data.
POLICE_CSV = '/work/data_science_notes/ 23. Analyzing police activity with pandas/data/police.csv'

# Load the CSV into the DataFrame `ri` and preview its first 5 rows
ri = pd.read_csv(POLICE_CSV)
ri.head()

Unnamed: 0,state,stop_date,stop_time,county_name,driver_gender,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop,district
0,RI,2005-01-04,12:55,,M,White,Equipment/Inspection Violation,Equipment,False,,Citation,False,0-15 Min,False,Zone X4
1,RI,2005-01-23,23:15,,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone K3
2,RI,2005-02-17,04:15,,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X4
3,RI,2005-02-20,17:15,,M,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False,Zone X1
4,RI,2005-02-24,01:20,,F,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X3


In [2]:
# Count the missing (NaN) values in each column of ri

ri.isna().sum()

state                     0
stop_date                 0
stop_time                 0
county_name           91741
driver_gender          5205
driver_race            5202
violation_raw          5202
violation              5202
search_conducted          0
search_type           88434
stop_outcome           5202
is_arrested            5202
stop_duration          5202
drugs_related_stop        0
district                  0
dtype: int64

In [3]:
# Report the dimensions of ri as a (rows, columns) tuple

ri.shape

(91741, 15)

In [4]:
# Remove the county_name and state columns:
#   - county_name is missing in every row (91741 of 91741)
#   - every stop in ri is from Rhode Island, so state is constant
# The labels are passed via `columns=` because drop() removes rows by default.
ri = ri.drop(columns=['county_name', 'state'])

In [5]:
# Verify that ri now has two fewer columns (15 -> 13)
ri.shape

(91741, 13)

In [6]:
# Recount the missing values per column now that county_name and state are gone
ri.isna().sum()

stop_date                 0
stop_time                 0
driver_gender          5205
driver_race            5202
violation_raw          5202
violation              5202
search_conducted          0
search_type           88434
stop_outcome           5202
is_arrested            5202
stop_duration          5202
drugs_related_stop        0
district                  0
dtype: int64

In [7]:
# Drop all rows that are missing driver_gender,
# since driver gender will be an important factor in our analysis
ri.dropna(subset=['driver_gender'], inplace=True)

# Note: `subset` limits the missing-value check to the listed columns;
# without it, dropna() would drop rows with a missing value in any column

In [8]:
# Verify that none of the remaining rows are missing driver_gender
# (its missing-value count should now be 0)
ri.isna().sum()

stop_date                 0
stop_time                 0
driver_gender             0
driver_race               0
violation_raw             0
violation                 0
search_conducted          0
search_type           83229
stop_outcome              0
is_arrested               0
stop_duration             0
drugs_related_stop        0
district                  0
dtype: int64

In [9]:
# Find how many rows and columns remain after dropping the incomplete rows
ri.shape

(86536, 13)

## Using proper data types

In [10]:
# Examine the head of the is_arrested column to verify that it contains
# True and False values, and to check the column's current data type
ri['is_arrested'].head()

0    False
1    False
2    False
3     True
4    False
Name: is_arrested, dtype: object

In [11]:
# Cast is_arrested from object to bool.
# No missing values remain in this column at this point, which matters because
# astype(bool) would silently turn any NaN into True.
ri['is_arrested'] = ri['is_arrested'].astype(bool)

In [12]:
# Confirm the conversion: is_arrested should now report a bool dtype
ri['is_arrested'].dtype

dtype('bool')

## Creating a DatetimeIndex

In [13]:
# Join stop_date and stop_time element-wise with a single space between them
# (e.g. "2005-01-04 12:55") and store the resulting string Series in `combined`
combined = ri['stop_date'].str.cat(ri['stop_time'], sep=' ')

In [14]:
# Parse the combined "date time" strings as datetimes and attach the result
# to ri as a new column named stop_datetime
ri = ri.assign(stop_datetime=pd.to_datetime(combined))

In [15]:
# Inspect the dtype of every column to confirm that stop_datetime is a datetime column
ri.dtypes

stop_date                     object
stop_time                     object
driver_gender                 object
driver_race                   object
violation_raw                 object
violation                     object
search_conducted                bool
search_type                   object
stop_outcome                  object
is_arrested                     bool
stop_duration                 object
drugs_related_stop              bool
district                      object
stop_datetime         datetime64[ns]
dtype: object

In [16]:
# Promote stop_datetime to the DataFrame index;
# drop=True (the default) removes it from the regular columns
ri = ri.set_index(keys='stop_datetime', drop=True)

In [17]:
# Display the index to verify that it is now a DatetimeIndex
ri.index

DatetimeIndex(['2005-01-04 12:55:00', '2005-01-23 23:15:00',
               '2005-02-17 04:15:00', '2005-02-20 17:15:00',
               '2005-02-24 01:20:00', '2005-03-14 10:00:00',
               '2005-03-29 21:55:00', '2005-04-04 21:25:00',
               '2005-07-14 11:20:00', '2005-07-14 19:55:00',
               ...
               '2015-12-31 13:23:00', '2015-12-31 18:59:00',
               '2015-12-31 19:13:00', '2015-12-31 20:20:00',
               '2015-12-31 20:50:00', '2015-12-31 21:21:00',
               '2015-12-31 21:59:00', '2015-12-31 22:04:00',
               '2015-12-31 22:09:00', '2015-12-31 22:47:00'],
              dtype='datetime64[ns]', name='stop_datetime', length=86536, freq=None)

In [18]:
# List the column labels to confirm that stop_datetime is no longer one of them
ri.columns

Index(['stop_date', 'stop_time', 'driver_gender', 'driver_race',
       'violation_raw', 'violation', 'search_conducted', 'search_type',
       'stop_outcome', 'is_arrested', 'stop_duration', 'drugs_related_stop',
       'district'],
      dtype='object')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=dc001eea-22fe-4a27-852d-7fbece520334' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>