# Preparing the data for analysis

## Stanford Open Policing Project dataset 

In [1]:
# Import a dataset of traffic stops in Rhode Island
import pandas as pd

ri = pd.read_csv('/work/data_science_notes/ Analyzing police activity with pandas/data/police.csv')

In [2]:
ri.head()

Unnamed: 0,state,stop_date,stop_time,county_name,driver_gender,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop,district
0,RI,2005-01-04,12:55,,M,White,Equipment/Inspection Violation,Equipment,False,,Citation,False,0-15 Min,False,Zone X4
1,RI,2005-01-23,23:15,,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone K3
2,RI,2005-02-17,04:15,,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X4
3,RI,2005-02-20,17:15,,M,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False,Zone X1
4,RI,2005-02-24,01:20,,F,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X3


In [3]:
ri.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91741 entries, 0 to 91740
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   state               91741 non-null  object 
 1   stop_date           91741 non-null  object 
 2   stop_time           91741 non-null  object 
 3   county_name         0 non-null      float64
 4   driver_gender       86536 non-null  object 
 5   driver_race         86539 non-null  object 
 6   violation_raw       86539 non-null  object 
 7   violation           86539 non-null  object 
 8   search_conducted    91741 non-null  bool   
 9   search_type         3307 non-null   object 
 10  stop_outcome        86539 non-null  object 
 11  is_arrested         86539 non-null  object 
 12  stop_duration       86539 non-null  object 
 13  drugs_related_stop  91741 non-null  bool   
 14  district            91741 non-null  object 
dtypes: bool(2), float64(1), object(12)
memory usage: 9.3+

In [4]:
ri.isnull().sum()

state                     0
stop_date                 0
stop_time                 0
county_name           91741
driver_gender          5205
driver_race            5202
violation_raw          5202
violation              5202
search_conducted          0
search_type           88434
stop_outcome           5202
is_arrested            5202
stop_duration          5202
drugs_related_stop        0
district                  0
dtype: int64

In [5]:
# Drop the county_name column because it only contains missing values.
# Drop the state column because all of the traffic stops took place in one state (Rhode Island).

ri.drop(['state', 'county_name'], axis='columns', inplace=True)
ri.head()

Unnamed: 0,stop_date,stop_time,driver_gender,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop,district
0,2005-01-04,12:55,M,White,Equipment/Inspection Violation,Equipment,False,,Citation,False,0-15 Min,False,Zone X4
1,2005-01-23,23:15,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone K3
2,2005-02-17,04:15,M,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X4
3,2005-02-20,17:15,M,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False,Zone X1
4,2005-02-24,01:20,F,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False,Zone X3


In [6]:
# During this course, the driver_gender column will be critical. 
# Because only a small fraction of rows are missing we'll drop them.
ri.dropna(subset=['driver_gender'], inplace=True)
ri.isnull().sum()

stop_date                 0
stop_time                 0
driver_gender             0
driver_race               0
violation_raw             0
violation                 0
search_conducted          0
search_type           83229
stop_outcome              0
is_arrested               0
stop_duration             0
drugs_related_stop        0
district                  0
dtype: int64

## Using proper data types

In [7]:
# Explore ri to determine which column's data type should be changed.
ri.dtypes

stop_date             object
stop_time             object
driver_gender         object
driver_race           object
violation_raw         object
violation             object
search_conducted        bool
search_type           object
stop_outcome          object
is_arrested           object
stop_duration         object
drugs_related_stop      bool
district              object
dtype: object

In [8]:
ri['is_arrested'].head()

0    False
1    False
2    False
3     True
4    False
Name: is_arrested, dtype: object

In [9]:
# Fixing the is_arrested column data type
ri['is_arrested'] == ri.is_arrested.astype('bool')

0        True
1        True
2        True
3        True
4        True
         ... 
91736    True
91737    True
91738    True
91739    True
91740    True
Name: is_arrested, Length: 86536, dtype: bool

## Creating a DatetimeIndex

In [10]:
# The date and time of each traffic stop are stored in separate object columns: stop_date and stop_time.
# Combine these two columns into a single column, and then convert it to datetime format. 

display(ri['stop_date'].head())
ri['stop_time'].head()

0    2005-01-04
1    2005-01-23
2    2005-02-17
3    2005-02-20
4    2005-02-24
Name: stop_date, dtype: object

0    12:55
1    23:15
2    04:15
3    17:15
4    01:20
Name: stop_time, dtype: object

In [11]:
ri['stop_datetime'] = ri['stop_date'].str.cat(' ' + ri['stop_time'])
ri['stop_datetime'].head()

0    2005-01-04 12:55
1    2005-01-23 23:15
2    2005-02-17 04:15
3    2005-02-20 17:15
4    2005-02-24 01:20
Name: stop_datetime, dtype: object

In [12]:
ri['stop_datetime'] = pd.to_datetime(ri['stop_datetime'])
ri.drop(['stop_date', 'stop_time'], axis='columns', inplace=True)

dtype('<M8[ns]')

In [14]:
ri['stop_datetime'].head()

0   2005-01-04 12:55:00
1   2005-01-23 23:15:00
2   2005-02-17 04:15:00
3   2005-02-20 17:15:00
4   2005-02-24 01:20:00
Name: stop_datetime, dtype: datetime64[ns]

In [17]:
# Set the stop_datetime column as the DataFrame's index. 
ri.set_index('stop_datetime', inplace=True)

In [18]:
ri.index

DatetimeIndex(['2005-01-04 12:55:00', '2005-01-23 23:15:00',
               '2005-02-17 04:15:00', '2005-02-20 17:15:00',
               '2005-02-24 01:20:00', '2005-03-14 10:00:00',
               '2005-03-29 21:55:00', '2005-04-04 21:25:00',
               '2005-07-14 11:20:00', '2005-07-14 19:55:00',
               ...
               '2015-12-31 13:23:00', '2015-12-31 18:59:00',
               '2015-12-31 19:13:00', '2015-12-31 20:20:00',
               '2015-12-31 20:50:00', '2015-12-31 21:21:00',
               '2015-12-31 21:59:00', '2015-12-31 22:04:00',
               '2015-12-31 22:09:00', '2015-12-31 22:47:00'],
              dtype='datetime64[ns]', name='stop_datetime', length=86536, freq=None)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=dc001eea-22fe-4a27-852d-7fbece520334' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>