In [1]:
# import libraries
import pandas as pd    # for analysis of data
import numpy as np    # for numerical operations

In [2]:
police = pd.read_csv(r"C:\Users\ashis\Downloads\Police_Data.csv")   # # read csv file using pandas

In [3]:
police.shape     # shows rows and columns

(65535, 15)

In [4]:
police.index   # get satar and stop index

RangeIndex(start=0, stop=65535, step=1)

In [5]:
police.columns   # get columns

Index(['stop_date', 'stop_time', 'country_name', 'driver_gender',
       'driver_age_raw', 'driver_age', 'driver_race', 'violation_raw',
       'violation', 'search_conducted', 'search_type', 'stop_outcome',
       'is_arrested', 'stop_duration', 'drugs_related_stop'],
      dtype='object')

In [6]:
police.dtypes   # show datatypes

stop_date              object
stop_time              object
country_name          float64
driver_gender          object
driver_age_raw        float64
driver_age            float64
driver_race            object
violation_raw          object
violation              object
search_conducted         bool
search_type            object
stop_outcome           object
is_arrested            object
stop_duration          object
drugs_related_stop       bool
dtype: object

In [7]:
police.nunique()   # show number of unique values in data

stop_date             2651
stop_time             1432
country_name             0
driver_gender            2
driver_age_raw          93
driver_age              73
driver_race              5
violation_raw           12
violation                6
search_conducted         2
search_type             23
stop_outcome             6
is_arrested              2
stop_duration            4
drugs_related_stop       2
dtype: int64

In [8]:
police.count()   # show count of values in variables

stop_date             65535
stop_time             65535
country_name              0
driver_gender         61474
driver_age_raw        61481
driver_age            61228
driver_race           61475
violation_raw         61475
violation             61475
search_conducted      65535
search_type            2479
stop_outcome          61475
is_arrested           61475
stop_duration         61475
drugs_related_stop    65535
dtype: int64

### Q1. Remove the column that only contains missing values

In [9]:
police.isnull().sum()    # show number of missing values in the data

stop_date                 0
stop_time                 0
country_name          65535
driver_gender          4061
driver_age_raw         4054
driver_age             4307
driver_race            4060
violation_raw          4060
violation              4060
search_conducted          0
search_type           63056
stop_outcome           4060
is_arrested            4060
stop_duration          4060
drugs_related_stop        0
dtype: int64

In [10]:
police.drop(columns = 'country_name', inplace = True)   # drop  'country_name' column

### Q2. For Speeding, were Men or Women stopped more often?   ( based on filtering or value_counts)

In [11]:
police.head(2)  # show 2 rows

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [12]:
police[police.violation == 'Speeding'].driver_gender.value_counts()   # filter with speeding then use value counts

M    25517
F    11686
Name: driver_gender, dtype: int64

Men count is more so men stopped more.

### Q3. Does gender affect who gets searched during a stop?  (Groupby)

In [13]:
police.groupby('driver_gender').search_conducted.sum()  # use groupby then use sum()

driver_gender
F     366
M    2113
Name: search_conducted, dtype: int64

In [14]:
police.search_conducted.value_counts()   # count values in variable

False    63056
True      2479
Name: search_conducted, dtype: int64

Searched conducted gets True 2479 out of which 2113 for men and 366 for female. 

### Q4. What is mean stop_duration ?             

In [15]:
police.head(5)  # 1st 5 rows

Unnamed: 0,stop_date,stop_time,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [16]:
police.stop_duration.value_counts()   # get value count of elements for variable 

0-15 Min     47379
16-30 Min    11448
30+ Min       2647
2                1
Name: stop_duration, dtype: int64

In [17]:
police['stop_duration'] = police['stop_duration'].map({'0-15 Min': 7.5, ' 16-30 Min': 24, '30+ Min': 45}) 
# convert stop duration column from object to integer by using map function 

In [18]:
police['stop_duration'].mean()   # fnding mean()

9.484218206532603

### Q5. Compare the age distributions for each violations 

In [19]:
police.groupby('violation').driver_age.describe()    # use groupby then use describe function to get age distributions

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
violation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Equipment,6507.0,31.682957,11.380671,16.0,23.0,28.0,39.0,81.0
Moving violation,11876.0,36.736443,13.25835,15.0,25.0,35.0,47.0,86.0
Other,3477.0,40.362381,12.754423,16.0,30.0,41.0,50.0,86.0
Registration/plates,2240.0,32.656696,11.15078,16.0,24.0,30.0,40.0,74.0
Seat belt,3.0,30.333333,10.214369,23.0,24.5,26.0,34.0,42.0
Speeding,37120.0,33.262581,12.615781,15.0,23.0,30.0,42.0,88.0
