# Police Traffic Stop Analysis

This notebook analyzes data from police traffic stops, including:
- Stop reasons (violations)
- Driver demographics
- Search conducted status
- Stop duration

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the raw traffic-stop records from the CSV that sits next to this notebook.
# A failed load is reported in plain language and then re-raised: every later
# cell needs `df`, so swallowing the error here would only resurface as a
# confusing NameError further down.
# NOTE(review): the recorded run loaded 65535 rows, i.e. 2**16 - 1 -- confirm the
# CSV was not truncated by a spreadsheet export.
try:
    df = pd.read_csv("Police_Dataset.csv")
except FileNotFoundError:
    print("Error: File not found. Please check the file path.")
    raise
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist here.
    print(f"Dataset loaded successfully with {df.shape[0]} stops recorded")

Dataset loaded successfully with 65535 stops recorded


## Initial Data Exploration

In [18]:
# Preview the first five records to sanity-check column names and value formats.
print("\n=== First 5 Rows ===")
display(df.head(n=5))


=== First 5 Rows ===


Unnamed: 0,stop_date,stop_time,country_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [19]:
# Report the frame's dimensions: number of rows (stops) and number of columns.
print("\n=== Dataset Shape ===")
print(f"Total stops: {len(df)}")
print(f"Features per stop: {len(df.columns)}")


=== Dataset Shape ===
Total stops: 65535
Features per stop: 15


In [20]:
# Count the missing entries in each column (isna is the same check as isnull).
print("\n=== Missing Values Summary ===")
print(df.isna().sum())


=== Missing Values Summary ===
stop_date                 0
stop_time                 0
country_name          65535
driver_gender          4061
driver_age_raw         4054
driver_age             4307
driver_race            4060
violation_raw          4060
violation              4060
search_conducted          0
search_type           63056
stop_outcome           4060
is_arrested            4060
stop_duration          4060
drugs_related_stop        0
dtype: int64


## Data Cleaning

In [21]:
# Work on an independent deep copy so the cleaning steps below never modify
# the raw `df` that was loaded and displayed above.
df_clean = df.copy(deep=True)

In [22]:
# Drop every column that carries no information, i.e. whose values are all missing.
# isna().all() yields one boolean per column; keep the labels flagged True.
cols_to_drop = [name for name, is_empty in df_clean.isna().all().items() if is_empty]
if not cols_to_drop:
    print("\nNo columns with all missing values found")
else:
    print(f"\nDropping columns with all missing values: {cols_to_drop}")
    # In-place so `df_clean` keeps referring to the same object.
    df_clean.drop(columns=cols_to_drop, inplace=True)


Dropping columns with all missing values: ['country_name']


## Data Analysis

In [23]:
# 1. Speeding violations by gender
# Reads from `df_clean` (the frame produced in the Data Cleaning section) rather
# than the raw `df`, so any cleaning step is reflected in the analysis.
print("\n=== Speeding Violations by Gender ===")
speeding = df_clean[df_clean['violation'] == 'Speeding']
# value_counts() skips missing genders by default, so only recorded M/F values
# are tallied; .get(..., 0) covers a gender that never appears among the stops.
gender_counts = speeding['driver_gender'].value_counts()
print(f"Speeding stops - Male: {gender_counts.get('M', 0)}, Female: {gender_counts.get('F', 0)}")


=== Speeding Violations by Gender ===
Speeding stops - Male: 25517, Female: 11686


In [24]:
# 2. Search rates by gender
# Reads from `df_clean` (the frame produced in the Data Cleaning section) rather
# than the raw `df`, so any cleaning step is reflected in the analysis.
print("\n=== Search Conducted by Gender ===")
# The mean of the search flag is the share of stops with a search
# (assumes `search_conducted` is boolean -- confirm the dtype).
# groupby leaves out rows whose driver_gender is missing (pandas default dropna=True).
search_rates = df_clean.groupby('driver_gender')['search_conducted'].mean()
print("Search rates:")
print(f"Male: {search_rates.get('M', 0):.2%}")
print(f"Female: {search_rates.get('F', 0):.2%}")


=== Search Conducted by Gender ===
Search rates:
Male: 4.68%
Female: 2.24%


In [25]:
# 3. Stop duration analysis
print("\n=== Stop Duration Analysis ===")
# Translate each duration bucket label into one representative number of minutes.
# NOTE(review): these are hand-picked stand-ins, not exact midpoints (the
# '16-30 Min' bucket maps to 24 and the open-ended '30+ Min' bucket to 45), so
# the reported average is an approximation.
duration_map = {'0-15 Min': 7.5, '16-30 Min': 24, '30+ Min': 45}
# Labels absent from the map, and missing durations, come out as NaN and are
# ignored by mean().
stop_minutes = df['stop_duration'].map(duration_map)
# NOTE(review): this adds a column to the raw `df`, not to `df_clean`.
df['stop_minutes'] = stop_minutes
print(f"Average stop duration: {stop_minutes.mean():.1f} minutes")


=== Stop Duration Analysis ===
Average stop duration: 12.2 minutes


In [26]:
# 4. Age distribution by violation type
# Reads from `df_clean` (the frame produced in the Data Cleaning section) rather
# than the raw `df`, so any cleaning step is reflected in the analysis.
print("\n=== Age Distribution by Violation Type ===")
# describe() gives count/mean/std/min/quartiles/max of driver_age per violation;
# `count` only includes rows where driver_age is present.
age_stats = df_clean.groupby('violation')['driver_age'].describe()
display(age_stats)


=== Age Distribution by Violation Type ===


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
violation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Equipment,6507.0,31.682957,11.380671,16.0,23.0,28.0,39.0,81.0
Moving violation,11876.0,36.736443,13.25835,15.0,25.0,35.0,47.0,86.0
Other,3477.0,40.362381,12.754423,16.0,30.0,41.0,50.0,86.0
Registration/plates,2240.0,32.656696,11.15078,16.0,24.0,30.0,40.0,74.0
Seat belt,3.0,30.333333,10.214369,23.0,24.5,26.0,34.0,42.0
Speeding,37120.0,33.262581,12.615781,15.0,23.0,30.0,42.0,88.0


In [27]:
# Additional analysis: Violation distribution
# Reads from `df_clean` (the frame produced in the Data Cleaning section) rather
# than the raw `df`, so any cleaning step is reflected in the analysis.
print("\n=== Violation Distribution ===")
# normalize=True returns each violation's share of the stops that have a
# recorded violation; rows with a missing violation are excluded by default.
display(df_clean['violation'].value_counts(normalize=True))


=== Violation Distribution ===


violation
Speeding               0.605189
Moving violation       0.193998
Equipment              0.105994
Other                  0.058284
Registration/plates    0.036486
Seat belt              0.000049
Name: proportion, dtype: float64