## Preparing the data for analysis

In [1]:
import pandas as pd

#Traffic stops in Rhode Island
ri = pd.read_csv('https://assets.datacamp.com/production/repositories/1497/datasets/62bd9feef451860db02d26553613a299721882e8/police.csv')

#Locating missing values and compare to shape of df
ri.isnull().sum()
ri.shape

#Dropping a column since there's no values
ri.drop('county_name', axis = 'columns', inplace=True)

#Dropping rows with no stop date or stop time
ri.dropna(subset=['stop_date','stop_time'],inplace=True)

#Checking and fixing data types
ri.dtypes

# Examine the head of the 'is_arrested' column
print(ri.is_arrested.head())

# Change the data type of 'is_arrested' to 'bool'
ri['is_arrested'] = ri.is_arrested.astype('bool')

# Check the data type of 'is_arrested' 
print(ri.is_arrested.head())

#Creating a DatetimeIndex
#Combining Stop_date and Stop_time (Object columns)
combined = ri.stop_date.str.cat(ri.stop_time, sep=' ')
ri['stop_datetime'] = pd.to_datetime(combined)
ri.set_index('stop_datetime', inplace=True)
ri.index #Verifying Index
ri.columns #Verify Columns


0    False
1    False
2    False
3     True
4    False
Name: is_arrested, dtype: object
0    False
1    False
2    False
3     True
4    False
Name: is_arrested, dtype: bool


Index(['state', 'stop_date', 'stop_time', 'driver_gender', 'driver_race',
       'violation_raw', 'violation', 'search_conducted', 'search_type',
       'stop_outcome', 'is_arrested', 'stop_duration', 'drugs_related_stop',
       'district'],
      dtype='object')

## Exploring the relationship between gender and policing

In [2]:
#Do races commit different violations
ri.stop_outcome.value_counts(normalize=True)
#Filtering DataFrame rows
white = ri[ri.drive_race =='white']
black = ri[ri.drive_race =='black']
hispanic = ri[ri.drive_race =='hispanic']
asian = ri[ri.drive_race =='asian']
#Comparing stop outcomes
white.stop_outcome.value_counts(normalize=True)
black.stop_outcome.value_counts(normalize=True)
hispanic.stop_outcome.value_counts(normalize=True)
asian.stop_outcome.value_counts(normalize=True)

#Do genders commit different violations
female = ri[ri.driver_gender == 'F']
male = ri[ri.driver_gender == 'M']
female.violation.value_counts(normalize=True)
male.violation.value_counts(normalize=True)

#Does gender affect who gets a ticket for speeding?
female_and_speeding = ri[(ri.driver_gender =='F') & (ri.stop_outcome =='Speeding')]
male_and_speeding = ri[(ri.driver_gender =='M') & (ri.stop_outcome =='Speeding')]
# Compute the stop outcomes for drivers (as proportions)
print(female_and_speeding.stop_outcome.value_counts(normalize=True))
print(male_and_speeding.stop_outcome.value_counts(normalize=True))

#Does gender affect whose vehicle is searched?
# Calculate the search rate for drivers
print(ri[(ri.driver_gender=='F')].search_conducted.mean())
print(ri[(ri.driver_gender=='M')].search_conducted.mean())
# Calculate the search rate for both groups simultaneously
print(ri.groupby(ri.driver_gender).search_conducted.mean())
# Calculate the search rate for each combination of gender and violation
print(ri.groupby(['driver_gender', 'violation']).search_conducted.mean())

#Does gender affect who is frisked during a search
# Count the 'search_type' values
print(ri.search_type.value_counts())
# Check if 'search_type' contains the string 'Protective Frisk'
ri['frisk'] = ri.search_type.str.contains('Protective Frisk', na=False)
# Check the data type of 'frisk'
print(ri.frisk.dtype)
# Take the sum of 'frisk'
print(ri.frisk.sum())
# Create a DataFrame of stops in which a search was conducted
searched = ri[ri.search_conducted == True]
# Calculate the overall frisk rate by taking the mean of 'frisk'
print(searched.frisk.mean())
# Calculate the frisk rate for each gender
print(searched.groupby('driver_gender').frisk.mean())


AttributeError: 'DataFrame' object has no attribute 'drive_race'

## Visual Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Does time of day affect arrest rate? (Single Variable over time)
# Calculate the overall arrest rate
print(ri.is_arrested.mean())
# Calculate the hourly arrest rate
print(ri.groupby(ri.index.hour).is_arrested.mean())
# Save the hourly arrest rate
hourly_arrest_rate = ri.groupby(ri.index.hour).is_arrested.mean()
# Create a line plot of 'hourly_arrest_rate'
hourly_arrest_rate.plot()
# Add the xlabel, ylabel, and title
plt.xlabel('Hour')
plt.ylabel('Arrest Rate')
plt.title('Arrest Rate by Time of Day')
# Display the plot
plt.show()

#Are drug-related stops on the rise? (Subplots for 2 variables over time)
#Change frequency of timeseries samples with resampling
# Calculate and save the annual rate of drug-related stops
annual_drug_rate = ri.drugs_related_stop.resample('A').mean()
# Calculate and save the annual search rate
annual_search_rate = ri.search_conducted.resample('A').mean()
# Concatenate 'annual_drug_rate' and 'annual_search_rate'
annual = pd.concat([annual_drug_rate, annual_search_rate], axis='columns')
# Create subplots from 'annual'
annual.plot(subplots=True)
# Display the subplots
plt.show()

#  What violations are caught in each district?
# Utilize frequency table with crosstab
# Frequency Table: Tally of how many times each combination of values occurs
# Create a frequency table of districts and violations
# Save the frequency table as 'all_zones'
all_zones = pd.crosstab(ri.district, ri.violation)
# Save the smaller table as 'k_zones'
k_zones = all_zones.loc['Zone K1':'Zone K3']
# Create a bar plot of 'k_zones'
k_zones.plot(kind='bar')
# Display the plot
plt.show()


#How often searches were done after each violation type?
search_rate = ri.groupby('violation').search_conducted.mean()
search_rate.sort_values().plot(kind='barh')
plt.show()

#How long might you be stopped for a violation?
#Translate strings into data to be evaluated numerically
# Create a dictionary that maps strings to integers
mapping = {'0-15 Min':8,'16-30 Min':23,'30+ Min':45}
# Convert the 'stop_duration' strings to integers using the 'mapping'
ri['stop_minutes'] = ri.stop_duration.map(mapping)
# Print the unique values in 'stop_minutes'
print(ri.stop_minutes.unique())
# Calculate the mean 'stop_minutes' for each value in 'violation_raw'
print(ri.groupby('violation_raw').stop_minutes.mean())
# Save the resulting Series as 'stop_length'
stop_length = ri.groupby('violation_raw').stop_minutes.mean()
# Sort 'stop_length' by its values and create a horizontal bar plot
stop_length.sort_values().plot(kind='barh')
# Display the plot
plt.show()


## Analyzing the effect of weather on policing
Data from NOAA National Centers for Environmental Information

In [None]:
import pandas as pd

# Read 'weather.csv' into a DataFrame named 'weather'
weather = pd.read_csv('https://assets.datacamp.com/production/repositories/1497/datasets/02f3fb2d4416d3f6626e1117688e0386784e8e55/weather.csv')

# Describe the temperature columns
print(weather[['TMIN','TAVG','TMAX']].describe())
# Create a box plot of the temperature columns
weather[['TMIN','TAVG','TMAX']].plot(kind='box')
# Display the plot
plt.show()

# Create a 'TDIFF' column that represents temperature difference
weather['TDIFF'] = weather['TMAX']-weather['TMIN']
# Describe the 'TDIFF' column
print(weather['TDIFF'].describe())
# Create a histogram with 20 bins to visualize 'TDIFF'
weather['TDIFF'].plot(kind='hist', bins=20)
# Display the plot
plt.show()

#Categorizing the weather
# Copy 'WT01' through 'WT22' to a new DataFrame
WT = weather.loc[:,'WT01':'WT22']
# Calculate the sum of each row in 'WT'
weather['bad_conditions'] = WT.sum(axis='columns')
# Replace missing values in 'bad_conditions' with '0'
weather['bad_conditions'] = weather.bad_conditions.fillna(0).astype('int')
# Create a histogram to visualize 'bad_conditions'
weather['bad_conditions'].plot(kind='hist')
# Display the plot
plt.show()

# Count the unique values in 'bad_conditions' and sort the index
print(weather.bad_conditions.value_counts().sort_index())
# Create a dictionary that maps integers to strings
mapping = {0:'good', 1:'bad', 2:'bad', 3:'bad',4:'bad',5:'worse',6:'worse',7:'worse',8:'worse',9:'worse'}
# Convert the 'bad_conditions' integers to strings using the 'mapping'
weather['rating'] = weather.bad_conditions.map(mapping)
# Count the unique values in 'rating'
print(weather['rating'].value_counts())

#Changing data type from object to category to save memory
#Inital check of memeory used
weather['rating'].memory_usage(deep=True)
# Create a list of weather ratings in logical order
cats = ['good','bad', 'worse']
# Change the data type of 'rating' to category
weather['rating'] = weather.rating.astype('category', ordered=True, categories=cats)
# Check memory improvements
wether['rating'].memory_usage(deep=True)

#Merging Datasets
# Reset the index of 'ri'
ri.reset_index(inplace=True)
# Examine the head of 'ri'
print(ri.head())
# Create a DataFrame from the 'DATE' and 'rating' columns
weather_rating = weather[['DATE','rating']]
# Examine the head of 'weather_rating'
print(weather_rating.head())
# Examine the shape of 'ri'
print(ri.shape)
# Merge 'ri' and 'weather_rating' using a left join
ri_weather = pd.merge(left=ri, right=weather_rating, left_on='stop_date', right_on='DATE', how='inner')
# Examine the shape of 'ri_weather'
print(ri_weather.shape)
# Set 'stop_datetime' as the index of 'ri_weather'
ri_weather.set_index('stop_datetime', inplace=True)

#Does Weather affect the arrest rate?
# Calculate the overall arrest rate
print(ri_weather.is_arrested.mean())
# Calculate the arrest rate for each 'rating'
print(ri_weather.groupby('rating').is_arrested.mean())
# Calculate the arrest rate for each 'violation' and 'rating'
print(ri_weather.groupby(['violation','rating']).is_arrested.mean())
# Save the output of the groupby operation from the last exercise
arrest_rate = ri_weather.groupby(['violation', 'rating']).is_arrested.mean()
# Print the 'arrest_rate' Series
print(arrest_rate)
# Print the arrest rate for moving violations in bad weather
print(arrest_rate.loc['Moving violation', 'bad'])
# Print the arrest rates for speeding violations in all three weather conditions
print(arrest_rate.loc['Speeding'])
# Unstack the 'arrest_rate' Series into a DataFrame
print(arrest_rate.unstack())
# Create the same DataFrame using a pivot table
print(ri_weather.pivot_table(index='violation', columns='driver_gender', values='is_arrested'))