In [3]:
# Import the Pandas Library to the notebook
import pandas as pd

### Grouping and aggregate

In [4]:
# Import the police.csv file into df
df = pd.read_csv("../Python-DataFiles/Lesson 5/police.csv",                   
                  engine="python",
                  parse_dates=["stop_date"],
                  dayfirst=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91741 entries, 0 to 91740
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   stop_date           91741 non-null  datetime64[ns]
 1   stop_time           91741 non-null  object        
 2   county_name         0 non-null      float64       
 3   driver_gender       86406 non-null  object        
 4   driver_age_raw      86414 non-null  float64       
 5   driver_age          86120 non-null  float64       
 6   driver_race         86408 non-null  object        
 7   violation_raw       86408 non-null  object        
 8   violation           86408 non-null  object        
 9   search_conducted    91741 non-null  bool          
 10  search_type         3196 non-null   object        
 11  stop_outcome        86408 non-null  object        
 12  is_arrested         86408 non-null  object        
 13  ticket_amount       1082 non-null   float64   

In [5]:
# Create a new column called stop_year, which will contain only the year a vehicle
# was stopped by traffic police
df["stop_year"] = df["stop_date"].dt.year


In [6]:
# Add a column called stop_month that will contain the month the vehicle was
# stopped by traffic police
df["stop_month"] = df["stop_date"].dt.month


In [7]:
# Check that the columns have been added correctly and that they contain the
# correct information
df[["stop_date","stop_year","stop_month"]]

Unnamed: 0,stop_date,stop_year,stop_month
0,2005-01-02,2005,1
1,2005-01-18,2005,1
2,2005-01-23,2005,1
3,2005-02-20,2005,2
4,2005-03-14,2005,3
...,...,...,...
91736,2015-12-31,2015,12
91737,2015-12-31,2015,12
91738,2015-12-31,2015,12
91739,2015-12-31,2015,12


In [8]:
# Group the data by year, and save into a new object called g_year
g_year = df.groupby("stop_year")

In [9]:
# Use the g_year object and show the three years in which the most tickets were
# given, and the amount of tickets given each year
g_year.size().sort_values(ascending= False).head(3)


stop_year
2012    10970
2006    10639
2007     9476
dtype: int64

In [10]:
# How many traffic offenses were committed each year?
g_year.size()

stop_year
2005     2558
2006    10639
2007     9476
2008     8752
2009     7908
2010     7561
2011     8126
2012    10970
2013     7924
2014     9228
2015     8599
dtype: int64

In [11]:
# What is the average number of the tickets for that year (ticket_amount)?
g_year["ticket_amount"].mean()


stop_year
2005    233.333333
2006    225.960000
2007    201.565217
2008    200.785714
2009    219.866667
2010    201.263158
2011    218.774510
2012    219.386243
2013    224.380000
2014    207.813953
2015    219.875000
Name: ticket_amount, dtype: float64

In [12]:
# What is the total monetary amount of tickets for that year?
g_year["ticket_amount"].sum()

stop_year
2005     21000.0
2006      5649.0
2007      4636.0
2008      2811.0
2009      9894.0
2010      3824.0
2011     22315.0
2012    124392.0
2013     11219.0
2014      8936.0
2015     22867.0
Name: ticket_amount, dtype: float64

In [13]:
# Save the result from the operation of the previous section in an object named
# temp_df
temp_df = g_year.agg(
    count=('ticket_amount', 'size'),
    mean=('ticket_amount', 'mean'),
    sum=('ticket_amount', 'sum')
)
temp_df


Unnamed: 0_level_0,count,mean,sum
stop_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005,2558,233.333333,21000.0
2006,10639,225.96,5649.0
2007,9476,201.565217,4636.0
2008,8752,200.785714,2811.0
2009,7908,219.866667,9894.0
2010,7561,201.263158,3824.0
2011,8126,218.77451,22315.0
2012,10970,219.386243,124392.0
2013,7924,224.38,11219.0
2014,9228,207.813953,8936.0


In [14]:
# Look at the temp_df and answer the following questions:
# a. What is the total monetary amount of tickets issued in 2010?
temp_df.loc[2010,"sum"]

# b. What is the average monetary amount of the tickets given in 2005?
temp_df.loc[2005,"mean"]

# c. What is the total number of tickets issued in 2013?
temp_df.loc[2013,"count"]


7924

In [15]:
# Sort temp_df by the values in the sum column in descending order.
# Which year has the highest monetary total of tickets? ans: 2012
temp_df.sort_values(by='sum', ascending=False)


Unnamed: 0_level_0,count,mean,sum
stop_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012,10970,219.386243,124392.0
2015,8599,219.875,22867.0
2011,8126,218.77451,22315.0
2005,2558,233.333333,21000.0
2013,7924,224.38,11219.0
2009,7908,219.866667,9894.0
2014,9228,207.813953,8936.0
2006,10639,225.96,5649.0
2007,9476,201.565217,4636.0
2010,7561,201.263158,3824.0


In [16]:
# Sort temp_df by the values in the mean column in descending order.
# Which year has the lowest average amount of tickets? Ans: 2008
temp_df.sort_values(by="mean")

Unnamed: 0_level_0,count,mean,sum
stop_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008,8752,200.785714,2811.0
2010,7561,201.263158,3824.0
2007,9476,201.565217,4636.0
2014,9228,207.813953,8936.0
2011,8126,218.77451,22315.0
2012,10970,219.386243,124392.0
2009,7908,219.866667,9894.0
2015,8599,219.875,22867.0
2013,7924,224.38,11219.0
2006,10639,225.96,5649.0


In [20]:
# Use temp_df, and calculate what is the average of the sum of the tickets of all the
# months?
# Save the answer into a variable called mean_years_ticket
temp_df["mean"].mean()

215.72761784093572