# Task 13: Data manipulation with Pandas (indexing, selection, grouping)

# Dataset 1 --> data.csv

In [1]:
import pandas as pd
import numpy as np

### Load the CSV file into a DataFrame

In [2]:
file_path = 'data.csv'
df = pd.read_csv(file_path)

### Set 'Date' as the index

In [3]:
df.set_index('Date', inplace=True)
print("DataFrame with 'Date' as index:\n", df)

DataFrame with 'Date' as index:
               Duration  Pulse  Maxpulse  Calories
Date                                             
'2020/12/01'        60    110       130     409.1
'2020/12/02'        60    117       145     479.0
'2020/12/03'        60    103       135     340.0
'2020/12/04'        45    109       175     282.4
'2020/12/05'        45    117       148     406.0
'2020/12/06'        60    102       127     300.0
'2020/12/07'        60    110       136     374.0
'2020/12/08'       450    104       134     253.3
'2020/12/09'        30    109       133     195.1
'2020/12/10'        60     98       124     269.0
'2020/12/11'        60    103       147     329.3
'2020/12/12'        60    100       120     250.7
'2020/12/12'        60    100       120     250.7
'2020/12/13'        60    106       128     345.3
'2020/12/14'        60    104       132     379.3
'2020/12/15'        60     98       123     275.0
'2020/12/16'        60     98       120     215.2
'2020/12/17'     

### Display the first and last five rows

In [4]:
print("First five rows:\n", df.head())
print("Last five rows:\n", df.tail())

First five rows:
               Duration  Pulse  Maxpulse  Calories
Date                                             
'2020/12/01'        60    110       130     409.1
'2020/12/02'        60    117       145     479.0
'2020/12/03'        60    103       135     340.0
'2020/12/04'        45    109       175     282.4
'2020/12/05'        45    117       148     406.0
Last five rows:
               Duration  Pulse  Maxpulse  Calories
Date                                             
'2020/12/27'        60     92       118     241.0
'2020/12/28'        60    103       132       NaN
'2020/12/29'        60    100       132     280.0
'2020/12/30'        60    102       129     380.3
'2020/12/31'        60     92       115     243.0


### Set 'Duration' as the index

In [5]:
df.set_index('Duration', inplace=True)
print("DataFrame with 'Duration' as index:\n", df)

DataFrame with 'Duration' as index:
           Pulse  Maxpulse  Calories
Duration                           
60          110       130     409.1
60          117       145     479.0
60          103       135     340.0
45          109       175     282.4
45          117       148     406.0
60          102       127     300.0
60          110       136     374.0
450         104       134     253.3
30          109       133     195.1
60           98       124     269.0
60          103       147     329.3
60          100       120     250.7
60          100       120     250.7
60          106       128     345.3
60          104       132     379.3
60           98       123     275.0
60           98       120     215.2
60          100       120     300.0
45           90       112       NaN
60          103       123     323.0
45           97       125     243.0
60          108       131     364.2
45          100       119     282.0
60          130       101     300.0
45          105       132  

### Select and display 'Pulse' column

In [6]:
pulse_column = df['Pulse']
print("Pulse Column:\n", pulse_column)

Pulse Column:
 Duration
60     110
60     117
60     103
45     109
45     117
60     102
60     110
450    104
30     109
60      98
60     103
60     100
60     100
60     106
60     104
60      98
60      98
60     100
45      90
60     103
45      97
60     108
45     100
60     130
45     105
60     102
60     100
60      92
60     103
60     100
60     102
60      92
Name: Pulse, dtype: int64


### Select and display 'Pulse' and 'Calories' columns

In [7]:
pulse_calories_columns = df[['Pulse', 'Calories']]
print("Pulse and Calories Columns:\n", pulse_calories_columns)

Pulse and Calories Columns:
           Pulse  Calories
Duration                 
60          110     409.1
60          117     479.0
60          103     340.0
45          109     282.4
45          117     406.0
60          102     300.0
60          110     374.0
450         104     253.3
30          109     195.1
60           98     269.0
60          103     329.3
60          100     250.7
60          100     250.7
60          106     345.3
60          104     379.3
60           98     275.0
60           98     215.2
60          100     300.0
45           90       NaN
60          103     323.0
45           97     243.0
60          108     364.2
45          100     282.0
60          130     300.0
45          105     246.0
60          102     334.5
60          100     250.0
60           92     241.0
60          103       NaN
60          100     280.0
60          102     380.3
60           92     243.0


### Select a subset of rows and columns using .iloc

In [8]:
subset_iloc = df.iloc[:3, [1, 2]]
print("Subset of Rows and Columns using .iloc:\n", subset_iloc)

Subset of Rows and Columns using .iloc:
           Maxpulse  Calories
Duration                    
60             130     409.1
60             145     479.0
60             135     340.0


### Filter rows where 'Calories' > 400

In [9]:

filtered_calories = df[df['Calories'] > 400]
print("Filtered Rows with Calories > 400:\n", filtered_calories)

Filtered Rows with Calories > 400:
           Pulse  Maxpulse  Calories
Duration                           
60          110       130     409.1
60          117       145     479.0
45          117       148     406.0


### Filter rows where 'Pulse' > 100 and 'Maxpulse' < 150

In [10]:
filtered_pulse_maxpulse = df[(df['Pulse'] > 100) & (df['Maxpulse'] < 150)]
print("Filtered Rows with Pulse > 100 and Maxpulse < 150:\n", filtered_pulse_maxpulse)

Filtered Rows with Pulse > 100 and Maxpulse < 150:
           Pulse  Maxpulse  Calories
Duration                           
60          110       130     409.1
60          117       145     479.0
60          103       135     340.0
45          117       148     406.0
60          102       127     300.0
60          110       136     374.0
450         104       134     253.3
30          109       133     195.1
60          103       147     329.3
60          106       128     345.3
60          104       132     379.3
60          103       123     323.0
60          108       131     364.2
60          130       101     300.0
45          105       132     246.0
60          102       126     334.5
60          103       132       NaN
60          102       129     380.3


### Use query method to filter rows where 'Calories' > 400

In [11]:
query_filtered = df.query('Calories > 400')
print("Filtered Rows using query method:\n", query_filtered)

Filtered Rows using query method:
           Pulse  Maxpulse  Calories
Duration                           
60          110       130     409.1
60          117       145     479.0
45          117       148     406.0


### Use isin to filter rows where 'Pulse' is in a list of values

In [12]:
pulse_values_list = [100, 110, 120]
filtered_isin = df[df['Pulse'].isin(pulse_values_list)]
print("Filtered Rows using isin for Pulse:\n", filtered_isin)

Filtered Rows using isin for Pulse:
           Pulse  Maxpulse  Calories
Duration                           
60          110       130     409.1
60          110       136     374.0
60          100       120     250.7
60          100       120     250.7
60          100       120     300.0
45          100       119     282.0
60          100       120     250.0
60          100       132     280.0


### Group by 'Duration' and calculate mean for each group

In [13]:
grouped_mean_duration = df.groupby('Duration').mean()
print("Grouped by Duration and Mean:\n", grouped_mean_duration)

Grouped by Duration and Mean:
             Pulse    Maxpulse   Calories
Duration                                
30        109.000  133.000000  195.10000
45        103.000  135.166667  291.88000
60        103.375  126.416667  314.46087
450       104.000  134.000000  253.30000


### Group by 'Duration' and 'Pulse' and calculate sum for each group

In [14]:
grouped_sum_duration_pulse = df.groupby(['Duration', 'Pulse']).sum()
print("Grouped by Duration and Pulse and Sum:\n", grouped_sum_duration_pulse)

Grouped by Duration and Pulse and Sum:
                 Maxpulse  Calories
Duration Pulse                    
30       109         133     195.1
45       90          112       0.0
         97          125     243.0
         100         119     282.0
         105         132     246.0
         109         175     282.4
         117         148     406.0
60       92          233     484.0
         98          367     759.2
         100         612    1331.4
         102         382    1014.8
         103         537     992.3
         104         132     379.3
         106         128     345.3
         108         131     364.2
         110         266     783.1
         117         145     479.0
         130         101     300.0
450      104         134     253.3


### Use agg method to apply multiple aggregation functions

In [15]:
aggregated = df.groupby('Duration').agg({
    'Pulse': ['sum', 'mean'],
    'Maxpulse': 'max',
    'Calories': 'mean'
})
print("Aggregated Data:\n", aggregated)

Aggregated Data:
          Pulse          Maxpulse   Calories
           sum     mean      max       mean
Duration                                   
30         109  109.000      133  195.10000
45         618  103.000      175  291.88000
60        2481  103.375      147  314.46087
450        104  104.000      134  253.30000


### Calculate the size of each group based on 'Duration'

In [16]:
group_size = df.groupby('Duration').size()
print("Size of Each Group:\n", group_size)

Size of Each Group:
 Duration
30      1
45      6
60     24
450     1
dtype: int64


### Select specific columns and rename them

In [17]:
renamed_columns = df[['Pulse', 'Maxpulse']].rename(columns={
    'Pulse': 'HeartRate',
    'Maxpulse': 'PeakHeartRate'
})
print("Selected and Renamed Columns:\n", renamed_columns)

Selected and Renamed Columns:
           HeartRate  PeakHeartRate
Duration                          
60              110            130
60              117            145
60              103            135
45              109            175
45              117            148
60              102            127
60              110            136
450             104            134
30              109            133
60               98            124
60              103            147
60              100            120
60              100            120
60              106            128
60              104            132
60               98            123
60               98            120
60              100            120
45               90            112
60              103            123
45               97            125
60              108            131
45              100            119
60              130            101
45              105            132
60              102     

# Dataset 2 --> output.csv

### Load the CSV file into a DataFrame

In [18]:
file_path = 'output.csv'
df = pd.read_csv(file_path)

### Display the first and last five rows

In [19]:
print("First five rows:\n", df.head())
print("Last five rows:\n", df.tail())

First five rows:
    Feature1  Feature2  Feature3 Feature4  Feature5  Feature6  Feature7  \
0      44.0  0.797417 -0.958091      NaN       NaN     158.0  1.490148   
1      47.0  0.026838 -0.244178        D       NaN       NaN  2.151535   
2      64.0  0.199990 -1.014799        B       0.0     474.0  1.410613   
3      67.0  0.474124  0.448040        B       NaN     166.0  0.635091   
4       NaN  0.598516  0.087839      NaN       0.0       NaN  0.392104   

   Feature8  Feature9  Feature10  Feature11 Feature12  Feature13  Feature14  \
0  5.166124  2.277942        NaN          5         X          1  Category1   
1  0.450312 -3.395356        NaN          3         X          0  Category1   
2  6.786555 -3.070746        NaN          7         Z          0  Category1   
3  1.937158 -0.246269   6.004975         20         X          0  Category1   
4  4.010937  3.215086   2.891199          3         Z          0  Category1   

  Feature15  Feature16  Feature17 Feature18 Feature19 Feature2

### Set 'Feature4' as the index

In [20]:
df.set_index('Feature4', inplace=True)
print("DataFrame with 'Feature4' as index:\n", df)

DataFrame with 'Feature4' as index:
           Feature1  Feature2  Feature3  Feature5  Feature6  Feature7  \
Feature4                                                               
NaN           44.0  0.797417 -0.958091       NaN     158.0  1.490148   
D             47.0  0.026838 -0.244178       NaN       NaN  2.151535   
B             64.0  0.199990 -1.014799       0.0     474.0  1.410613   
B             67.0  0.474124  0.448040       NaN     166.0  0.635091   
NaN            NaN  0.598516  0.087839       0.0       NaN  0.392104   
...            ...       ...       ...       ...       ...       ...   
C              NaN  0.819440  0.415931       0.0       NaN  1.765590   
C              NaN  0.032140 -1.445420       1.0       NaN  0.806246   
C             92.0  0.263300  0.986722       NaN      78.0  1.326853   
A             91.0  0.640986 -1.199847       0.0      51.0  0.312256   
NaN            NaN  0.901570  1.836382       0.0       NaN  2.087461   

          Feature8  Featur

### Select and display 'Feature2' column

In [21]:
feature2_column = df['Feature2']
print("Feature2 Column:\n", feature2_column)

Feature2 Column:
 Feature4
NaN    0.797417
D      0.026838
B      0.199990
B      0.474124
NaN    0.598516
         ...   
C      0.819440
C      0.032140
C      0.263300
A      0.640986
NaN    0.901570
Name: Feature2, Length: 5000, dtype: float64


### Select and display 'Feature2' and 'Feature5' columns

In [22]:
feature2_5_columns = df[['Feature2', 'Feature5']]
print("Feature2 and Feature5 Columns:\n", feature2_5_columns)

Feature2 and Feature5 Columns:
           Feature2  Feature5
Feature4                    
NaN       0.797417       NaN
D         0.026838       NaN
B         0.199990       0.0
B         0.474124       NaN
NaN       0.598516       0.0
...            ...       ...
C         0.819440       0.0
C         0.032140       1.0
C         0.263300       NaN
A         0.640986       0.0
NaN       0.901570       0.0

[5000 rows x 2 columns]


### Select a subset of rows using .loc (using actual index labels)

In [23]:
subset_loc = df.loc[['A', 'B']]
print("Subset of Rows using .loc:\n", subset_loc)

Subset of Rows using .loc:
           Feature1  Feature2  Feature3  Feature5    Feature6  Feature7  \
Feature4                                                                 
A              NaN  0.087169 -0.293628       0.0  587.000000  2.428588   
A              NaN  0.204467 -0.988010       NaN         NaN  3.601679   
A              NaN  1.119411  0.899467       0.0  156.505327  0.367481   
A              NaN  0.689365  0.046061       1.0  818.000000  0.384104   
A             88.0  0.113432  0.255512       NaN  552.000000  0.709581   
...            ...       ...       ...       ...         ...       ...   
B              NaN  0.618159  0.512165       0.0  923.000000  2.090462   
B              NaN  0.549802 -0.185693       0.0         NaN  0.969918   
B             58.0  0.613667  1.505314       0.0         NaN  1.333325   
B              NaN  0.245673  1.273752       0.0  133.000000  1.245834   
B              3.0  0.816137 -0.204571       0.0  531.000000  4.383736   

         

### Select a subset of rows and columns using .iloc

In [24]:
subset_iloc = df.iloc[:5, [1, 2, 5]]
print("Subset of Rows and Columns using .iloc:\n", subset_iloc)

Subset of Rows and Columns using .iloc:
           Feature2  Feature3  Feature7
Feature4                              
NaN       0.797417 -0.958091  1.490148
D         0.026838 -0.244178  2.151535
B         0.199990 -1.014799  1.410613
B         0.474124  0.448040  0.635091
NaN       0.598516  0.087839  0.392104


### Filter rows where 'Feature5' > 50

In [25]:
filtered_feature5 = df[df['Feature5'] > 50]
print("Filtered Rows with Feature5 > 50:\n", filtered_feature5)

Filtered Rows with Feature5 > 50:
 Empty DataFrame
Columns: [Feature1, Feature2, Feature3, Feature5, Feature6, Feature7, Feature8, Feature9, Feature10, Feature11, Feature12, Feature13, Feature14, Feature15, Feature16, Feature17, Feature18, Feature19, Feature20]
Index: []


### Filter rows where 'Feature2' > 20 and 'Feature3' < 100

In [26]:
filtered_feature2_3 = df[(df['Feature2'] > 20) & (df['Feature3'] < 100)]
print("Filtered Rows with Feature2 > 20 and Feature3 < 100:\n", filtered_feature2_3)

Filtered Rows with Feature2 > 20 and Feature3 < 100:
 Empty DataFrame
Columns: [Feature1, Feature2, Feature3, Feature5, Feature6, Feature7, Feature8, Feature9, Feature10, Feature11, Feature12, Feature13, Feature14, Feature15, Feature16, Feature17, Feature18, Feature19, Feature20]
Index: []


### Use query method to filter rows where 'Feature7' > 150

In [27]:
query_filtered = df.query('Feature7 > 150')
print("Filtered Rows using query method:\n", query_filtered)

Filtered Rows using query method:
 Empty DataFrame
Columns: [Feature1, Feature2, Feature3, Feature5, Feature6, Feature7, Feature8, Feature9, Feature10, Feature11, Feature12, Feature13, Feature14, Feature15, Feature16, Feature17, Feature18, Feature19, Feature20]
Index: []


### Use isin to filter rows where 'Feature12' is in a list of values

In [28]:
feature12_values_list = ['value1', 'value2']
filtered_isin = df[df['Feature12'].isin(feature12_values_list)]
print("Filtered Rows using isin for Feature12:\n", filtered_isin)

Filtered Rows using isin for Feature12:
 Empty DataFrame
Columns: [Feature1, Feature2, Feature3, Feature5, Feature6, Feature7, Feature8, Feature9, Feature10, Feature11, Feature12, Feature13, Feature14, Feature15, Feature16, Feature17, Feature18, Feature19, Feature20]
Index: []


### Group by 'Feature1' and 'Feature2' and calculate sum for each group

In [29]:
grouped_sum_feature1_2 = df.groupby(['Feature1', 'Feature2']).sum()
print("Grouped by Feature1 and Feature2 and Sum:\n", grouped_sum_feature1_2)

Grouped by Feature1 and Feature2 and Sum:
                     Feature3  Feature5    Feature6  Feature7   Feature8  \
Feature1 Feature2                                                         
0.0      -0.190275  1.024529       0.0  112.684295  0.812273   0.656600   
          0.058606 -0.928581       0.0    0.000000  0.511888   4.098720   
          0.106766 -1.490766       0.0    0.000000  2.308925  10.002186   
          0.162696  1.883988       1.0  358.000000  1.035700   7.842196   
          0.194193 -0.608219       0.0  726.000000  0.564594   6.532264   
...                      ...       ...         ...       ...        ...   
99.0      0.806087  0.673348       0.0  847.528266  2.311998   3.654311   
          0.807822  0.764659       0.0  352.000000  1.158098   2.892907   
          0.941835 -1.249558       0.0    0.000000  0.397433   1.313112   
          0.990978 -1.648870       0.0    0.000000  0.237551   4.386257   
          1.252842 -0.076669       0.0   -5.590177  2.946

### Use agg method to apply multiple aggregation functions

In [30]:
aggregated = df.groupby('Feature1').agg({
    'Feature2': ['sum', 'mean'],
    'Feature3': 'max',
    'Feature5': 'mean'
})
print("Aggregated Data:\n", aggregated)

Aggregated Data:
            Feature2            Feature3  Feature5
                sum      mean       max      mean
Feature1                                         
0.0       18.871881  0.571875  2.020432  0.166667
1.0        9.733999  0.608375  1.428359  0.000000
2.0       10.301820  0.447905  1.551123  0.157895
3.0       16.557802  0.551927  2.490175  0.043478
4.0       11.337844  0.419920  1.507062  0.000000
...             ...       ...       ...       ...
95.0       7.755779  0.484736  1.378524  0.000000
96.0      10.133454  0.633341  1.462408  0.125000
97.0      10.398584  0.547294  2.120759  0.000000
98.0      12.186794  0.487472  1.589750  0.000000
99.0      10.063578  0.503179  1.442582  0.000000

[100 rows x 4 columns]


### Calculate the size of each group based on 'Feature1'

In [31]:
group_size = df.groupby('Feature1').size()
print("Size of Each Group:\n", group_size)

Size of Each Group:
 Feature1
0.0     33
1.0     16
2.0     23
3.0     30
4.0     27
        ..
95.0    16
96.0    16
97.0    19
98.0    25
99.0    20
Length: 100, dtype: int64


### Select specific columns and rename them

In [32]:
renamed_columns = df[['Feature2', 'Feature3']].rename(columns={
    'Feature2': 'NewFeature2',
    'Feature3': 'NewFeature3'
})
print("Selected and Renamed Columns:\n", renamed_columns)

Selected and Renamed Columns:
           NewFeature2  NewFeature3
Feature4                          
NaN          0.797417    -0.958091
D            0.026838    -0.244178
B            0.199990    -1.014799
B            0.474124     0.448040
NaN          0.598516     0.087839
...               ...          ...
C            0.819440     0.415931
C            0.032140    -1.445420
C            0.263300     0.986722
A            0.640986    -1.199847
NaN          0.901570     1.836382

[5000 rows x 2 columns]


# Dataset 3 --> police.csv

### Load the CSV file into a DataFrame

In [33]:
file_path = 'police.csv'
df = pd.read_csv(file_path)

### Function to safely convert dates

In [34]:
def safe_date_conversion(date_str):
    for fmt in ("%d/%m/%Y", "%d-%m-%Y", "%Y-%d-%m", "%m/%d/%Y"):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except (ValueError, TypeError):
            continue
    return pd.NaT  # If all conversions fail, return NaT

### Apply the function to convert dates

In [35]:
df['stop_date'] = df['stop_date'].apply(safe_date_conversion)

### Remove rows where date conversion failed

In [36]:
df.dropna(subset=['stop_date'], inplace=True)

### Ensure 'stop_date' is in datetime format and set as index

In [37]:
df.set_index('stop_date', inplace=True)

### Display the first and last five rows of the DataFrame

In [38]:
print("First five rows:\n", df.head())
print("Last five rows:\n", df.tail())

First five rows:
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-02-01     01:55          NaN             M          1985.0        20.0   
2005-01-04     17:30          NaN             M          1969.0        36.0   
2005-06-06     13:20          NaN             F          1986.0        19.0   
2005-07-08     16:50          NaN             M          1953.0        52.0   
2005-08-08     02:09          NaN             M          1955.0        50.0   

           driver_race            violation_raw         violation  \
stop_date                                                           
2005-02-01       White                 Speeding          Speeding   
2005-01-04       White                 Speeding          Speeding   
2005-06-06       White                 Speeding          Speeding   
2005-07-08       White  Other Traffic Violation  Moving violation   
2005-08-08    

In [39]:
# Select and display 'violation' column
violation_column = df['violation']
print("Violation Column:\n", violation_column)

Violation Column:
 stop_date
2005-02-01            Speeding
2005-01-04            Speeding
2005-06-06            Speeding
2005-07-08    Moving violation
2005-08-08    Moving violation
                    ...       
2015-12-12    Moving violation
2015-12-12            Speeding
2015-12-12           Equipment
2015-12-12            Speeding
2015-12-12            Speeding
Name: violation, Length: 36585, dtype: object


In [40]:
# Select and display 'driver_gender' and 'stop_outcome' columns
selected_columns = df[['driver_gender', 'stop_outcome']]
print("Selected Columns (driver_gender and stop_outcome):\n", selected_columns)

Selected Columns (driver_gender and stop_outcome):
            driver_gender   stop_outcome
stop_date                              
2005-02-01             M       Citation
2005-01-04             M       Citation
2005-06-06             F       Citation
2005-07-08             M       Citation
2005-08-08             M  Arrest Driver
...                  ...            ...
2015-12-12             M       Citation
2015-12-12             M       Citation
2015-12-12             F       Citation
2015-12-12             F       Citation

[36585 rows x 2 columns]


In [41]:
# Select a subset of rows using .loc
# Use dates that are present in the dataset
available_dates = df.index.unique()[:3].strftime('%Y-%m-%d').tolist()
subset_loc = df.loc[available_dates]
print("Subset of Rows using .loc:\n", subset_loc)

Subset of Rows using .loc:
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-02-01     01:55          NaN             M          1985.0        20.0   
2005-01-04     17:30          NaN             M          1969.0        36.0   
2005-06-06     13:20          NaN             F          1986.0        19.0   

           driver_race violation_raw violation  search_conducted search_type  \
stop_date                                                                      
2005-02-01       White      Speeding  Speeding             False         NaN   
2005-01-04       White      Speeding  Speeding             False         NaN   
2005-06-06       White      Speeding  Speeding             False         NaN   

           stop_outcome is_arrested stop_duration  drugs_related_stop  
stop_date                                                              
2005-02-01     Citation       F

### Select a subset of rows and columns using .iloc

In [42]:
subset_iloc = df.iloc[:3, [1, 3, 5]]
print("Subset of Rows and Columns using .iloc:\n", subset_iloc)

Subset of Rows and Columns using .iloc:
             county_name  driver_age_raw driver_race
stop_date                                          
2005-02-01          NaN          1985.0       White
2005-01-04          NaN          1969.0       White
2005-06-06          NaN          1986.0       White


### Filter rows where 'driver_gender' is 'F'

In [43]:
filtered_gender = df[df['driver_gender'] == 'F']
print("Filtered Rows with driver_gender == 'F':\n", filtered_gender)

Filtered Rows with driver_gender == 'F':
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-06-06     13:20          NaN             F          1986.0        19.0   
2005-01-10     17:05          NaN             F          1960.0        45.0   
2005-01-10     19:15          NaN             F          1981.0        24.0   
2005-01-10     23:10          NaN             F          1986.0        19.0   
2005-02-10     00:00          NaN             F          1982.0        23.0   
...              ...          ...           ...             ...         ...   
2015-12-12     11:48          NaN             F          1995.0        20.0   
2015-12-12     20:18          NaN             F          1989.0        26.0   
2015-12-12     21:44          NaN             F          1995.0        20.0   
2015-12-12     23:33          NaN             F          1996.0        19.0   
2015-12-12

### Group by 'violation' and calculate the mean of each group

In [44]:
grouped_mean_violation = df.groupby('violation').mean(numeric_only=True)
print("Grouped by Violation and Mean:\n", grouped_mean_violation)

Grouped by Violation and Mean:
                      county_name  driver_age_raw  driver_age  \
violation                                                      
Equipment                    NaN     1978.169895   31.979947   
Moving violation             NaN     1967.321821   35.917289   
Other                        NaN     1921.379392   39.584352   
Registration/plates          NaN     1976.485502   32.746835   
Seat belt                    NaN     1982.384468   31.755513   
Speeding                     NaN     1974.054087   33.474198   

                     search_conducted  drugs_related_stop  
violation                                                  
Equipment                    0.068121            0.020036  
Moving violation             0.059935            0.017965  
Other                        0.047647            0.005956  
Registration/plates          0.092937            0.012639  
Seat belt                    0.037392            0.019175  
Speeding                     0.0204

### Group by 'driver_gender' and 'violation' and calculate the sum of each group

In [45]:
grouped_sum_gender_violation = df.groupby(['driver_gender', 'violation']).sum(numeric_only=True)
print("Grouped by driver_gender and violation and Sum:\n", grouped_sum_gender_violation)

Grouped by driver_gender and violation and Sum:
                                    county_name  driver_age_raw  driver_age  \
driver_gender violation                                                      
F             Equipment                    0.0       2058838.0     33339.0   
              Moving violation             0.0       2575808.0     44436.0   
              Other                        0.0        551429.0      9630.0   
              Registration/plates          0.0        757845.0     12385.0   
              Seat belt                    0.0        464284.0      7018.0   
              Speeding                     0.0      12374053.0    203890.0   
M             Equipment                    0.0       6825123.0    110187.0   
              Moving violation             0.0      10127189.0    186584.0   
              Other                        0.0       2674567.0     55130.0   
              Registration/plates          0.0       1900528.0     31594.0   
              S

### Use agg method to apply multiple aggregation functions to grouped data

In [46]:
aggregated = df.groupby('violation').agg({
    'driver_age': ['mean', 'max'],
    'search_conducted': 'sum',
    'drugs_related_stop': 'sum'
})
print("Aggregated Data:\n", aggregated)

Aggregated Data:
                     driver_age       search_conducted drugs_related_stop
                          mean   max              sum                sum
violation                                                               
Equipment            31.979947  85.0              306                 90
Moving violation     35.917289  99.0              387                116
Other                39.584352  78.0               80                 10
Registration/plates  32.746835  74.0              125                 17
Seat belt            31.755513  77.0               39                 20
Speeding             33.474198  88.0              397                 93


### Calculate the size of each group based on 'driver_race'

In [47]:
group_size_race = df.groupby('driver_race').size()
print("Size of Each Group (driver_race):\n", group_size_race)

Size of Each Group (driver_race):
 driver_race
Asian         914
Black        4875
Hispanic     3627
Other          99
White       24914
dtype: int64


### Select rows based on multiple conditions

In [48]:
filtered_multiple_conditions = df[(df['driver_age'] > 30) & (df['search_conducted'] == True)]
print("Filtered Rows with driver_age > 30 and search_conducted == True:\n", filtered_multiple_conditions)

Filtered Rows with driver_age > 30 and search_conducted == True:
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-04-10     01:20          NaN             M          1956.0        49.0   
2005-07-10     12:00          NaN             M          1947.0        58.0   
2005-10-10     16:00          NaN             M          1955.0        50.0   
2005-11-10     03:15          NaN             M          1964.0        41.0   
2005-11-10     13:40          NaN             M          1963.0        42.0   
...              ...          ...           ...             ...         ...   
2015-01-12     06:47          NaN             M          1960.0        55.0   
2015-02-12     09:35          NaN             M          1983.0        32.0   
2015-07-12     14:10          NaN             M          1976.0        39.0   
2015-12-12     16:46          NaN             M          1978.0  

### Use query method to filter rows where 'stop_duration' is '0-15 Min'

In [49]:
query_filtered = df.query('stop_duration == "0-15 Min"')
print("Filtered Rows using query method (stop_duration == '0-15 Min'):\n", query_filtered)

Filtered Rows using query method (stop_duration == '0-15 Min'):
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-02-01     01:55          NaN             M          1985.0        20.0   
2005-01-04     17:30          NaN             M          1969.0        36.0   
2005-06-06     13:20          NaN             F          1986.0        19.0   
2005-07-08     16:50          NaN             M          1953.0        52.0   
2005-01-10     01:00          NaN             M          1985.0        20.0   
...              ...          ...           ...             ...         ...   
2015-12-12     22:20          NaN             M          1992.0        23.0   
2015-12-12     22:49          NaN             M          1985.0        30.0   
2015-12-12     23:33          NaN             F          1996.0        19.0   
2015-12-12     23:34          NaN             M          1996.0   

### Use isin to filter rows where 'violation' is in a list of values

In [50]:
violation_values_list = ['Speeding', 'Equipment']
filtered_isin = df[df['violation'].isin(violation_values_list)]
print("Filtered Rows using isin for violation:\n", filtered_isin)

Filtered Rows using isin for violation:
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-02-01     01:55          NaN             M          1985.0        20.0   
2005-01-04     17:30          NaN             M          1969.0        36.0   
2005-06-06     13:20          NaN             F          1986.0        19.0   
2005-01-10     00:00          NaN             M          1988.0        17.0   
2005-01-10     00:00          NaN             M          1988.0        17.0   
...              ...          ...           ...             ...         ...   
2015-12-12     21:44          NaN             F          1995.0        20.0   
2015-12-12     22:49          NaN             M          1985.0        30.0   
2015-12-12     23:33          NaN             F          1996.0        19.0   
2015-12-12     23:34          NaN             M          1996.0        19.0   
2015-12-12 

### Select specific columns and rename them

In [51]:
renamed_columns = df[['driver_age', 'violation', 'stop_duration']].rename(columns={
    'driver_age': 'Age',
    'violation': 'Offense',
    'stop_duration': 'Duration'
})
print("Selected and Renamed Columns:\n", renamed_columns)

Selected and Renamed Columns:
              Age           Offense  Duration
stop_date                                   
2005-02-01  20.0          Speeding  0-15 Min
2005-01-04  36.0          Speeding  0-15 Min
2005-06-06  19.0          Speeding  0-15 Min
2005-07-08  52.0  Moving violation  0-15 Min
2005-08-08  50.0  Moving violation   30+ Min
...          ...               ...       ...
2015-12-12  23.0  Moving violation  0-15 Min
2015-12-12  30.0          Speeding  0-15 Min
2015-12-12  19.0         Equipment  0-15 Min
2015-12-12  19.0          Speeding  0-15 Min
2015-12-12  29.0          Speeding  0-15 Min

[36585 rows x 3 columns]


### Reset the index

In [52]:
df_reset = df.reset_index()
print("DataFrame after resetting the index:\n", df_reset.head())

DataFrame after resetting the index:
    stop_date stop_time  county_name driver_gender  driver_age_raw  driver_age  \
0 2005-02-01     01:55          NaN             M          1985.0        20.0   
1 2005-01-04     17:30          NaN             M          1969.0        36.0   
2 2005-06-06     13:20          NaN             F          1986.0        19.0   
3 2005-07-08     16:50          NaN             M          1953.0        52.0   
4 2005-08-08     02:09          NaN             M          1955.0        50.0   

  driver_race            violation_raw         violation  search_conducted  \
0       White                 Speeding          Speeding             False   
1       White                 Speeding          Speeding             False   
2       White                 Speeding          Speeding             False   
3       White  Other Traffic Violation  Moving violation             False   
4       Black  Other Traffic Violation  Moving violation             False   

  sear

### Sort the DataFrame by 'driver_age'

In [53]:
sorted_df = df.sort_values(by='driver_age')
print("DataFrame sorted by driver_age:\n", sorted_df.head())

DataFrame sorted by driver_age:
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2007-04-07     00:40          NaN             F          1992.0        15.0   
2007-11-06     12:30          NaN             M          1992.0        15.0   
2010-10-11     19:39          NaN             F          1995.0        15.0   
2015-06-09     10:48          NaN             M          1999.0        16.0   
2010-01-06     06:50          NaN             F          1994.0        16.0   

           driver_race                   violation_raw         violation  \
stop_date                                                                  
2007-04-07       White         Other Traffic Violation  Moving violation   
2007-11-06       White                        Speeding          Speeding   
2010-10-11       White         Other Traffic Violation  Moving violation   
2015-06-09       White           

### Add a new column calculated from existing columns

In [54]:
df['age_category'] = pd.cut(df['driver_age'], bins=[0, 20, 40, 60, 100], labels=['0-20', '21-40', '41-60', '61+'])
print("DataFrame with new 'age_category' column:\n", df.head())

DataFrame with new 'age_category' column:
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-02-01     01:55          NaN             M          1985.0        20.0   
2005-01-04     17:30          NaN             M          1969.0        36.0   
2005-06-06     13:20          NaN             F          1986.0        19.0   
2005-07-08     16:50          NaN             M          1953.0        52.0   
2005-08-08     02:09          NaN             M          1955.0        50.0   

           driver_race            violation_raw         violation  \
stop_date                                                           
2005-02-01       White                 Speeding          Speeding   
2005-01-04       White                 Speeding          Speeding   
2005-06-06       White                 Speeding          Speeding   
2005-07-08       White  Other Traffic Violation  Moving vi

### Drop rows with missing values

In [55]:
df_dropped_na = df.dropna()
print("DataFrame after dropping rows with missing values:\n", df_dropped_na.head())

DataFrame after dropping rows with missing values:
 Empty DataFrame
Columns: [stop_time, county_name, driver_gender, driver_age_raw, driver_age, driver_race, violation_raw, violation, search_conducted, search_type, stop_outcome, is_arrested, stop_duration, drugs_related_stop, age_category]
Index: []


### Fill missing values in a specific column with a default value

In [56]:
df_filled_na = df.fillna({'driver_age': 30})
print("DataFrame after filling missing values in 'driver_age':\n", df_filled_na.head())

DataFrame after filling missing values in 'driver_age':
            stop_time  county_name driver_gender  driver_age_raw  driver_age  \
stop_date                                                                     
2005-02-01     01:55          NaN             M          1985.0        20.0   
2005-01-04     17:30          NaN             M          1969.0        36.0   
2005-06-06     13:20          NaN             F          1986.0        19.0   
2005-07-08     16:50          NaN             M          1953.0        52.0   
2005-08-08     02:09          NaN             M          1955.0        50.0   

           driver_race            violation_raw         violation  \
stop_date                                                           
2005-02-01       White                 Speeding          Speeding   
2005-01-04       White                 Speeding          Speeding   
2005-06-06       White                 Speeding          Speeding   
2005-07-08       White  Other Traffic Violat

### Calculate the correlation between numeric columns

In [57]:
correlation_matrix = df.corr(numeric_only=True)
print("Correlation matrix between numeric columns:\n", correlation_matrix)

Correlation matrix between numeric columns:
                     county_name  driver_age_raw  driver_age  search_conducted  \
county_name                 NaN             NaN         NaN               NaN   
driver_age_raw              NaN        1.000000   -0.972259          0.016203   
driver_age                  NaN       -0.972259    1.000000         -0.054095   
search_conducted            NaN        0.016203   -0.054095          1.000000   
drugs_related_stop          NaN        0.011305   -0.043702          0.502294   

                    drugs_related_stop  
county_name                        NaN  
driver_age_raw                0.011305  
driver_age                   -0.043702  
search_conducted              0.502294  
drugs_related_stop            1.000000  


### Pivot the DataFrame

In [58]:
pivot_df = df.pivot_table(values='driver_age', index='violation', columns='driver_gender', aggfunc='mean')
print("Pivot table of driver_age by violation and driver_gender:\n", pivot_df)

Pivot table of driver_age by violation and driver_gender:
 driver_gender                F          M
violation                                
Equipment            32.056731  31.956787
Moving violation     34.102840  36.378241
Other                34.516129  40.626382
Registration/plates  32.336815  32.910417
Seat belt            29.991453  32.265760
Speeding             32.601535  33.890726
