In [52]:
import pandas as pd
import numpy as np

In [53]:
# Fix the global NumPy seed so every run draws the same synthetic values.
np.random.seed(0)

# Build the synthetic court-case columns one at a time (75 rows each).
# The random draws happen in this exact order, which determines the values.
data = {}
data["judge ID"] = np.random.choice(
    [1238, 1456, 2234, 3345, 7623, 9985], size=75
)
data["Crim-custody"] = np.random.randint(low=0, high=3, size=75)  # integers 0, 1 or 2
data["case coming for"] = np.random.choice(
    ["Case A", "Case B", "Case C"], size=75
)
# One consecutive calendar day per row, stored as "YYYY-MM-DD" strings.
data["date"] = pd.date_range("2023-01-01", periods=75).strftime("%Y-%m-%d")


In [54]:
# Turn the dict of equal-length columns into a DataFrame (one column per key).
df = pd.DataFrame(data=data)

In [55]:
# Blank out a random sample of "Crim-custody" values to simulate missing data.
#
# The column is generated as integers, which cannot hold NaN. Cast it to float
# explicitly first: recent pandas versions warn about (and are moving to reject)
# assignments that would silently change a column's dtype.
df["Crim-custody"] = df["Crim-custody"].astype("float64")

# np.random.choice samples with replacement by default, so the same row label
# can be drawn more than once: at most 10 rows end up missing.
df.loc[np.random.choice(df.index, size=10), "Crim-custody"] = np.nan

In [56]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,judge ID,Crim-custody,case coming for,date
0,7623,2.0,Case C,2023-01-01
1,9985,0.0,Case B,2023-01-02
2,1238,0.0,Case A,2023-01-03
3,3345,2.0,Case A,2023-01-04
4,3345,0.0,Case A,2023-01-05


## Group by

* The `groupby()` function is used to group data based on one or more columns in a DataFrame. 
* It is part of the pandas library, which provides powerful data manipulation and analysis capabilities.
* The groupby() function allows you to split a DataFrame into groups based on a specified column or columns. 
* Once the DataFrame is grouped, you can perform various operations on each group, such as `aggregation`, `transformation`, or `filtering`.

In [57]:
# Example: split the rows into one group per judge.
# Nothing is computed yet; the cell shows only the GroupBy object.
df.groupby(by=["judge ID"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002C9FDB576D0>

* Note that this alone only displays the `DataFrameGroupBy` object, not a result -- although you have split the dataset into groups, there is nothing meaningful to show until you chain an **aggregation**, **transformation**, or **filtering** step onto the groupby.

In [58]:
# Total "Crim-custody" per judge.
(
    df
    .groupby(by=["judge ID"])["Crim-custody"]
    .sum()
)

judge ID
1238    18.0
1456    24.0
2234     4.0
3345    10.0
7623    11.0
9985     5.0
Name: Crim-custody, dtype: float64

### Aggregation functions
There are many built-in aggregate methods provided for you in the pandas package, and you can even write and apply your own. Some of the most common aggregate methods you may want to use are:

* `.min()`: returns the minimum value for each column by group
* `.max()`: returns the maximum value for each column by group
* `.mean()`: returns the average value for each column by group
* `.median()`: returns the median value for each column by group
* `.count()`: returns the number of non-missing values in each column by group
* `.sum()`: returns the sum of the values in each column by group


In [59]:
# Total "Crim-custody" per case type.
(
    df
    .groupby(by=["case coming for"])["Crim-custody"]
    .sum()
)

case coming for
Case A    22.0
Case B    29.0
Case C    21.0
Name: Crim-custody, dtype: float64

### Multiple Groups

In [60]:
# Group on two keys at once: the result is indexed by (judge, case type) pairs.
(
    df
    .groupby(by=["judge ID", "case coming for"])["Crim-custody"]
    .sum()
)

judge ID  case coming for
1238      Case A              3.0
          Case B              9.0
          Case C              6.0
1456      Case A              6.0
          Case B             11.0
          Case C              7.0
2234      Case A              3.0
          Case C              1.0
3345      Case A              2.0
          Case B              5.0
          Case C              3.0
7623      Case A              7.0
          Case B              2.0
          Case C              2.0
9985      Case A              1.0
          Case B              2.0
          Case C              2.0
Name: Crim-custody, dtype: float64

## Add More Rows and Columns

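* We can use the pandas `concat()` function to concatenate DataFrames: `axis=0` stacks them as additional rows, and `axis=1` places them side by side as additional columns.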

In [61]:
# Draw 25 more synthetic rows with the same four columns as the main frame.
# The draws continue from the current global NumPy random state.
additional_data = {}
additional_data["judge ID"] = np.random.choice(
    [1238, 1456, 2234, 3345, 7623, 9985], size=25
)
additional_data["Crim-custody"] = np.random.randint(low=0, high=3, size=25)
additional_data["case coming for"] = np.random.choice(
    ["Case A", "Case B", "Case C"], size=25
)
# Consecutive days starting 2023-01-06, formatted as "YYYY-MM-DD" strings.
additional_data["date"] = pd.date_range("2023-01-06", periods=25).strftime("%Y-%m-%d")

# Wrap the new columns in their own DataFrame, ready to be appended.
additional_df = pd.DataFrame(data=additional_data)

In [62]:
# Stack the extra rows underneath the existing ones and renumber the index from 0.
# NOTE: this rebinds `df`, so running the cell a second time appends the rows again.
df = pd.concat(objs=[df, additional_df], ignore_index=True)

In [63]:
# Check the size after appending the extra rows: (rows, columns).
df.shape

(100, 4)

In [64]:
# Generate 100 random values for each new column; NaN is one of the
# possible draws, so both columns come out as floats with some gaps.
# NOTE(review): "Crim-custody" is also a column of the main frame, so joining
# these columns onto it yields two columns with that label -- confirm intended.
new_cols_data = {}
new_cols_data["Witness-D"] = np.random.choice([0, 1, 2, 3, 4, np.nan], size=100)
new_cols_data["Crim-custody"] = np.random.choice(
    [0, 1, 2, 3, 4, 5, 6, np.nan], size=100
)

# Wrap the new columns in their own DataFrame.
new_cols = pd.DataFrame(data=new_cols_data)

In [65]:
# Preview the first five rows of the new columns.
new_cols.head(n=5)

Unnamed: 0,Witness-D,Crim-custody
0,1.0,6.0
1,2.0,1.0
2,0.0,
3,3.0,4.0
4,1.0,1.0


In [66]:
# Size of the new-columns frame: (rows, columns).
new_cols.shape

(100, 2)

In [69]:
# Join the new columns onto the original DataFrame side by side, i.e. along
# the columns axis (axis=1). Rows are matched on their index labels, which
# are kept unchanged.
new_df = pd.concat([df, new_cols], axis="columns")

In [70]:
# Preview the first five rows of the widened frame.
new_df.head(n=5)

Unnamed: 0,judge ID,Crim-custody,case coming for,date,Witness-D,Crim-custody.1
0,7623,2.0,Case C,2023-01-01,1.0,6.0
1,9985,0.0,Case B,2023-01-02,2.0,1.0
2,1238,0.0,Case A,2023-01-03,0.0,
3,3345,2.0,Case A,2023-01-04,3.0,4.0
4,3345,0.0,Case A,2023-01-05,1.0,1.0
