# Group By and Aggrigation

In [None]:
import numpy as np

import pandas as pd


In [3]:
data = {
    "Category": ["Electronics", "Electronics", "Clothing", "Clothing", "Groceries", "Groceries"],
    "Store": ["Store A", "Store B", "Store A", "Store C", "Store B", "Store C"],
    "Sales": [1200, 1500, 800, 600, 400, 900],
    "Quantity": [3, 4, 5, 2, 10, 15],
    "Date": pd.to_datetime(["2025-09-01", "2025-09-02", "2025-09-02", "2025-09-03", "2025-09-03", "2025-09-04"])
}

df = pd.DataFrame(data)



In [4]:
df


Unnamed: 0,Category,Store,Sales,Quantity,Date
0,Electronics,Store A,1200,3,2025-09-01
1,Electronics,Store B,1500,4,2025-09-02
2,Clothing,Store A,800,5,2025-09-02
3,Clothing,Store C,600,2,2025-09-03
4,Groceries,Store B,400,10,2025-09-03
5,Groceries,Store C,900,15,2025-09-04


In [13]:
# Group by category 
obj = df.groupby('Category')
for name,group in obj:
    print(name)
    print(group)

Clothing
   Category    Store  Sales  Quantity       Date
2  Clothing  Store A    800         5 2025-09-02
3  Clothing  Store C    600         2 2025-09-03
Electronics
      Category    Store  Sales  Quantity       Date
0  Electronics  Store A   1200         3 2025-09-01
1  Electronics  Store B   1500         4 2025-09-02
Groceries
    Category    Store  Sales  Quantity       Date
4  Groceries  Store B    400        10 2025-09-03
5  Groceries  Store C    900        15 2025-09-04


In [None]:
# group by Category and sum of Sales
df.groupby('Category')['Sales'].sum()

Category
Clothing       1400
Electronics    2700
Groceries      1300
Name: Sales, dtype: int64

In [15]:
# Group by store and calculate  the sum of the sales
df.groupby('Store')['Sales'].sum()

Store
Store A    2000
Store B    1900
Store C    1500
Name: Sales, dtype: int64

In [18]:
# Group by multiple columns
df.groupby(['Category','Store'])['Sales'].sum()

Category     Store  
Clothing     Store A     800
             Store C     600
Electronics  Store A    1200
             Store B    1500
Groceries    Store B     400
             Store C     900
Name: Sales, dtype: int64

In [21]:
df.groupby(['Category','Store'])['Sales'].mean()

Category     Store  
Clothing     Store A     800.0
             Store C     600.0
Electronics  Store A    1200.0
             Store B    1500.0
Groceries    Store B     400.0
             Store C     900.0
Name: Sales, dtype: float64

Aggregation

In [None]:
df['Sales'].mean()
'''
(mean,median,std,min,max,count,sum,var)
all return a single value
'''

'\n(mean,median,std,min,max,count,sum,var)\n\n'

In [30]:
df['Sales'].agg(['mean','median','max','min','std','var','count','sum'])

mean         900.0
median       850.0
max         1500.0
min          400.0
std          400.0
var       160000.0
count          6.0
sum         5400.0
Name: Sales, dtype: float64

### 🔹Why is mode tricky?

Mode = the most frequent value in a group.

But unlike sum or mean, the mode isn’t guaranteed to be unique.

Example: if in one group you have [2, 2, 3, 3], both 2 and 3 are modes.

That means the result could be multiple values.

In [31]:
df['Sales'].mode()

0     400
1     600
2     800
3     900
4    1200
5    1500
Name: Sales, dtype: int64

In [36]:
# Pick one mode (most common → [0]):
df.groupby("Category")["Sales"].agg(lambda x: x.mode()[0])


Category
Clothing        600
Electronics    1200
Groceries       400
Name: Sales, dtype: int64

In [37]:
# Return all modes (as a list):
df.groupby("Category")["Sales"].apply(lambda x: x.mode().tolist())

Category
Clothing         [600, 800]
Electronics    [1200, 1500]
Groceries        [400, 900]
Name: Sales, dtype: object