### Pandas groupby()
### What is groupby()?

groupby() is used to split data into groups, apply a function, and combine the results (Split–Apply–Combine).

Common use cases:

• Summaries (sum, mean, count)

• Reports

• Log analysis

• Business & finance analytics

### Basic Syntax
DataFrame.groupby(
    by=None,
    axis=0,
    level=None,
    as_index=True,
    sort=True,
    group_keys=True,
    observed=False,
    dropna=True
)

#### Parameters Explained (One by One)

### 1.by

Defines how to group the data.

Accepts:

• Column name

• List of columns

• Dictionary

• Function

• Series

Examples:

In [1]:
import pandas as pd

# Small employee table used to demonstrate the `by` argument.
data = {
    'Department': ['HR', 'Finance', 'IT', 'Finance'],
    'Gender': ['M', 'F', 'M', 'F'],
    'Salary': [40000, 60000, 55000, 70000]
}

df = pd.DataFrame(data)

# by = a single column name: one group per distinct Department.
salary_by_department = df.groupby('Department')['Salary'].sum()

# by = a list of columns: one group per (Department, Gender) combination.
salary_by_department_gender = df.groupby(['Department', 'Gender'])['Salary'].sum()

In [2]:
import pandas as pd

data = {
    'Dept': ['HR', 'IT', 'Finance', 'HR'],
    'Gender': ['M', 'F', 'M', 'F'],
    'Salary': [40000, 60000, 55000, 45000]
}
df = pd.DataFrame(data)

# by = a dictionary: maps each index label to the group it belongs to.
salary_by_half = df.groupby({0: 'first', 1: 'first', 2: 'second', 3: 'second'})['Salary'].sum()

# by = a function: called on each index label; its return value is the group key.
salary_by_parity = df.groupby(lambda label: label % 2)['Salary'].sum()

# by = a Series aligned with df's index: its values are used as the group keys.
mean_salary_by_band = df.groupby(df['Salary'] >= 50000)['Salary'].mean()

### 2.axis

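axis=0  # group by rows (default)

axis=1  # group by columns

Note: the axis parameter of groupby() is deprecated since pandas 2.1; to group by columns, group the transposed frame instead (df.T.groupby(...)). The example below uses sum() only to illustrate what axis=0 and axis=1 mean.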

In [22]:
# Examples: what axis=0 and axis=1 mean, illustrated with sum()
import pandas as pd

# Two numeric columns, three rows
df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 20, 30]})

print("Original DataFrame:")
print(df)

# axis=0 collapses the rows, leaving one total per column
column_totals = df.sum(axis=0)
print("\nSum with axis=0:")
print(column_totals)

# axis=1 collapses the columns, leaving one total per row
row_totals = df.sum(axis=1)
print("\nSum with axis=1:")
print(row_totals)

Original DataFrame:
   A   B
0  1  10
1  2  20
2  3  30

Sum with axis=0:
A     6
B    60
dtype: int64

Sum with axis=1:
0    11
1    22
2    33
dtype: int64


### 3.level

Used for MultiIndex (hierarchical index).

df.groupby(level=0)

In [21]:
# Example
import pandas as pd

# Two-level row index: outer level 'Group', inner level 'Subgroup'
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
index = pd.MultiIndex.from_arrays(arrays, names=('Group', 'Subgroup'))

df = pd.DataFrame(data={'Value': [10, 20, 30, 40]}, index=index)

print("Original DataFrame:")
print(df)

# level=0 is the outermost index level ('Group'); total 'Value' within each one
grouped_by_outer_level = df.groupby(level=0)
result = grouped_by_outer_level.sum()
print("\nGrouped by level=0:")
print(result)

Original DataFrame:
                Value
Group Subgroup       
A     1            10
      2            20
B     1            30
      2            40

Grouped by level=0:
       Value
Group       
A         30
B         70


### 4.as_index

Controls whether the grouping column becomes the index of the result.

as_index=True   # default

as_index=False  # keeps column as normal column

In [20]:
# Example:
import pandas as pd

# Rebuild the Department/Gender/Salary frame here so this cell does not depend
# on whichever `df` an earlier cell happened to leave behind.
df = pd.DataFrame({
    'Department': ['HR', 'Finance', 'IT', 'Finance'],
    'Gender': ['M', 'F', 'M', 'F'],
    'Salary': [40000, 60000, 55000, 70000]
})

# as_index=False keeps 'Department' as a regular column instead of the index.
# Note: sum() is also applied to the string column, so Gender values are
# concatenated (e.g. 'FF'); pass numeric_only=True to sum numeric columns only.
df.groupby('Department', as_index=False).sum()

Unnamed: 0,Department,Gender,Salary
0,Finance,FF,130000
1,HR,M,40000
2,IT,M,55000


### 5.sort

Controls sorting of group keys.

sort=True   # default

sort=False  # faster, keeps original order

In [19]:
# Example
import pandas as pd

data = {
    'Department': ['HR', 'Finance', 'IT', 'Finance'],
    'Gender': ['M', 'F', 'M', 'F'],
    'Salary': [40000, 60000, 55000, 70000]
}
df = pd.DataFrame(data)

# sort=True (the default, written out here) orders the result by group key:
# Finance, HR, IT. With sort=False the groups would instead appear in order of
# first appearance in the data: HR, Finance, IT.
dept_means = df.groupby('Department', sort=True)['Salary'].mean()
print(dept_means)

Department
Finance    65000.0
HR         40000.0
IT         55000.0
Name: Salary, dtype: float64


### 6.group_keys

Controls whether apply() adds the group keys to the index of its result.

group_keys=True   # adds group labels

group_keys=False  # cleaner output

In [18]:
# Example
import pandas as pd

# Six employees, two per department
df = pd.DataFrame(
    {
        'Department': ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
        'Employee': ['Ali', 'Sara', 'John', 'Nasir', 'Talha', 'Hassan'],
        'Salary': [40000, 60000, 55000, 70000, 45000, 52000],
    }
)

print("Original DataFrame:")
print(df)

# Keep at most the first two rows of every department; with group_keys=False
# apply() does not add the department label as an extra index level.
by_department = df.groupby('Department', group_keys=False)
result = by_department.apply(lambda grp: grp.head(2))

print("\nResult:")
print(result)

Original DataFrame:
  Department Employee  Salary
0         HR      Ali   40000
1    Finance     Sara   60000
2         IT     John   55000
3    Finance    Nasir   70000
4         HR    Talha   45000
5         IT   Hassan   52000

Result:
  Department Employee  Salary
0         HR      Ali   40000
1    Finance     Sara   60000
2         IT     John   55000
3    Finance    Nasir   70000
4         HR    Talha   45000
5         IT   Hassan   52000


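DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  result = df.groupby('Department', group_keys=False).apply(lambda x: x.head(2))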


### 7.observed

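Only relevant when grouping by a categorical column. (The default is False through pandas 2.x; pandas 3.0 changes the default to True.)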

observed=False  # include all categories

observed=True   # include only observed ones

In [17]:
# Example
import pandas as pd

# 'Category' declares four categories, but 'D' never occurs in the data
category_values = pd.Categorical(['A', 'B', 'A', 'C'], categories=['A', 'B', 'C', 'D'])
df = pd.DataFrame(
    {
        'Category': category_values,
        'Item': ['Pen', 'Book', 'Pencil', 'Notebook'],
        'Price': [10, 20, 15, 25],
    }
)

print("Original DataFrame:")
print(df)

# observed=True reports only the categories that actually appear (A, B, C);
# count() gives the number of non-null values per column in each group
by_category = df.groupby('Category', observed=True)
result = by_category.count()
print("\nGrouped Result:")
print(result)

Original DataFrame:
  Category      Item  Price
0        A       Pen     10
1        B      Book     20
2        A    Pencil     15
3        C  Notebook     25

Grouped Result:
          Item  Price
Category             
A            2      2
B            1      1
C            1      1


### 8.dropna

Controls whether NaN groups are included.

dropna=True   # default (ignore NaN)

dropna=False  # include NaN as group


In [16]:
# Example
import pandas as pd

# Two of the six employees have no Department (None)
df = pd.DataFrame(
    {
        'Department': ['HR', 'Finance', None, 'Finance', 'IT', None],
        'Employee': ['Ali', 'Sara', 'John', 'Nasir', 'Hassan', 'Zara'],
        'Salary': [40000, 60000, 50000, 70000, 55000, 45000],
    }
)

print("Original DataFrame:")
print(df)

# dropna=False keeps the rows with a missing Department as their own NaN group;
# count() gives the number of non-null values per column in each group
by_department = df.groupby('Department', dropna=False)
result = by_department.count()
print("\nGrouped Result:")
print(result)

Original DataFrame:
  Department Employee  Salary
0         HR      Ali   40000
1    Finance     Sara   60000
2       None     John   50000
3    Finance    Nasir   70000
4         IT   Hassan   55000
5       None     Zara   45000

Grouped Result:
            Employee  Salary
Department                  
Finance            2       2
HR                 1       1
IT                 1       1
NaN                2       2


### Common Aggregation Functions

df.groupby('Department').sum()

df.groupby('Department').mean()

df.groupby('Department').count()

df.groupby('Department').max()

df.groupby('Department').min()

In [15]:
# Example DataFrame: one row per employee (department, name, salary)
import pandas as pd

df = pd.DataFrame(
    [
        ('HR', 'Ali', 40000),
        ('Finance', 'Sara', 60000),
        ('IT', 'John', 55000),
        ('Finance', 'Nasir', 70000),
        ('HR', 'Talha', 45000),
        ('IT', 'Hassan', 52000),
    ],
    columns=['Department', 'Employee', 'Salary'],
)

print("Original DataFrame:")
print(df)

Original DataFrame:
  Department Employee  Salary
0         HR      Ali   40000
1    Finance     Sara   60000
2         IT     John   55000
3    Finance    Nasir   70000
4         HR    Talha   45000
5         IT   Hassan   52000


In [14]:
# 1. Sum
import pandas as pd

# The aggregation results shown in this section have 'Salary' and 'Bonus'
# columns, so build that frame here instead of relying on a `df` left over
# from another cell.
df = pd.DataFrame({
    'Department': ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
    'Salary': [40000, 60000, 55000, 70000, 45000, 52000],
    'Bonus': [2000, 3000, 2500, 3500, 2200, 2700]
})

df.groupby('Department').sum(numeric_only=True) # Adds up salaries per department.

Unnamed: 0_level_0,Salary,Bonus
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,130000,6500
HR,85000,4200
IT,107000,5200


In [13]:
# 2. Mean
# Average of each numeric column per department.
df.groupby(by='Department').mean(numeric_only=True)

Unnamed: 0_level_0,Salary,Bonus
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,65000.0,3250.0
HR,42500.0,2100.0
IT,53500.0,2600.0


In [12]:
# 3. Count
# Number of non-null values in each column per department.
df.groupby(by='Department').count()

Unnamed: 0_level_0,Salary,Bonus
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,2,2
HR,2,2
IT,2,2


In [11]:
# 4. Max
# Largest value of each column within every department.
df.groupby(by='Department').max()

Unnamed: 0_level_0,Salary,Bonus
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,70000,3500
HR,45000,2200
IT,55000,2700


In [10]:
# 5. Min
# Smallest value of each column within every department.
df.groupby(by='Department').min()

Unnamed: 0_level_0,Salary,Bonus
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,60000,3000
HR,40000,2000
IT,52000,2500


### Multiple Aggregations (agg())

Syntax:

df.groupby('Department').agg(['mean', 'sum'])

In [9]:
import pandas as pd

# Salary and bonus figures, two employees per department
df = pd.DataFrame(
    {
        'Department': ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
        'Salary': [40000, 60000, 55000, 70000, 45000, 52000],
        'Bonus': [2000, 3000, 2500, 3500, 2200, 2700],
    }
)

# A list of function names applies each one to every column, producing
# two-level column labels: (column, function)
by_department = df.groupby('Department')
result = by_department.aggregate(['mean', 'sum'])

print(result)

             Salary           Bonus      
               mean     sum    mean   sum
Department                               
Finance     65000.0  130000  3250.0  6500
HR          42500.0   85000  2100.0  4200
IT          53500.0  107000  2600.0  5200


In [8]:
# Named aggregation: each keyword names an output column and states which
# input column to aggregate and with which function.
df.groupby('Department').agg(
    AvgSalary=pd.NamedAgg(column='Salary', aggfunc='mean'),
    TotalSalary=pd.NamedAgg(column='Salary', aggfunc='sum'),
)

Unnamed: 0_level_0,AvgSalary,TotalSalary
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,65000.0,130000
HR,42500.0,85000
IT,53500.0,107000


### apply() – Custom Logic
df.groupby('Department').apply(lambda x: x[x['Salary'] > 50000])

In [6]:
# Example
import pandas as pd

# Six employees, two per department
df = pd.DataFrame(
    {
        'Department': ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
        'Employee': ['Ali', 'Sara', 'John', 'Nasir', 'Talha', 'Hassan'],
        'Salary': [40000, 60000, 55000, 70000, 45000, 52000],
    }
)

# Within each department, keep only the rows whose Salary exceeds 50000
by_department = df.groupby('Department')
result = by_department.apply(lambda grp: grp[grp['Salary'].gt(50000)])

print(result)

             Department Employee  Salary
Department                              
Finance    1    Finance     Sara   60000
           3    Finance    Nasir   70000
IT         2         IT     John   55000
           5         IT   Hassan   52000


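DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  result = df.groupby('Department').apply(lambda x: x[x['Salary'] > 50000])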


### transform() – Preserve Shape

Returns a result with the same shape (and index) as the original data, so it can be assigned back as a new column.

df['Dept_Avg_Salary'] = df.groupby('Department')['Salary'].transform('mean')

In [7]:
# Example
import pandas as pd

# Six employees, two per department
df = pd.DataFrame(
    {
        'Department': ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
        'Employee': ['Ali', 'Sara', 'John', 'Nasir', 'Talha', 'Hassan'],
        'Salary': [40000, 60000, 55000, 70000, 45000, 52000],
    }
)

# transform('mean') yields one value per original row (the mean salary of that
# row's department), so it lines up with df and can be stored as a new column
salary_by_department = df.groupby('Department')['Salary']
df['Dept_Avg_Salary'] = salary_by_department.transform('mean')

print(df)

  Department Employee  Salary  Dept_Avg_Salary
0         HR      Ali   40000          42500.0
1    Finance     Sara   60000          65000.0
2         IT     John   55000          53500.0
3    Finance    Nasir   70000          65000.0
4         HR    Talha   45000          42500.0
5         IT   Hassan   52000          53500.0
