### Pandas groupby()
### What is groupby()?

groupby() is used to split data into groups, apply a function, and combine the results (Split–Apply–Combine).

Common use cases:

• Summaries (sum, mean, count)

• Reports

• Log analysis

• Business & finance analytics

### Basic Syntax
DataFrame.groupby(
    by=None,
    axis=0,
    level=None,
    as_index=True,
    sort=True,
    group_keys=True,
    observed=False,
    dropna=True
)

#### Parameters Explained (One by One)
### 1.by

Defines how to group the data.

Accepts:

• Column name

• List of columns

• Dictionary

• Function

• Series

Examples:

In [None]:
import pandas as pd

# Small employee table: department, gender and salary for four people.
# Finance appears twice so that grouping by Department merges rows.
data = dict(
    Department=['HR', 'Finance', 'IT', 'Finance'],
    Gender=['M', 'F', 'M', 'F'],
    Salary=[40000, 60000, 55000, 70000],
)

df = pd.DataFrame(data)

In [None]:
# groupby() is lazy: it only returns a GroupBy object and computes nothing
# until an aggregation is applied. size() is used here so that each kind of
# `by` value actually shows the groups it produces.

# 1) Single column name -> one group per department
print(df.groupby('Department').size())

# 2) List of column names -> one group per (Department, Gender) combination
print(df.groupby(['Department', 'Gender']).size())

# 3) Boolean Series -> two groups: Salary <= 50000 (False) and Salary > 50000 (True)
print(df.groupby(df['Salary'] > 50000).size())

# Show the ungrouped data for comparison
df

### 2.axis

axis=0  # group by rows (default)

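axis=1  # group by columns

Note: the `axis` argument of groupby() is deprecated since pandas 2.1. To group column-wise, transpose first (e.g. `df.T.groupby(...)`). The example below uses `DataFrame.sum` only to illustrate what axis=0 and axis=1 mean.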

In [None]:
# Examples
import pandas as pd

# Sample DataFrame: two numeric columns, three rows
df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 20, 30]})

print("Original DataFrame:")
print(df)

# axis=0 collapses the rows: one total per column (A and B)
column_totals = df.sum(axis=0)
print("\nSum with axis=0:")
print(column_totals)

# axis=1 collapses the columns: one total per row (A + B)
row_totals = df.sum(axis=1)
print("\nSum with axis=1:")
print(row_totals)

### 3.level

Used for MultiIndex (hierarchical index).

df.groupby(level=0)

In [None]:
# Example
import pandas as pd

# Two-level index built from explicit (Group, Subgroup) pairs
index = pd.MultiIndex.from_tuples(
    [('A', 1), ('A', 2), ('B', 1), ('B', 2)],
    names=('Group', 'Subgroup'),
)

df = pd.DataFrame({'Value': [10, 20, 30, 40]}, index=index)

print("Original DataFrame:")
print(df)

# level=0 is the outermost index level ('Group'), so the two Subgroup rows
# of each Group are summed together.
grouped_by_outer_level = df.groupby(level=0)
result = grouped_by_outer_level.sum()
print("\nGrouped by level=0:")
print(result)

### 4.as_index

Controls whether the grouping column becomes the index of the result.

as_index=True   # default

as_index=False  # keeps column as normal column

In [None]:
# Example:
import pandas as pd

# Recreate the employee table here: `df` was last assigned to the MultiIndex
# example, which has no 'Department' column to group by.
df = pd.DataFrame({
    'Department': ['HR', 'Finance', 'IT', 'Finance'],
    'Gender': ['M', 'F', 'M', 'F'],
    'Salary': [40000, 60000, 55000, 70000]
})

# as_index=False keeps 'Department' as a regular column instead of moving it
# into the index. Only 'Salary' is summed, because summing the text column
# 'Gender' would concatenate strings.
result = df.groupby('Department', as_index=False)['Salary'].sum()
result

### 5.sort

Controls sorting of group keys.

sort=True   # default

sort=False  # faster, keeps original order

In [None]:
# Example
import pandas as pd

data = {
    'Department': ['HR', 'Finance', 'IT', 'Finance'],
    'Gender': ['M', 'F', 'M', 'F'],
    'Salary': [40000, 60000, 55000, 70000]
}
df = pd.DataFrame(data)

# sort=True (default): group keys come back sorted -> Finance, HR, IT
sorted_means = df.groupby('Department', sort=True)['Salary'].mean()
print(sorted_means)

# sort=False: group keys keep their order of first appearance -> HR, Finance, IT
unsorted_means = df.groupby('Department', sort=False)['Salary'].mean()
print(unsorted_means)

### 6.group_keys

Controls whether the group keys are added to the index of the result when using apply().

group_keys=True   # adds group labels

group_keys=False  # cleaner output

In [None]:
# Example
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'Department': ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
    'Employee': ['Ali', 'Sara', 'John', 'Nasir', 'Talha', 'Hassan'],
    'Salary': [40000, 60000, 55000, 70000, 45000, 52000]
})

print("Original DataFrame:")
print(df)


def first_row(group):
    """Return the first row of one department's sub-frame."""
    return group.head(1)


# Select the non-key columns explicitly so apply() does not depend on
# receiving the grouping column, which newer pandas releases deprecate.
value_columns = ['Employee', 'Salary']

# group_keys=True (default): the Department label is added as an outer
# index level in front of the original row label.
result_with_keys = df.groupby('Department', group_keys=True)[value_columns].apply(first_row)
print("\nResult with group_keys=True:")
print(result_with_keys)

# group_keys=False: only the original row labels remain in the index.
# Each department has two rows, so head(1) is used to make the per-group
# selection visible (head(2) would return every row).
result = df.groupby('Department', group_keys=False)[value_columns].apply(first_row)

print("\nResult:")
print(result)

### 7.observed

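Only applies when a grouping key is categorical. (Relying on the default value of `observed` is deprecated since pandas 2.1, so pass it explicitly.)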

observed=False  # include all categories

observed=True   # include only observed ones

In [None]:
# Example
import pandas as pd

# 'D' is declared as a valid category but never occurs in the data
all_categories = ['A', 'B', 'C', 'D']
category_values = pd.Categorical(['A', 'B', 'A', 'C'], categories=all_categories)

df = pd.DataFrame({
    'Category': category_values,
    'Item': ['Pen', 'Book', 'Pencil', 'Notebook'],
    'Price': [10, 20, 15, 25],
})

print("Original DataFrame:")
print(df)

# observed=True: only categories that actually appear (A, B, C) form groups;
# count() then reports the non-null values per column in each group.
grouped_by_category = df.groupby('Category', observed=True)
result = grouped_by_category.count()
print("\nGrouped Result:")
print(result)

### 8.dropna

Controls whether NaN groups are included.

dropna=True   # default (ignore NaN)

dropna=False  # include NaN as group


In [None]:
# Example
import pandas as pd

# Sample data as (Department, Employee, Salary) rows; two employees have no
# department recorded.
rows = [
    ('HR', 'Ali', 40000),
    ('Finance', 'Sara', 60000),
    (None, 'John', 50000),
    ('Finance', 'Nasir', 70000),
    ('IT', 'Hassan', 55000),
    (None, 'Zara', 45000),
]
df = pd.DataFrame(rows, columns=['Department', 'Employee', 'Salary'])

print("Original DataFrame:")
print(df)

# dropna=False keeps the rows with a missing Department together as their
# own NaN group instead of discarding them.
grouped_with_missing = df.groupby('Department', dropna=False)
result = grouped_with_missing.count()
print("\nGrouped Result:")
print(result)

### Common Aggregation Functions

df.groupby('Department').sum()

df.groupby('Department').mean()

df.groupby('Department').count()

df.groupby('Department').max()

df.groupby('Department').min()

In [None]:
# Example DataFrame
import pandas as pd

# One (Department, Employee, Salary) row per employee; each department
# appears twice so every aggregation combines two rows.
employee_rows = [
    ('HR', 'Ali', 40000),
    ('Finance', 'Sara', 60000),
    ('IT', 'John', 55000),
    ('Finance', 'Nasir', 70000),
    ('HR', 'Talha', 45000),
    ('IT', 'Hassan', 52000),
]
df = pd.DataFrame(employee_rows, columns=['Department', 'Employee', 'Salary'])

print("Original DataFrame:")
print(df)

In [None]:
# 1. Sum
# Adds up the salaries per department; numeric_only=True leaves the text
# column Employee out of the total.
department_groups = df.groupby('Department')
department_groups.sum(numeric_only=True)

In [None]:
# 2. Mean
# Average salary per department; numeric_only=True leaves the text column
# Employee out of the calculation.
department_groups = df.groupby('Department')
department_groups.mean(numeric_only=True)

In [None]:
# 3. Count
# Number of non-null values in each column, per department.
department_groups = df.groupby('Department')
department_groups.count()

In [None]:
# 4. Max
# Largest value of each column per department (for the text column Employee
# this is the name that sorts last).
department_groups = df.groupby('Department')
department_groups.max()

In [None]:
# 5. Min
# Smallest value of each column per department (for the text column Employee
# this is the name that sorts first).
department_groups = df.groupby('Department')
department_groups.min()

### Multiple Aggregations (agg())

Syntax:

df.groupby('Department').agg(['mean', 'sum'])

In [None]:
import pandas as pd

# Sample DataFrame: two numeric columns to aggregate per department
departments = ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT']
salaries = [40000, 60000, 55000, 70000, 45000, 52000]
bonuses = [2000, 3000, 2500, 3500, 2200, 2700]
df = pd.DataFrame({'Department': departments, 'Salary': salaries, 'Bonus': bonuses})

# Passing a list to agg() applies every listed function to every column,
# giving (column, function) pairs as the result's column labels.
aggregations = ['mean', 'sum']
result = df.groupby('Department').agg(aggregations)

print(result)

In [None]:
# Named aggregation: each keyword becomes an output column, computed by
# applying `aggfunc` to the input column `column`.
salary_by_department = df.groupby('Department')
salary_by_department.agg(
    AvgSalary=pd.NamedAgg(column='Salary', aggfunc='mean'),
    TotalSalary=pd.NamedAgg(column='Salary', aggfunc='sum'),
)

### apply() – Custom Logic
df.groupby('Department').apply(lambda x: x[x['Salary'] > 50000])

In [None]:
# Example
import pandas as pd

# Sample DataFrame
departments = ['HR', 'Finance', 'IT', 'Finance', 'HR', 'IT']
employees = ['Ali', 'Sara', 'John', 'Nasir', 'Talha', 'Hassan']
salaries = [40000, 60000, 55000, 70000, 45000, 52000]
df = pd.DataFrame({
    'Department': departments,
    'Employee': employees,
    'Salary': salaries,
})


def high_earners(group):
    """Return the rows of one department whose Salary is above 50000."""
    earns_above_threshold = group['Salary'] > 50000
    return group[earns_above_threshold]


# Filter: keep only employees with Salary > 50000 in each Department.
# apply() calls the function once per department and stitches the
# per-department results back together.
result = df.groupby('Department').apply(high_earners)

print(result)

### transform() – Preserve Shape

Returns same shape as original data.

df['Dept_Avg_Salary'] = df.groupby('Department')['Salary'].transform('mean')

In [None]:
# Example
import pandas as pd

# Sample DataFrame, written as one (Department, Employee, Salary) row per employee
employee_rows = [
    ('HR', 'Ali', 40000),
    ('Finance', 'Sara', 60000),
    ('IT', 'John', 55000),
    ('Finance', 'Nasir', 70000),
    ('HR', 'Talha', 45000),
    ('IT', 'Hassan', 52000),
]
df = pd.DataFrame(employee_rows, columns=['Department', 'Employee', 'Salary'])

# transform('mean') returns one value per original row (the mean of that
# row's department), so it can be assigned straight back as a new column.
salary_by_department = df.groupby('Department')['Salary']
department_average = salary_by_department.transform('mean')
df['Dept_Avg_Salary'] = department_average

print(df)