11. Aggregation & GroupBy

In [2]:
import pandas as pd

# Sample employee table used by every cell below: one tuple per employee,
# in the column order given by `columns`.
employee_rows = [
    (1, "Aman", "IT", 50000, 1),
    (2, "Riya", "HR", 48000, 2),
    (3, "Kunal", "IT", 62000, 4),
    (4, "Sneha", "Finance", 70000, 6),
    (5, "Vikram", "HR", 45000, 3),
    (6, "Pooja", "IT", 58000, 2),
    (7, "Rahul", "Finance", 75000, 7),
    (8, "Neha", "HR", 52000, 4),
]

df = pd.DataFrame(
    employee_rows,
    columns=["EmpID", "Name", "Department", "Salary", "Experience"],
)

# Bare last expression so the notebook renders the whole table.
df


Unnamed: 0,EmpID,Name,Department,Salary,Experience
0,1,Aman,IT,50000,1
1,2,Riya,HR,48000,2
2,3,Kunal,IT,62000,4
3,4,Sneha,Finance,70000,6
4,5,Vikram,HR,45000,3
5,6,Pooja,IT,58000,2
6,7,Rahul,Finance,75000,7
7,8,Neha,HR,52000,4


In [3]:
# 1. Average salary: the mean of the Salary column over all rows
df["Salary"].mean()

np.float64(57500.0)

In [4]:
# Total salary: the sum of the Salary column over all rows
df["Salary"].sum()

np.int64(460000)

In [5]:
# Max & Min salary: both printed by one call, one value per line (max first)
print(df["Salary"].max(), df["Salary"].min(), sep="\n")

75000
45000


In [6]:
# Count employees: count() tallies the non-null entries of the EmpID column.
# The value is printed, so it appears as plain text rather than a cell result.
print(df["EmpID"].count())

8


In [7]:
# Multiple aggregations at once: passing a list of function names to agg()
# computes them in a single call and returns a Series indexed by those names.
# All four results share one Series, so the integer sum/max/min are shown as
# floats alongside the mean.
df["Salary"].agg(["mean","sum","max","min"])

mean     57500.0
sum     460000.0
max      75000.0
min      45000.0
Name: Salary, dtype: float64

In [8]:
# PART 2 — GROUPBY (CORE CONCEPT)
# General syntax: df.groupby("column")["target_column"].aggregation()
# Department-wise average salary
df.groupby("Department")["Salary"].mean()

Department
Finance    72500.000000
HR         48333.333333
IT         56666.666667
Name: Salary, dtype: float64

In [12]:
# Department-wise total salary: sum of Salary within each Department group
df.groupby("Department")["Salary"].sum()

Department
Finance    145000
HR         145000
IT         170000
Name: Salary, dtype: int64

In [13]:
# Department-wise employee count: number of non-null EmpID values per Department
df.groupby("Department")["EmpID"].count()


Department
Finance    2
HR         3
IT         3
Name: EmpID, dtype: int64

In [17]:
# Department-wise multiple aggregations: a list passed to agg() yields a
# DataFrame with one row per department and one column per statistic,
# in the order the statistics are listed.
df.groupby("Department")["Salary"].agg(["mean","max","sum","min"])

Unnamed: 0_level_0,mean,max,sum,min
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finance,72500.0,75000,145000,70000
HR,48333.333333,52000,145000,45000
IT,56666.666667,62000,170000,50000


In [19]:
# GROUPBY ON MULTIPLE COLUMNS
# Department + Experience wise avg salary: passing a list of columns makes one
# group per distinct (Department, Experience) pair, and the result is indexed
# by both keys.
df.groupby(["Department","Experience"])["Salary"].mean()

Department  Experience
Finance     6             70000.0
            7             75000.0
HR          2             48000.0
            3             45000.0
            4             52000.0
IT          1             50000.0
            2             58000.0
            4             62000.0
Name: Salary, dtype: float64

In [22]:
# GROUPBY WITH MULTIPLE COLUMNS AGGREGATION
# The dict maps each column to the statistic(s) to compute for it; a list value
# produces several result columns for that source column.
per_column_stats = {
    "Salary": ["mean", "max"],
    "Experience": "mean",
}
df.groupby("Department").agg(per_column_stats)


Unnamed: 0_level_0,Salary,Salary,Experience
Unnamed: 0_level_1,mean,max,mean
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Finance,72500.0,75000,6.5
HR,48333.333333,52000,3.0
IT,56666.666667,62000,2.333333


In [25]:
# RESET INDEX
# The grouped mean is a Series indexed by Department. reset_index() returns a
# DataFrame in which Department is an ordinary column next to Salary.
# The return value is only displayed, not assigned, so `result` itself is unchanged.
result = df.groupby("Department")["Salary"].mean()
result.reset_index()

Unnamed: 0,Department,Salary
0,Finance,72500.0
1,HR,48333.333333
2,IT,56666.666667


In [26]:
# Show the full employee table again for reference
df

Unnamed: 0,EmpID,Name,Department,Salary,Experience
0,1,Aman,IT,50000,1
1,2,Riya,HR,48000,2
2,3,Kunal,IT,62000,4
3,4,Sneha,Finance,70000,6
4,5,Vikram,HR,45000,3
5,6,Pooja,IT,58000,2
6,7,Rahul,Finance,75000,7
7,8,Neha,HR,52000,4


In [27]:
# Department-wise average experience: mean of Experience within each Department
df.groupby("Department")["Experience"].mean()

Department
Finance    6.500000
HR         3.000000
IT         2.333333
Name: Experience, dtype: float64

In [28]:
# Department-wise maximum salary: largest Salary within each Department
df.groupby("Department")["Salary"].max()

Department
Finance    75000
HR         52000
IT         62000
Name: Salary, dtype: int64

In [29]:
# Count employees in each department (non-null EmpID values per group).
# This is the same query as the earlier "Department-wise employee count" cell.
df.groupby("Department")["EmpID"].count()

Department
Finance    2
HR         3
IT         3
Name: EmpID, dtype: int64

In [32]:
# Find department with highest average salary: idxmax() returns the index
# label (here the department name) of the largest per-department mean.
df.groupby("Department")["Salary"].mean().idxmax()

'Finance'

In [33]:
# Find department with lowest average salary: idxmin() returns the index
# label (here the department name) of the smallest per-department mean.
df.groupby("Department")["Salary"].mean().idxmin()

'HR'

In [34]:
# Top 2 departments ranked by average salary (highest first)
df.groupby("Department")["Salary"].mean().nlargest(2)

Department
Finance    72500.000000
IT         56666.666667
Name: Salary, dtype: float64