# Imports

In [1]:
import pandas as pd
import numpy as np

# Data

In [2]:
# Build a reproducible 10-row employee table.
# The legacy global seed fixes the random draws; the three random columns are
# generated in the order Age -> Salary -> Rating, which determines their values.
np.random.seed(0)
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Helen', 'Ian', 'Jane'],
    Age=np.random.randint(20, 40, size=10),          # integers in [20, 40)
    Department=['HR', 'IT', 'Finance', 'IT', 'HR', 'Finance', 'HR', 'Finance', 'IT', 'Finance'],
    Salary=np.random.randint(40000, 80000, size=10),  # integers in [40000, 80000)
    Rating=np.random.uniform(1, 5, 10).round(2),      # floats in [1, 5), two decimals
)
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Department,Salary,Rating
0,Alice,32,HR,54935,1.28
1,Bob,35,IT,55430,1.35
2,Charlie,20,Finance,79512,1.08
3,David,23,IT,54650,4.33
4,Eva,23,HR,57089,4.11
5,Frank,27,Finance,72230,4.48
6,Grace,29,HR,58983,4.91
7,Helen,39,Finance,64152,4.2
8,Ian,38,IT,75665,2.85
9,Jane,24,Finance,56921,4.12


# Basic Operations

In [5]:
# Keep only the employees older than 30 (boolean-mask form)
filter_age = df.loc[df['Age'].gt(30)]
# Same filter written as a query string; this assignment replaces the one above
filter_age = df.query("Age > 30")

filter_age

Unnamed: 0,Name,Age,Department,Salary,Rating
0,Alice,32,HR,54935,1.28
1,Bob,35,IT,55430,1.35
7,Helen,39,Finance,64152,4.2
8,Ian,38,IT,75665,2.85


In [11]:
# Combine conditions: element-wise `&` (and) / `|` (or) on boolean Series
filter_multiple = df.loc[df['Age'].gt(30) & df['Department'].eq('IT')]
# Same filter as a query string, where plain `and` / `or` keywords are used instead
filter_multiple = df.query("Age > 30 and Department == 'IT'")

filter_multiple

Unnamed: 0,Name,Age,Department,Salary,Rating
1,Bob,35,IT,55430,1.35
8,Ian,38,IT,75665,2.85


In [14]:
# Membership filter: keep rows whose Department is one of the listed values
filter_isin = df.loc[df['Department'].isin(['HR', 'Finance'])]
# Same filter as a query string, using the `in` operator
filter_isin = df.query("Department in ('HR', 'Finance')")

filter_isin

Unnamed: 0,Name,Age,Department,Salary,Rating
0,Alice,32,HR,54935,1.28
2,Charlie,20,Finance,79512,1.08
4,Eva,23,HR,57089,4.11
5,Frank,27,Finance,72230,4.48
6,Grace,29,HR,58983,4.91
7,Helen,39,Finance,64152,4.2
9,Jane,24,Finance,56921,4.12


In [15]:
# Order the rows from highest to lowest salary; rows keep their original index labels
sorted_df = df.sort_values('Salary', ascending=False)
sorted_df

Unnamed: 0,Name,Age,Department,Salary,Rating
2,Charlie,20,Finance,79512,1.08
8,Ian,38,IT,75665,2.85
5,Frank,27,Finance,72230,4.48
7,Helen,39,Finance,64152,4.2
6,Grace,29,HR,58983,4.91
4,Eva,23,HR,57089,4.11
9,Jane,24,Finance,56921,4.12
1,Bob,35,IT,55430,1.35
0,Alice,32,HR,54935,1.28
3,David,23,IT,54650,4.33


In [17]:
# Pick specific rows by their index labels (all columns are kept)
filter_index = df.loc[[1, 3, 5], :]
filter_index

Unnamed: 0,Name,Age,Department,Salary,Rating
1,Bob,35,IT,55430,1.35
3,David,23,IT,54650,4.33
5,Frank,27,Finance,72230,4.48


In [18]:
# .loc selects by label: the slice 0:3 includes BOTH end labels, so four rows come back
loc_example = df[['Name', 'Department']].loc[0:3]
loc_example

Unnamed: 0,Name,Department
0,Alice,HR
1,Bob,IT
2,Charlie,Finance
3,David,IT


In [19]:
# .iloc selects by integer position: the end of each slice is excluded,
# so this returns the first three rows and the columns at positions 1, 2 and 3
iloc_example = df.iloc[:3, 1:4]
iloc_example

Unnamed: 0,Age,Department,Salary
0,32,HR,54935
1,35,IT,55430
2,20,Finance,79512


In [20]:
# Bucket Age into labelled ranges and store the result as a new column on df.
# Bins are closed on the right: (19, 25], (25, 30], (30, 35], (35, 40].
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=[19, 25, 30, 35, 40],
    labels=['20-25', '26-30', '31-35', '36-40'],
    right=True,
)
df

Unnamed: 0,Name,Age,Department,Salary,Rating,Age_Group
0,Alice,32,HR,54935,1.28,31-35
1,Bob,35,IT,55430,1.35,31-35
2,Charlie,20,Finance,79512,1.08,20-25
3,David,23,IT,54650,4.33,20-25
4,Eva,23,HR,57089,4.11,20-25
5,Frank,27,Finance,72230,4.48,26-30
6,Grace,29,HR,58983,4.91,26-30
7,Helen,39,Finance,64152,4.2,36-40
8,Ian,38,IT,75665,2.85,36-40
9,Jane,24,Finance,56921,4.12,20-25


# Groupby

In [None]:
# Group by department and aggregate with named aggregation.
# Pattern: .agg(output_column_name=pd.NamedAgg(column=source_column, aggfunc=aggregation))
# The aggregation can be a function name ('mean', 'sum') or any callable, e.g. a lambda.

grouped_named_agg = (
    df.groupby('Department')
    .agg(
        Avg_Salary=pd.NamedAgg(column='Salary', aggfunc='mean'),
        Total_Salary=pd.NamedAgg(column='Salary', aggfunc='sum'),
        Avg_Rating=pd.NamedAgg(column='Rating', aggfunc='mean'),
        Salary_75th_Quantile=pd.NamedAgg(column='Salary', aggfunc=lambda x: x.quantile(0.75)),
    )
    .reset_index()  # turn the Department group keys back into a regular column
)
grouped_named_agg

Unnamed: 0,Department,Avg_Salary,Total_Salary,Avg_Rating,Salary_75th_Quantile
0,Finance,68203.75,272815,3.47,74050.5
1,HR,57002.333333,171007,3.433333,58036.0
2,IT,61915.0,185745,2.843333,65547.5


In [26]:
# Mean and sum of a single column per department, each result given its own column name.
# On a single selected column, named aggregation takes output_name=function pairs.
grouped_mean_sum = (
    df.groupby('Department')['Salary']
    .agg(Salary_Mean='mean', Salary_Sum='sum')
    .reset_index()
)
grouped_mean_sum

Unnamed: 0,Department,Salary_Mean,Salary_Sum
0,Finance,68203.75,272815
1,HR,57002.333333,171007
2,IT,61915.0,185745


In [31]:
# Mean and sum of several columns per department.
# Passing a list of functions to .agg produces two-level columns: (column, function).
grouped_mean_sum = df.groupby('Department')[['Salary', 'Rating']].agg(['mean', 'sum']).reset_index()

# Flatten the two-level columns into "<column>_<function>" names.
# After reset_index the Department column has an empty second level, so only the
# non-empty levels are joined; this avoids a trailing '_' without stripping
# underscores that legitimately start or end a column or function name.
grouped_mean_sum.columns = ['_'.join(level for level in col if level) for col in grouped_mean_sum.columns]

grouped_mean_sum

Unnamed: 0,Department,Salary_mean,Salary_sum,Rating_mean,Rating_sum
0,Finance,68203.75,272815,3.47,13.88
1,HR,57002.333333,171007,3.433333,10.3
2,IT,61915.0,185745,2.843333,8.53


# Groupby vs Transform

In [33]:
# Small six-row table (two employees per department) used to contrast
# groupby aggregation with transform. Note: this rebinds the name `df`.
df = pd.DataFrame(
    dict(
        Department=['HR', 'HR', 'IT', 'IT', 'Finance', 'Finance'],
        Employee=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
        Salary=[50000, 55000, 60000, 62000, 70000, 72000],
    )
)
df

Unnamed: 0,Department,Employee,Salary
0,HR,Alice,50000
1,HR,Bob,55000
2,IT,Charlie,60000
3,IT,David,62000
4,Finance,Eva,70000
5,Finance,Frank,72000


In [None]:
# Aggregation collapses each department to a single row holding its mean salary
df.groupby('Department')['Salary'].mean().reset_index()

Unnamed: 0,Department,Salary
0,Finance,71000.0
1,HR,52500.0
2,IT,61000.0


In [None]:
# transform returns one value per original row (aligned on the index), so each
# employee's row can carry the mean salary of their department
df['Dept_Avg_Salary'] = df['Salary'].groupby(df['Department']).transform('mean')
df

Unnamed: 0,Department,Employee,Salary,Dept_Avg_Salary
0,HR,Alice,50000,52500.0
1,HR,Bob,55000,52500.0
2,IT,Charlie,60000,61000.0
3,IT,David,62000,61000.0
4,Finance,Eva,70000,71000.0
5,Finance,Frank,72000,71000.0


# Joins

In [36]:
# Two small tables for the join examples.
# employees references departments through Dept_ID; Dept_ID 104 has no row in
# departments, which makes the join types below return different results.
employees = pd.DataFrame(
    dict(
        Emp_ID=[1, 2, 3, 4, 5],
        Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
        Dept_ID=[101, 102, 101, 103, 104],
    )
)

departments = pd.DataFrame(
    dict(
        Dept_ID=[101, 102, 103],
        Department=['HR', 'IT', 'Finance'],
    )
)

In [37]:
# Show both input tables, one after the other, with rich rendering
display(employees, departments)

Unnamed: 0,Emp_ID,Name,Dept_ID
0,1,Alice,101
1,2,Bob,102
2,3,Charlie,101
3,4,David,103
4,5,Eva,104


Unnamed: 0,Dept_ID,Department
0,101,HR
1,102,IT
2,103,Finance


In [38]:
# Inner join: keep only rows whose Dept_ID appears in BOTH tables
inner_join = employees.merge(departments, how='inner', on='Dept_ID')
inner_join

Unnamed: 0,Emp_ID,Name,Dept_ID,Department
0,1,Alice,101,HR
1,2,Bob,102,IT
2,3,Charlie,101,HR
3,4,David,103,Finance


In [39]:
# Left join: keep every employee; Department is missing where no Dept_ID matches
left_join = employees.merge(departments, how='left', on='Dept_ID')
left_join

Unnamed: 0,Emp_ID,Name,Dept_ID,Department
0,1,Alice,101,HR
1,2,Bob,102,IT
2,3,Charlie,101,HR
3,4,David,103,Finance
4,5,Eva,104,


In [40]:
# Right join: keep every department; employees without a matching Dept_ID are dropped
right_join = employees.merge(departments, how='right', on='Dept_ID')
right_join

Unnamed: 0,Emp_ID,Name,Dept_ID,Department
0,1,Alice,101,HR
1,3,Charlie,101,HR
2,2,Bob,102,IT
3,4,David,103,Finance


In [41]:
# Outer join: keep rows from both tables, leaving gaps where there is no match
outer_join = employees.merge(departments, how='outer', on='Dept_ID')
outer_join

Unnamed: 0,Emp_ID,Name,Dept_ID,Department
0,1,Alice,101,HR
1,3,Charlie,101,HR
2,2,Bob,102,IT
3,4,David,103,Finance
4,5,Eva,104,


In [None]:
# Table for the self-join example: Mentor_ID refers to another row's Emp_ID.
# Alice has no mentor (None), so Mentor_ID is stored as a float column with a missing value.
mentorship = pd.DataFrame(
    dict(
        Emp_ID=[1, 2, 3, 4],
        Name=['Alice', 'Bob', 'Charlie', 'David'],
        Mentor_ID=[None, 1, 1, 2],
    )
)

mentorship

Unnamed: 0,Emp_ID,Name,Mentor_ID
0,1,Alice,
1,2,Bob,1.0
2,3,Charlie,1.0
3,4,David,2.0


In [44]:
# Self join: look up each employee's mentor name in the same table.
# The left Mentor_ID is matched against the right Emp_ID. Overlapping right-hand
# columns get the '_Mentor' suffix, and the duplicated key (Emp_ID_Mentor) is
# dropped because it only repeats Mentor_ID.
self_join = (
    mentorship
    .merge(
        mentorship[['Emp_ID', 'Name']],
        how='left',
        left_on='Mentor_ID',
        right_on='Emp_ID',
        suffixes=('', '_Mentor'),
    )
    .drop(columns='Emp_ID_Mentor')
)

self_join

Unnamed: 0,Emp_ID,Name,Mentor_ID,Name_Mentor
0,1,Alice,,
1,2,Bob,1.0,Alice
2,3,Charlie,1.0,Alice
3,4,David,2.0,Bob
