# Advanced Pandas Operations - Exercises and Solutions

This notebook contains exercises to reinforce your understanding of advanced Pandas operations, along with their solutions.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's legacy global RNG once, before any np.random.* call, so the
# random draws in the cells below repeat on a fresh kernel run.
SEED = 42
np.random.seed(SEED)

## Exercise 1: Complex Filtering Operations

Create a DataFrame with at least 1000 rows and 5 columns of various data types. Perform complex filtering operations using boolean indexing and .loc/.iloc.

In [2]:
# Create the DataFrame: 1000 rows, one column per kind of data
# (int, str, float, datetime, bool).
df = pd.DataFrame({
    'A': np.random.randint(1, 100, 1000),
    'B': np.random.choice(['X', 'Y', 'Z'], 1000),
    'C': np.random.randn(1000),
    'D': pd.date_range(start='2023-01-01', periods=1000),
    # Real booleans (not the strings 'True'/'False') so the column can be used
    # directly as a mask and gives the frame a genuine bool dtype.
    'E': np.random.choice([True, False], 1000)
})

# Display the first few rows
print(df.head())

# Complex filtering using boolean indexing: element-wise masks are combined
# with &, and each comparison needs its own parentheses.
# NOTE: the 1000 daily dates run past 2023, so `month == 1` keeps January rows
# from every year in the range, not only January 2023.
filtered_df = df[(df['A'] > 50) & (df['B'] == 'X') & (df['C'] > 0) & (df['D'].dt.month == 1)]
print("\nFiltered DataFrame:")
print(filtered_df)

# Using .loc for label-based indexing: a row mask plus a list of column labels.
# 'E' is already boolean, so it is used as a mask without any comparison.
loc_filtered = df.loc[(df['A'] > 75) & df['E'], ['B', 'C', 'D']]
print("\nFiltered using .loc:")
print(loc_filtered)

# Using .iloc for integer-based indexing: rows 10-19 (end-exclusive slice) and
# the columns at positions 0, 2 and 4 ('A', 'C', 'E').
iloc_filtered = df.iloc[10:20, [0, 2, 4]]
print("\nFiltered using .iloc:")
print(iloc_filtered)

    A  B         C          D      E
0  52  Y  0.651409 2023-01-01   True
1  93  Z  1.976544 2023-01-02   True
2  15  Z  1.351478 2023-01-03  False
3  72  X  0.464135 2023-01-04  False
4  61  Y -0.579701 2023-01-05  False

Filtered DataFrame:
      A  B         C          D      E
3    72  X  0.464135 2023-01-04  False
8    75  X  0.650282 2023-01-09  False
374  53  X  0.816117 2024-01-10   True
379  63  X  0.553833 2024-01-15   True
390  67  X  0.808929 2024-01-26   True
759  63  X  0.241048 2025-01-29   True

Filtered using .loc:
     B         C          D
1    Z  1.976544 2023-01-02
24   Z -1.344685 2023-01-25
27   Z  0.351755 2023-01-28
29   Y  0.502790 2023-01-30
32   Y  0.873783 2023-02-02
..  ..       ...        ...
942  Y -0.545446 2025-07-31
954  X -0.083601 2025-08-12
956  Y  2.673511 2025-08-14
957  X  0.956834 2025-08-15
978  Y  1.084527 2025-09-05

[112 rows x 3 columns]

Filtered using .iloc:
     A         C      E
10  88 -0.298303  False
11  24  2.163656   True
12   3 

## Exercise 2: Creating New Columns and Applying Functions

Using the same DataFrame, create new columns based on complex conditions and apply custom functions using apply() and applymap() (renamed to DataFrame.map() in pandas 2.1, where applymap() is deprecated).

In [3]:
# Create a new column based on complex conditions.
# The 'High' rule is tested first, then 'Medium'; every other row is 'Low'.
is_high = (df['A'] > 50) & (df['B'] == 'X')
is_medium = (df['A'] > 25) & (df['B'] == 'Y')
df['F'] = np.where(is_high, 'High', np.where(is_medium, 'Medium', 'Low'))

# Custom function to apply to a column
def categorize_c(value):
    """Bucket a single numeric value into a coarse text label.

    Parameters
    ----------
    value : scalar number
        The value to classify (one element of a column).

    Returns
    -------
    str or missing value
        'Very Low' for value < -1, 'Low' for -1 <= value < 0,
        'Medium' for 0 <= value < 1, and 'High' for value >= 1.
        A missing value (NaN/None) is returned unchanged rather than being
        labelled, because NaN fails every comparison and would otherwise
        fall through to 'High'.
    """
    if pd.isna(value):
        return value
    # Thresholds are checked in ascending order, so each branch only needs
    # its upper bound.
    if value < -1:
        return 'Very Low'
    if value < 0:
        return 'Low'
    if value < 1:
        return 'Medium'
    return 'High'

# Apply the custom function to column C
df['G'] = df['C'].apply(categorize_c)

# Apply a function to every element: floats are rendered with two decimals,
# everything else goes through str().
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and emits a
# FutureWarning there, so use map when this pandas provides it and fall back to
# applymap otherwise. The check is made on the class because attribute access
# on an instance can also resolve to a column name.
elementwise = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'
df_formatted = getattr(df, elementwise)(
    lambda x: f"{x:.2f}" if isinstance(x, float) else str(x)
)

print(df.head())
print("\nFormatted DataFrame:")
print(df_formatted.head())

    A  B         C          D      E       F       G
0  52  Y  0.651409 2023-01-01   True  Medium  Medium
1  93  Z  1.976544 2023-01-02   True     Low    High
2  15  Z  1.351478 2023-01-03  False     Low    High
3  72  X  0.464135 2023-01-04  False    High  Medium
4  61  Y -0.579701 2023-01-05  False  Medium     Low

Formatted DataFrame:
    A  B      C                    D      E       F       G
0  52  Y   0.65  2023-01-01 00:00:00   True  Medium  Medium
1  93  Z   1.98  2023-01-02 00:00:00   True     Low    High
2  15  Z   1.35  2023-01-03 00:00:00  False     Low    High
3  72  X   0.46  2023-01-04 00:00:00  False    High  Medium
4  61  Y  -0.58  2023-01-05 00:00:00  False  Medium     Low


  df_formatted = df.applymap(lambda x: f"{x:.2f}" if isinstance(x, float) else str(x))


## Exercise 3: Advanced GroupBy Operations

Perform advanced groupby operations with multiple columns and custom aggregation functions.

In [4]:
# Group by multiple columns.
# Only the columns the aggregation reads ('A', 'C', 'E') are selected: calling
# .apply on a groupby that still includes the grouping columns is deprecated in
# pandas 2.2+, and an explicit selection behaves the same on every version.
grouped = df.groupby(['B', 'F'])[['A', 'C', 'E']]

# Custom aggregation function
def custom_agg(x):
    """Summarise one group of rows as a single labelled Series.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group; must contain the columns 'A', 'C' and 'E'.

    Returns
    -------
    pandas.Series
        'A_mean'   - mean of column 'A'
        'C_median' - median of column 'C'
        'E_mode'   - first mode of column 'E', or NaN when 'E' has no
                     non-missing values in this group
        'count'    - number of rows in the group
    """
    # Series.mode() drops missing values, so it can come back empty; guard the
    # positional lookup instead of raising IndexError for such a group.
    e_modes = x['E'].mode()
    e_mode = e_modes.iloc[0] if not e_modes.empty else np.nan
    return pd.Series({
        'A_mean': x['A'].mean(),
        'C_median': x['C'].median(),
        'E_mode': e_mode,
        'count': len(x)
    })

# Apply custom aggregation: custom_agg is called once per group and returns a
# Series, so the results are stacked into one row per (B, F) combination with
# the Series labels as columns.
result = grouped.apply(custom_agg)
print(result)

             A_mean  C_median E_mode  count
B F                                        
X High    74.720930 -0.057768  False    172
  Low     25.369697 -0.004699   True    165
Y Low     12.103448 -0.027158  False     87
  Medium  63.585366  0.178578   True    246
Z Low     47.960606  0.068112   True    330


## Exercise 4: Time Series Operations

Create a time series DataFrame and perform resampling, rolling window calculations, and time-based indexing operations.

In [5]:
# Create a time series DataFrame: one random value per calendar day of 2023,
# indexed by date. set_index returns a new frame, so no inplace mutation is
# needed.
dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
ts_df = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(len(dates))
}).set_index('date')

# Resample to monthly frequency (month-end bins, as in the 'M' alias).
# The offset object is passed instead of the string alias: pandas 2.2
# deprecated 'M' in favour of 'ME', while older versions reject 'ME'; the
# MonthEnd object is accepted by both.
monthly = ts_df.resample(pd.offsets.MonthEnd()).mean()
print("Monthly resampled data:")
print(monthly)

# Calculate 7-day rolling average. The first six rows are NaN because the
# window is not yet full.
ts_df['7d_rolling_avg'] = ts_df['value'].rolling(window=7).mean()

# Time-based indexing: slice the DatetimeIndex with date strings. .loc makes
# it explicit that rows are selected by label; both endpoints are included.
q1_data = ts_df.loc['2023-01-01':'2023-03-31']
print("\nQ1 2023 data:")
print(q1_data)

# Plot the original data and the rolling average.
# The explicit Figure/Axes interface is used, and both axes are labelled so the
# figure can be read on its own.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(ts_df.index, ts_df['value'], label='Original')
ax.plot(ts_df.index, ts_df['7d_rolling_avg'], label='7-day Rolling Avg')
ax.set_title('Time Series Data with Rolling Average')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
plt.show()

Monthly resampled data:
               value
date                
2023-01-31  0.241367
2023-02-28 -0.082185
2023-03-31  0.043465
2023-04-30 -0.169617
2023-05-31 -0.223053
2023-06-30  0.278560
2023-07-31 -0.187679
2023-08-31  0.112012
2023-09-30  0.165463
2023-10-31  0.183575
2023-11-30 -0.091846
2023-12-31  0.420903

Q1 2023 data:
               value  7d_rolling_avg
date                                
2023-01-01  1.765709             NaN
2023-01-02 -0.627883             NaN
2023-01-03 -0.885954             NaN
2023-01-04 -0.091174             NaN
2023-01-05  2.045817             NaN
...              ...             ...
2023-03-27 -0.158617        0.016906
2023-03-28  0.042483       -0.209331
2023-03-29 -1.003891       -0.285148
2023-03-30  0.856059       -0.158487
2023-03-31  0.957790        0.075619

[90 rows x 2 columns]


## Exercise 5: Merging DataFrames

Merge multiple DataFrames using different join types and handle cases with missing data.

In [6]:
# Create sample DataFrames: two small tables sharing the key column 'id'.
# ids 3-5 occur in both, 1-2 only in df1 and 6-7 only in df2.
df1 = pd.DataFrame(dict(
    id=range(1, 6),
    name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    age=[25, 30, 35, 40, 45],
))

df2 = pd.DataFrame(dict(
    id=range(3, 8),
    city=['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
    salary=[50000, 60000, 70000, 80000, 90000],
))

# Inner join: only ids present in both tables
inner_join = df1.merge(df2, how='inner', on='id')
print("Inner Join:")
print(inner_join)

# Outer join: every id from either table; unmatched cells become NaN
outer_join = df1.merge(df2, how='outer', on='id')
print("\nOuter Join:")
print(outer_join)

# Left join: every row of df1, with df2 columns filled in where an id matches
left_join = df1.merge(df2, how='left', on='id')
print("\nLeft Join:")
print(left_join)

# Handle missing data: per-column replacement values. Only 'city' and 'salary'
# are filled; missing 'name'/'age' values are left as NaN.
missing_defaults = {'city': 'Unknown', 'salary': 0}
filled_join = outer_join.fillna(value=missing_defaults)
print("\nFilled Join:")
print(filled_join)

Inner Join:
   id     name  age      city  salary
0   3  Charlie   35  New York   50000
1   4    David   40    London   60000
2   5      Eve   45     Paris   70000

Outer Join:
   id     name   age      city   salary
0   1    Alice  25.0       NaN      NaN
1   2      Bob  30.0       NaN      NaN
2   3  Charlie  35.0  New York  50000.0
3   4    David  40.0    London  60000.0
4   5      Eve  45.0     Paris  70000.0
5   6      NaN   NaN     Tokyo  80000.0
6   7      NaN   NaN    Sydney  90000.0

Left Join:
   id     name  age      city   salary
0   1    Alice   25       NaN      NaN
1   2      Bob   30       NaN      NaN
2   3  Charlie   35  New York  50000.0
3   4    David   40    London  60000.0
4   5      Eve   45     Paris  70000.0

Filled Join:
   id     name   age      city   salary
0   1    Alice  25.0   Unknown      0.0
1   2      Bob  30.0   Unknown      0.0
2   3  Charlie  35.0  New York  50000.0
3   4    David  40.0    London  60000.0
4   5      Eve  45.0     Paris  70000.0
5  

## Exercise 6: Performance Optimization

Optimize a large DataFrame (1 million rows or more) using categorical data types and vectorization techniques. Compare the performance before and after optimization.

In [7]:
# Create a large DataFrame: a running id, a low-cardinality string label and a
# random float per row.
N_ROWS = 1_000_000
CATEGORY_LABELS = ['A', 'B', 'C', 'D', 'E']
large_df = pd.DataFrame({
    'id': np.arange(N_ROWS),
    'category': np.random.choice(CATEGORY_LABELS, N_ROWS),
    'value': np.random.randn(N_ROWS)
})

# Memory usage before optimization (deep=True also counts the string payloads)
print("Memory usage before optimization:")
memory_before = large_df.memory_usage(deep=True)
print(memory_before)

# Optimize using categorical data type: only the 'category' column is converted
large_df = large_df.astype({'category': 'category'})

# Memory usage after optimization
print("\nMemory usage after optimization:")
memory_after = large_df.memory_usage(deep=True)
print(memory_after)

# Performance comparison for a simple operation
def slow_operation(df):
    """Add 1 to every positive entry of df['value'], one element at a time.

    This is the deliberately slow baseline: it walks the column in a
    Python-level loop. Entries that are not greater than zero are passed
    through unchanged.

    Returns a plain list with one item per entry of df['value'].
    """
    shifted = []
    for x in df['value']:
        if x > 0:
            shifted.append(x + 1)
        else:
            shifted.append(x)
    return shifted

def fast_operation(df):
    """Add 1 to every positive entry of df['value'] in one vectorised step.

    np.where picks `value + 1` where the entry is greater than zero and the
    original entry elsewhere, with no Python-level loop.

    Returns a NumPy array with one item per entry of df['value'].
    """
    values = df['value']
    is_positive = values > 0
    return np.where(is_positive, values + 1, values)

# IPython line magics (notebook-only syntax): %time runs the statement once and
# reports its CPU and wall-clock time. The result is bound to `_` because only
# the timing matters here, not the returned values.
%time _ = slow_operation(large_df)
%time _ = fast_operation(large_df)

print("\nNote the significant performance difference between the slow and fast operations.")

Memory usage before optimization:
Index             72
id           4000000
category    30000000
value        8000000
dtype: int64

Memory usage after optimization:
Index            72
id          4000000
category    1000246
value       8000000
dtype: int64
CPU times: user 1.33 s, sys: 46.7 ms, total: 1.37 s
Wall time: 1.41 s
CPU times: user 147 ms, sys: 26.7 ms, total: 173 ms
Wall time: 237 ms

Note the significant performance difference between the slow and fast operations.


These exercises cover a wide range of advanced Pandas operations and should help reinforce your understanding of the concepts. Remember to experiment with different parameters and try these techniques on your own datasets to gain more practical experience.