# Advanced Pandas Techniques

This notebook covers advanced Pandas features and techniques for more complex data manipulation and analysis.

In [1]:
import pandas as pd
import numpy as np

# Confirm the import worked and record which pandas release produced the
# outputs saved in this notebook.
print(
    "Pandas imported successfully!",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

Pandas imported successfully!
Pandas version: 2.3.0


## Advanced Indexing with MultiIndex

MultiIndex allows you to have multiple levels of indexing, enabling hierarchical data structures for complex data organization.

In [3]:
# MultiIndex Examples
import pandas as pd
import numpy as np

# Walk-through of hierarchical (MultiIndex) indexing: building a MultiIndex on
# rows and on columns, selecting by level, reshaping with stack/unstack, and
# grouping by the index levels.

# Use a dedicated, seeded generator for the example data so that every
# Restart & Run All prints the same numbers (the global np.random state is
# unseeded and would give different values on each run).
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Create MultiIndex from arrays: one array per level, aligned position by position
arrays = [['A', 'A', 'B', 'B'], ['one', 'two', 'one', 'two']]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df = pd.DataFrame(rng.standard_normal((4, 2)), index=index, columns=['X', 'Y'])
print("DataFrame with MultiIndex:")
print(df)

# Create MultiIndex from tuples: one tuple per row, one element per level
tuples = [('A', 'one'), ('A', 'two'), ('B', 'one'), ('B', 'two')]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df_tuples = pd.DataFrame(rng.standard_normal((4, 2)), index=index, columns=['X', 'Y'])
print("\nDataFrame with MultiIndex from tuples:")
print(df_tuples)

# Accessing data with MultiIndex
# A single label selects on the outermost level and drops that level from the result.
print("\nAccessing level 0 'A':")
print(df.loc['A'])

# A full tuple addresses exactly one row, returned as a Series.
print("\nAccessing specific combination ('A', 'one'):")
print(df.loc[('A', 'one')])

# Cross-section with xs(): select on an inner level without slicing the outer one
print("\nCross-section for second level 'one':")
print(df.xs('one', level='second'))

# MultiIndex for columns: (subject, exam) pairs as a two-level column header
columns = pd.MultiIndex.from_arrays([['Math', 'Math', 'Science', 'Science'], ['Midterm', 'Final', 'Midterm', 'Final']])
# Scores are integers in [50, 100); the upper bound is exclusive.
df_multi_col = pd.DataFrame(rng.integers(50, 100, size=(3, 4)), columns=columns)
print("\nDataFrame with MultiIndex columns:")
print(df_multi_col)

# Accessing MultiIndex columns
print("\nAccessing Math scores:")
print(df_multi_col['Math'])

# The column levels are unnamed here, so the inner level is addressed by position (level=1).
print("\nAccessing Midterm scores for all subjects:")
print(df_multi_col.xs('Midterm', axis=1, level=1))

# Stacking and unstacking
# stack() moves the column labels into a new innermost index level (Series result).
stacked = df.stack()
print("\nStacked DataFrame:")
print(stacked)

# unstack() pivots the innermost index level back out into columns.
unstacked = stacked.unstack()
print("\nUnstacked DataFrame:")
print(unstacked)

# Grouping with MultiIndex
# reset_index() turns the index levels into ordinary columns so they can be
# used as group keys. Each (first, second) pair occurs once here, so the mean
# of each group is simply the original row.
df_multi = df.reset_index()
grouped = df_multi.groupby(['first', 'second']).mean()
print("\nGrouped by MultiIndex levels:")
print(grouped)

DataFrame with MultiIndex:
                     X         Y
first second                    
A     one    -0.225104  0.259974
      two     2.298644 -0.205131
B     one     0.061810  0.862524
      two    -0.453943  0.538161

DataFrame with MultiIndex from tuples:
                     X         Y
first second                    
A     one     0.855809 -0.871186
      two    -0.577840 -2.563211
B     one     0.803069  0.578024
      two     1.161659  0.377295

Accessing level 0 'A':
               X         Y
second                    
one    -0.225104  0.259974
two     2.298644 -0.205131

Accessing specific combination ('A', 'one'):
X   -0.225104
Y    0.259974
Name: (A, one), dtype: float64

Cross-section for second level 'one':
              X         Y
first                    
A     -0.225104  0.259974
B      0.061810  0.862524

DataFrame with MultiIndex columns:
     Math       Science      
  Midterm Final Midterm Final
0      74    62      89    93
1      99    81      56    93
2

## Working with Categorical Data

Categorical data types in Pandas provide efficient storage and operations for data with a limited number of possible values.

In [None]:
# Categorical Data Examples
import pandas as pd
import numpy as np

# Walk-through of the pandas category dtype: conversion, ordered categories,
# editing the category set, memory footprint, grouping and sorting.

# Small employee table; every column starts out as plain Python strings.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR'],
    'Performance': ['Good', 'Excellent', 'Good', 'Average', 'Excellent'],
    'City': ['NYC', 'LA', 'Chicago', 'NYC', 'LA'],
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print(f"\nData types:\n{df.dtypes}")

# Switch the three low-cardinality text columns to the category dtype in one call.
df = df.astype({'Department': 'category', 'Performance': 'category', 'City': 'category'})

print(f"\nData types after conversion:\n{df.dtypes}")
print(f"\nDepartment categories: {df['Department'].cat.categories}")
print(f"Performance categories: {df['Performance'].cat.categories}")

# Give Performance an explicit ranking, lowest to highest. 'Poor' is a valid
# category even though no row currently uses it.
performance_order = ['Poor', 'Average', 'Good', 'Excellent']
df['Performance'] = df['Performance'].astype(
    pd.CategoricalDtype(categories=performance_order, ordered=True)
)
print(f"\nOrdered Performance categories: {df['Performance'].cat.categories}")
print(f"Is Performance ordered: {df['Performance'].cat.ordered}")

# Each value is stored as an integer code: its position in the category list.
print(f"\nPerformance codes: {df['Performance'].cat.codes}")
print(f"Department value counts:\n{df['Department'].value_counts()}")

# Extend the allowed set with a category no row uses yet.
df = df.assign(Department=df['Department'].cat.add_categories(['Marketing']))
print(f"\nDepartment categories after adding: {df['Department'].cat.categories}")

# Drop a category. Rows that held 'Finance' (Charlie) become NaN from here on.
df = df.assign(Department=df['Department'].cat.remove_categories(['Finance']))
print(f"Department categories after removing: {df['Department'].cat.categories}")

# Relabel categories; categories missing from the mapping ('Marketing') keep their name.
df = df.assign(
    Department=df['Department'].cat.rename_categories(
        {'HR': 'Human Resources', 'IT': 'Information Technology'}
    )
)
print(f"Department categories after renaming: {df['Department'].cat.categories}")

# Memory footprint: the same 5,000 rows stored as strings and as categories.
df_string = pd.DataFrame({
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR'] * 1000,
    'Performance': ['Good', 'Excellent', 'Good', 'Average', 'Excellent'] * 1000,
})

# Both columns are converted, so a frame-wide astype yields the categorical copy.
df_cat = df_string.astype('category')

# deep=True counts the bytes of the string objects themselves, not just the pointers.
string_bytes = df_string.memory_usage(deep=True).sum()
categorical_bytes = df_cat.memory_usage(deep=True).sum()

print("\nMemory usage comparison:")
print(f"String columns: {string_bytes} bytes")
print(f"Categorical columns: {categorical_bytes} bytes")
print(f"Memory saved: {string_bytes - categorical_bytes} bytes")

# Count performance ratings per department. observed=True limits the groups to
# departments that actually occur in the data.
performance_by_department = df.groupby('Department', observed=True)['Performance']
grouped_cat = performance_by_department.value_counts()
print(f"\nGrouped by Department and Performance:\n{grouped_cat}")

# An ordered categorical sorts by category rank, not alphabetically.
df_sorted = df.sort_values(by='Performance')
print(f"\nDataFrame sorted by Performance:\n{df_sorted[['Name', 'Performance']]}")