# Advanced Pandas Techniques

This notebook covers advanced Pandas features and techniques for more complex data manipulation and analysis.

In [18]:
import pandas as pd
import numpy as np

print("Pandas imported successfully!")
print(f"Pandas version: {pd.__version__}")

Pandas imported successfully!
Pandas version: 2.3.0


## Advanced Indexing with MultiIndex

MultiIndex allows you to have multiple levels of indexing, enabling hierarchical data structures for complex data organization.

In [19]:
# MultiIndex Examples
import pandas as pd
import numpy as np

# Create MultiIndex from arrays
arrays = [['A', 'A', 'B', 'B'], ['one', 'two', 'one', 'two']]
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(4, 2), index=index, columns=['X', 'Y'])
print("DataFrame with MultiIndex:")
print(df)

# Create MultiIndex from tuples
tuples = [('A', 'one'), ('A', 'two'), ('B', 'one'), ('B', 'two')]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df_tuples = pd.DataFrame(np.random.randn(4, 2), index=index, columns=['X', 'Y'])
print("\nDataFrame with MultiIndex from tuples:")
print(df_tuples)

# Accessing data with MultiIndex
print("\nAccessing level 0 'A':")
print(df.loc['A'])

print("\nAccessing specific combination ('A', 'one'):")
print(df.loc[('A', 'one')])

# Cross-section with xs()
print("\nCross-section for second level 'one':")
print(df.xs('one', level='second'))

# MultiIndex for columns
columns = pd.MultiIndex.from_arrays([['Math', 'Math', 'Science', 'Science'], ['Midterm', 'Final', 'Midterm', 'Final']])
df_multi_col = pd.DataFrame(np.random.randint(50, 100, (3, 4)), columns=columns)
print("\nDataFrame with MultiIndex columns:")
print(df_multi_col)

# Accessing MultiIndex columns
print("\nAccessing Math scores:")
print(df_multi_col['Math'])

print("\nAccessing Midterm scores for all subjects:")
print(df_multi_col.xs('Midterm', axis=1, level=1))

# Stacking and unstacking
stacked = df.stack()
print("\nStacked DataFrame:")
print(stacked)

unstacked = stacked.unstack()
print("\nUnstacked DataFrame:")
print(unstacked)

# Grouping with MultiIndex
df_multi = df.reset_index()
grouped = df_multi.groupby(['first', 'second']).mean()
print("\nGrouped by MultiIndex levels:")
print(grouped)

DataFrame with MultiIndex:
                     X         Y
first second                    
A     one     0.875756 -0.249367
      two    -0.251523  1.227751
B     one     0.286285  1.903437
      two     1.302904  0.579303

DataFrame with MultiIndex from tuples:
                     X         Y
first second                    
A     one     1.256785 -1.143781
      two     0.192862  0.146536
B     one    -1.003739 -0.650799
      two     0.768055 -0.275589

Accessing level 0 'A':
               X         Y
second                    
one     0.875756 -0.249367
two    -0.251523  1.227751

Accessing specific combination ('A', 'one'):
X    0.875756
Y   -0.249367
Name: (A, one), dtype: float64

Cross-section for second level 'one':
              X         Y
first                    
A      0.875756 -0.249367
B      0.286285  1.903437

DataFrame with MultiIndex columns:
     Math       Science      
  Midterm Final Midterm Final
0      70    58      72    56
1      72    96      94    64
2

## Working with Categorical Data

Categorical data types in Pandas provide efficient storage and operations for data with a limited number of possible values.

In [20]:
# Categorical Data Examples
import pandas as pd
import numpy as np

# Create DataFrame with categorical data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR'],
    'Performance': ['Good', 'Excellent', 'Good', 'Average', 'Excellent'],
    'City': ['NYC', 'LA', 'Chicago', 'NYC', 'LA']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print(f"\nData types:\n{df.dtypes}")

# Convert to categorical
df['Department'] = df['Department'].astype('category')
df['Performance'] = df['Performance'].astype('category')
df['City'] = df['City'].astype('category')

print(f"\nData types after conversion:\n{df.dtypes}")
print(f"\nDepartment categories: {df['Department'].cat.categories}")
print(f"Performance categories: {df['Performance'].cat.categories}")

# Ordered categorical
performance_order = ['Poor', 'Average', 'Good', 'Excellent']
df['Performance'] = pd.Categorical(df['Performance'], categories=performance_order, ordered=True)
print(f"\nOrdered Performance categories: {df['Performance'].cat.categories}")
print(f"Is Performance ordered: {df['Performance'].cat.ordered}")

# Categorical operations
print(f"\nPerformance codes: {df['Performance'].cat.codes}")
print(f"Department value counts:\n{df['Department'].value_counts()}")

# Adding new categories
df['Department'] = df['Department'].cat.add_categories(['Marketing'])
print(f"\nDepartment categories after adding: {df['Department'].cat.categories}")

# Removing categories
df['Department'] = df['Department'].cat.remove_categories(['Finance'])
print(f"Department categories after removing: {df['Department'].cat.categories}")

# Renaming categories
df['Department'] = df['Department'].cat.rename_categories({'HR': 'Human Resources', 'IT': 'Information Technology'})
print(f"Department categories after renaming: {df['Department'].cat.categories}")

# Memory usage comparison
df_string = pd.DataFrame({
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR'] * 1000,
    'Performance': ['Good', 'Excellent', 'Good', 'Average', 'Excellent'] * 1000
})

df_cat = df_string.copy()
df_cat['Department'] = df_cat['Department'].astype('category')
df_cat['Performance'] = df_cat['Performance'].astype('category')

print(f"\nMemory usage comparison:")
print(f"String columns: {df_string.memory_usage(deep=True).sum()} bytes")
print(f"Categorical columns: {df_cat.memory_usage(deep=True).sum()} bytes")
print(f"Memory saved: {df_string.memory_usage(deep=True).sum() - df_cat.memory_usage(deep=True).sum()} bytes")

# Grouping with categorical data
grouped_cat = df.groupby('Department', observed=True)['Performance'].value_counts()
print(f"\nGrouped by Department and Performance:\n{grouped_cat}")

# Sorting with ordered categorical
df_sorted = df.sort_values('Performance')
print(f"\nDataFrame sorted by Performance:\n{df_sorted[['Name', 'Performance']]}")

Original DataFrame:
      Name Department Performance     City
0    Alice         HR        Good      NYC
1      Bob         IT   Excellent       LA
2  Charlie    Finance        Good  Chicago
3    Diana         IT     Average      NYC
4      Eve         HR   Excellent       LA

Data types:
Name           object
Department     object
Performance    object
City           object
dtype: object

Data types after conversion:
Name             object
Department     category
Performance    category
City           category
dtype: object

Department categories: Index(['Finance', 'HR', 'IT'], dtype='object')
Performance categories: Index(['Average', 'Excellent', 'Good'], dtype='object')

Ordered Performance categories: Index(['Poor', 'Average', 'Good', 'Excellent'], dtype='object')
Is Performance ordered: True

Performance codes: 0    2
1    3
2    2
3    1
4    3
dtype: int8
Department value counts:
Department
HR         2
IT         2
Finance    1
Name: count, dtype: int64

Department categories

## String/Text Operations

Pandas provides powerful string operations through the `.str` accessor, allowing vectorized string manipulations.

In [21]:
# String Operations Examples
import pandas as pd
import numpy as np

# Create DataFrame with text data
data = {
    'Name': ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince', 'Eve Wilson'],
    'Email': ['alice.johnson@email.com', 'bob.smith@company.org', 'charlie.brown@school.edu', 'diana.prince@hero.net', 'eve.wilson@work.com'],
    'Description': ['Data Scientist at Tech Corp', 'Software Engineer', 'Student at University', 'Project Manager', 'Marketing Specialist'],
    'Phone': ['(555) 123-4567', '555-987-6543', '(555) 456-7890', '555.234.5678', '(555) 345-6789']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Basic string operations
print("\nName lengths:")
print(df['Name'].str.len())

print("\nNames in uppercase:")
print(df['Name'].str.upper())

print("\nNames in lowercase:")
print(df['Name'].str.lower())

print("\nNames title case:")
print(df['Name'].str.title())

# Splitting strings
print("\nSplit names into first and last:")
name_split = df['Name'].str.split(' ', expand=True)
name_split.columns = ['First Name', 'Last Name']
print(name_split)

# Extracting substrings
print("\nExtract domain from email:")
print(df['Email'].str.split('@', expand=True)[1])

# Regular expressions
print("\nExtract area code from phone:")
print(df['Phone'].str.extract(r'\((\d{3})\)'))

print("\nCheck if email contains 'com':")
print(df['Email'].str.contains('com'))

print("\nReplace 'com' with 'org' in emails:")
print(df['Email'].str.replace('com', 'org'))

# String matching and filtering
print("\nNames starting with 'A' or 'B':")
print(df[df['Name'].str.startswith(('A', 'B'))])

print("\nDescriptions containing 'at':")
print(df[df['Description'].str.contains('at', case=False)])

# Cleaning text data
print("\nClean phone numbers (remove non-digits):")
print(df['Phone'].str.replace(r'\D', '', regex=True))

# String padding
print("\nPad names to length 15:")
print(df['Name'].str.pad(width=15, side='right', fillchar='-'))

# String stripping
text_with_spaces = pd.Series(['  hello  ', '  world  ', ' pandas '])
print("\nStrip whitespace:")
print(text_with_spaces.str.strip())

# Advanced operations
print("\nExtract words from description:")
print(df['Description'].str.findall(r'\b\w+\b'))

print("\nCount words in description:")
print(df['Description'].str.split().str.len())

# Working with missing text data
df_with_na = df.copy()
df_with_na.loc[2, 'Description'] = None
print("\nDataFrame with missing text:")
print(df_with_na)

print("\nFill missing descriptions:")
print(df_with_na['Description'].fillna('Not specified'))

print("\nCheck for missing text:")
print(df_with_na['Description'].isna())

Original DataFrame:
            Name                     Email                  Description  \
0  Alice Johnson   alice.johnson@email.com  Data Scientist at Tech Corp   
1      Bob Smith     bob.smith@company.org            Software Engineer   
2  Charlie Brown  charlie.brown@school.edu        Student at University   
3   Diana Prince     diana.prince@hero.net              Project Manager   
4     Eve Wilson       eve.wilson@work.com         Marketing Specialist   

            Phone  
0  (555) 123-4567  
1    555-987-6543  
2  (555) 456-7890  
3    555.234.5678  
4  (555) 345-6789  

Name lengths:
0    13
1     9
2    13
3    12
4    10
Name: Name, dtype: int64

Names in uppercase:
0    ALICE JOHNSON
1        BOB SMITH
2    CHARLIE BROWN
3     DIANA PRINCE
4       EVE WILSON
Name: Name, dtype: object

Names in lowercase:
0    alice johnson
1        bob smith
2    charlie brown
3     diana prince
4       eve wilson
Name: Name, dtype: object

Names title case:
0    Alice Johnson
1      

## Apply, Map, and Transform

Custom functions with apply/map/transform allow you to apply complex operations to pandas data structures.

In [22]:
# Apply, Map, and Transform Examples
import pandas as pd
import numpy as np

# Create sample DataFrame
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500],
    'Category': ['A', 'B', 'A', 'B', 'A']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Using map() - element-wise transformation on Series
print("\nUsing map() to transform Category column:")
category_map = {'A': 'Group Alpha', 'B': 'Group Beta'}
print(df['Category'].map(category_map))

print("\nUsing map() with function:")
print(df['A'].map(lambda x: x ** 2))

# Using apply() on Series
print("\nUsing apply() on Series:")
print(df['A'].apply(lambda x: x * 10))

print("\nUsing apply() with string operations:")
print(df['Category'].apply(lambda x: x.lower()))

# Using apply() on DataFrame - axis=0 (columns)
print("\nUsing apply() on DataFrame (axis=0, columns):")
print(df[['A', 'B', 'C']].apply(lambda x: x.sum()))

# Using apply() on DataFrame - axis=1 (rows)
print("\nUsing apply() on DataFrame (axis=1, rows):")
print(df[['A', 'B', 'C']].apply(lambda x: x.sum(), axis=1))

# Using transform() on DataFrame
print("\nUsing transform() on DataFrame:")
print(df[['A', 'B', 'C']].transform(lambda x: x - x.mean()))

# More complex examples
print("\nComplex apply() example - custom function:")
def categorize_value(x):
    if x < 3:
        return 'Low'
    elif x < 5:
        return 'Medium'
    else:
        return 'High'

print(df['A'].apply(categorize_value))

# Apply with multiple columns
print("\nApply with multiple columns:")
def combine_columns(row):
    return f"{row['A']}-{row['B']}-{row['C']}"

print(df.apply(combine_columns, axis=1))

# Transform vs Apply comparison
print("\nTransform vs Apply comparison:")
print("Transform (returns same shape):")
print(df[['A', 'B']].transform(lambda x: x * 2))

print("Apply (can return different shape):")
print(df[['A', 'B']].apply(lambda x: x.sum()))

# Using applymap() for element-wise operations on entire DataFrame
print("\nUsing applymap() for element-wise operations:")
print(df[['A', 'B', 'C']].applymap(lambda x: x / 10))

# Performance comparison
print("\nPerformance comparison (using timeit-like approach):")
import time

# Create larger DataFrame for performance test
large_df = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'])

start = time.time()
result_apply = large_df.apply(lambda x: x ** 2, axis=0)
apply_time = time.time() - start

start = time.time()
result_transform = large_df.transform(lambda x: x ** 2)
transform_time = time.time() - start

print(".2f")
print(".2f")

# Practical examples
print("\nPractical examples:")

# Data normalization
print("Z-score normalization using transform:")
def zscore(x):
    return (x - x.mean()) / x.std()

normalized_df = df[['A', 'B', 'C']].transform(zscore)
print(normalized_df)

# Percentage change
print("\nPercentage change using apply:")
print(df[['A', 'B', 'C']].apply(lambda x: x.pct_change()))

# Rolling calculations with apply
print("\nRolling mean using apply:")
print(df[['A', 'B', 'C']].apply(lambda x: x.rolling(2).mean()))

# GroupBy with apply/transform
print("\nGroupBy with apply:")
grouped = df.groupby('Category')[['A', 'B', 'C']].apply(lambda x: x.sum())
print(grouped)

print("\nGroupBy with transform:")
grouped_transform = df.groupby('Category')[['A', 'B', 'C']].transform(lambda x: x - x.mean())
print(grouped_transform)

Original DataFrame:
   A   B    C Category
0  1  10  100        A
1  2  20  200        B
2  3  30  300        A
3  4  40  400        B
4  5  50  500        A

Using map() to transform Category column:
0    Group Alpha
1     Group Beta
2    Group Alpha
3     Group Beta
4    Group Alpha
Name: Category, dtype: object

Using map() with function:
0     1
1     4
2     9
3    16
4    25
Name: A, dtype: int64

Using apply() on Series:
0    10
1    20
2    30
3    40
4    50
Name: A, dtype: int64

Using apply() with string operations:
0    a
1    b
2    a
3    b
4    a
Name: Category, dtype: object

Using apply() on DataFrame (axis=0, columns):
A      15
B     150
C    1500
dtype: int64

Using apply() on DataFrame (axis=1, rows):
0    111
1    222
2    333
3    444
4    555
dtype: int64

Using transform() on DataFrame:
     A     B      C
0 -2.0 -20.0 -200.0
1 -1.0 -10.0 -100.0
2  0.0   0.0    0.0
3  1.0  10.0  100.0
4  2.0  20.0  200.0

Complex apply() example - custom function:
0       Low
1

  print(df[['A', 'B', 'C']].applymap(lambda x: x / 10))


## Memory Optimization Techniques

Memory optimization techniques help reduce memory usage and improve performance when working with large datasets.

In [None]:
# Apply, Map, and Transform Examples
import pandas as pd
import numpy as np

# Create sample DataFrame
data = {
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500],
    'Category': ['A', 'B', 'A', 'B', 'A']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Using map() - element-wise transformation on Series
print("\nUsing map() to transform Category column:")
category_map = {'A': 'Group Alpha', 'B': 'Group Beta'}
print(df['Category'].map(category_map))

print("\nUsing map() with function:")
print(df['A'].map(lambda x: x ** 2))

# Using apply() on Series
print("\nUsing apply() on Series:")
print(df['A'].apply(lambda x: x * 10))

print("\nUsing apply() with string operations:")
print(df['Category'].apply(lambda x: x.lower()))

# Using apply() on DataFrame - axis=0 (columns)
print("\nUsing apply() on DataFrame (axis=0, columns):")
print(df[['A', 'B', 'C']].apply(lambda x: x.sum()))

# Using apply() on DataFrame - axis=1 (rows)
print("\nUsing apply() on DataFrame (axis=1, rows):")
print(df[['A', 'B', 'C']].apply(lambda x: x.sum(), axis=1))

# Using transform() on DataFrame
print("\nUsing transform() on DataFrame:")
print(df[['A', 'B', 'C']].transform(lambda x: x - x.mean()))

# More complex examples
print("\nComplex apply() example - custom function:")
def categorize_value(x):
    if x < 3:
        return 'Low'
    elif x < 5:
        return 'Medium'
    else:
        return 'High'

print(df['A'].apply(categorize_value))

# Apply with multiple columns
print("\nApply with multiple columns:")
def combine_columns(row):
    return f"{row['A']}-{row['B']}-{row['C']}"

print(df.apply(combine_columns, axis=1))

# Transform vs Apply comparison
print("\nTransform vs Apply comparison:")
print("Transform (returns same shape):")
print(df[['A', 'B']].transform(lambda x: x * 2))

print("Apply (can return different shape):")
print(df[['A', 'B']].apply(lambda x: x.sum()))

# Using applymap() for element-wise operations on entire DataFrame
print("\nUsing applymap() for element-wise operations:")
print(df[['A', 'B', 'C']].applymap(lambda x: x / 10))

# Performance comparison
print("\nPerformance comparison (using timeit-like approach):")
import time

# Create larger DataFrame for performance test
large_df = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'])

start = time.time()
result_apply = large_df.apply(lambda x: x ** 2, axis=0)
apply_time = time.time() - start

start = time.time()
result_transform = large_df.transform(lambda x: x ** 2)
transform_time = time.time() - start

print(".2f")
print(".2f")

# Practical examples
print("\nPractical examples:")

# Data normalization
print("Z-score normalization using transform:")
def zscore(x):
    return (x - x.mean()) / x.std()

normalized_df = df[['A', 'B', 'C']].transform(zscore)
print(normalized_df)

# Percentage change
print("\nPercentage change using apply:")
print(df[['A', 'B', 'C']].apply(lambda x: x.pct_change()))

# Rolling calculations with apply
print("\nRolling mean using apply:")
print(df[['A', 'B', 'C']].apply(lambda x: x.rolling(2).mean()))

# GroupBy with apply/transform
print("\nGroupBy with apply:")
grouped = df.groupby('Category')[['A', 'B', 'C']].apply(lambda x: x.sum())
print(grouped)

print("\nGroupBy with transform:")
grouped_transform = df.groupby('Category')[['A', 'B', 'C']].transform(lambda x: x - x.mean())
print(grouped_transform)

In [24]:
# Memory Optimization Techniques
import pandas as pd
import numpy as np
import sys

print("Memory Optimization Techniques in Pandas")
print("=" * 50)

# Create a sample DataFrame with memory inefficiencies
print("\n1. Data Types and Memory Usage")
np.random.seed(42)
data = {
    'int_col': np.random.randint(0, 100, 100000),  # Will be int64 by default
    'float_col': np.random.random(100000),  # Will be float64 by default
    'string_col': ['category_' + str(i % 10) for i in range(100000)],  # Repetitive strings
    'bool_col': np.random.choice([True, False], 100000),
    'date_col': pd.date_range('2020-01-01', periods=100000, freq='1min')
}
df = pd.DataFrame(data)

print("Original DataFrame memory usage:")
print(f"Total memory: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
print("Column memory usage:")
for col in df.columns:
    print(f"  {col}: {df[col].memory_usage(deep=True) / 1024 / 1024:.2f} MB")

# Optimize data types
print("\nOptimizing data types:")

# Integer downcasting
df_optimized = df.copy()
df_optimized['int_col'] = pd.to_numeric(df_optimized['int_col'], downcast='integer')
print(f"int_col: {df['int_col'].dtype} -> {df_optimized['int_col'].dtype}")

# Float downcasting
df_optimized['float_col'] = pd.to_numeric(df_optimized['float_col'], downcast='float')
print(f"float_col: {df['float_col'].dtype} -> {df_optimized['float_col'].dtype}")

# Convert repetitive strings to category
df_optimized['string_col'] = df_optimized['string_col'].astype('category')
print(f"string_col: {df['string_col'].dtype} -> {df_optimized['string_col'].dtype}")

# Boolean optimization (already bool, but showing the concept)
df_optimized['bool_col'] = df_optimized['bool_col'].astype('bool')
print(f"bool_col: {df['bool_col'].dtype} -> {df_optimized['bool_col'].dtype}")

print("\nOptimized DataFrame memory usage:")
print(f"Total memory: {df_optimized.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
print("Column memory usage:")
for col in df_optimized.columns:
    print(f"  {col}: {df_optimized[col].memory_usage(deep=True) / 1024 / 1024:.2f} MB")

memory_saved = (df.memory_usage(deep=True).sum() - df_optimized.memory_usage(deep=True).sum()) / 1024 / 1024
print(".2f")

# 2. Sparse data structures
print("\n2. Sparse Data Structures")
sparse_data = pd.DataFrame({
    'mostly_zeros': [0] * 100000 + [1] * 10,
    'mostly_null': [None] * 99990 + list(range(10)) + [None] * 10  # Fixed length
})
print("Regular DataFrame memory:")
print(f"Total: {sparse_data.memory_usage(deep=True).sum() / 1024:.0f} KB")

# Convert to sparse
sparse_data_sparse = sparse_data.astype(pd.SparseDtype("float64", np.nan))
print("Sparse DataFrame memory:")
print(f"Total: {sparse_data_sparse.memory_usage(deep=True).sum() / 1024:.0f} KB")

# 3. Chunking for large files
print("\n3. Chunking Large Files")
print("Simulating chunked reading (normally used with pd.read_csv):")

# Create a large CSV-like data
large_data = pd.DataFrame({
    'A': range(10000),
    'B': np.random.randn(10000),
    'C': ['cat'] * 5000 + ['dog'] * 5000
})

# Simulate chunked processing
chunk_size = 2500
total_sum = 0
for i in range(0, len(large_data), chunk_size):
    chunk = large_data.iloc[i:i+chunk_size]
    total_sum += chunk['A'].sum()
    print(f"Processed chunk {i//chunk_size + 1}, cumulative sum: {total_sum}")

# 4. Memory-efficient operations
print("\n4. Memory-Efficient Operations")

# Instead of creating copies
df_large = pd.DataFrame(np.random.randn(10000, 5), columns=['A', 'B', 'C', 'D', 'E'])

print("Memory before operations:")
print(f"Total: {df_large.memory_usage(deep=True).sum() / 1024:.0f} KB")

# In-place operations
df_large['F'] = df_large['A'] + df_large['B']  # Creates new column
print("After adding column F:")
print(f"Total: {df_large.memory_usage(deep=True).sum() / 1024:.0f} KB")

# Using eval() for memory-efficient operations
df_eval = df_large.copy()
df_eval = df_eval.eval('G = A * B + C')
print("After eval() operation:")
print(f"Total: {df_eval.memory_usage(deep=True).sum() / 1024:.0f} KB")

# 5. Categorical data benefits
print("\n5. Categorical Data Benefits")
text_data = pd.Series(['A', 'B', 'C', 'A'] * 25000)
print("String series memory:")
print(f"Total: {text_data.memory_usage(deep=True) / 1024:.0f} KB")

categorical_data = text_data.astype('category')
print("Categorical series memory:")
print(f"Total: {categorical_data.memory_usage(deep=True) / 1024:.0f} KB")

# Operations on categorical data
print("Categorical operations:")
print("Categories:", categorical_data.cat.categories.tolist())
print("Codes:", categorical_data.cat.codes.head().tolist())

# 6. Memory monitoring
print("\n6. Memory Monitoring")
def get_memory_usage(df):
    return df.memory_usage(deep=True).sum() / 1024 / 1024

df_test = pd.DataFrame(np.random.randn(50000, 3), columns=['X', 'Y', 'Z'])
print(".2f")

# Add some operations and monitor
df_test['W'] = df_test['X'] ** 2
print(".2f")

df_test = df_test[df_test['W'] > 0]  # Filtering
print(".2f")

# 7. Best practices summary
print("\n7. Memory Optimization Best Practices:")
print("• Use appropriate data types (int8, int16, float32, etc.)")
print("• Convert repetitive strings to categorical data")
print("• Use sparse data structures for mostly empty data")
print("• Process large files in chunks")
print("• Use in-place operations when possible")
print("• Use eval() and query() for complex operations")
print("• Monitor memory usage regularly")
print("• Consider using Dask for very large datasets")

print("\nMemory optimization completed!")

Memory Optimization Techniques in Pandas

1. Data Types and Memory Usage
Original DataFrame memory usage:
Total memory: 8.01 MB
Column memory usage:
  int_col: 0.76 MB
  float_col: 0.76 MB
  string_col: 5.63 MB
  bool_col: 0.10 MB
  date_col: 0.76 MB

Optimizing data types:
int_col: int64 -> int8
float_col: float64 -> float32
string_col: object -> category
bool_col: bool -> bool

Optimized DataFrame memory usage:
Total memory: 1.43 MB
Column memory usage:
  int_col: 0.10 MB
  float_col: 0.38 MB
  string_col: 0.10 MB
  bool_col: 0.10 MB
  date_col: 0.76 MB
.2f

2. Sparse Data Structures
Regular DataFrame memory:
Total: 1563 KB
Sparse DataFrame memory:
Total: 1172 KB

3. Chunking Large Files
Simulating chunked reading (normally used with pd.read_csv):
Processed chunk 1, cumulative sum: 3123750
Processed chunk 2, cumulative sum: 12497500
Processed chunk 3, cumulative sum: 28121250
Processed chunk 4, cumulative sum: 49995000

4. Memory-Efficient Operations
Memory before operations:
Total: 

In [31]:
# Handling Large Datasets
import pandas as pd
import numpy as np
import time

print("Handling Large Datasets in Pandas")
print("=" * 40)

# 1. Chunked reading with read_csv
print("\n1. Chunked Reading with read_csv")
print("Creating sample large CSV file...")

# Create a large CSV file for demonstration
large_df = pd.DataFrame({
    'id': range(100000),
    'value': np.random.randn(100000),
    'category': np.random.choice(['A', 'B', 'C', 'D'], 100000),
    'timestamp': pd.date_range('2020-01-01', periods=100000, freq='1min')
})

# Save to CSV (normally this would be a large existing file)
csv_file = 'large_dataset.csv'
large_df.to_csv(csv_file, index=False)
print(f"Created {csv_file} with {len(large_df)} rows")

# Read in chunks
print("\nReading in chunks of 10000 rows:")
chunk_size = 10000
total_rows = 0
category_counts = {}

start_time = time.time()
for chunk in pd.read_csv(csv_file, chunksize=chunk_size):
    total_rows += len(chunk)
    # Process each chunk
    chunk_category_counts = chunk['category'].value_counts()
    for cat, count in chunk_category_counts.items():
        category_counts[cat] = category_counts.get(cat, 0) + count
    print(f"Processed chunk with {len(chunk)} rows, total so far: {total_rows}")

end_time = time.time()
print(".2f")
print("Category distribution:", category_counts)

# 2. Memory-efficient data types
print("\n2. Memory-Efficient Data Types During Reading")
print("Reading with optimized dtypes:")

# Define dtypes for memory efficiency
dtypes = {
    'id': 'int32',  # Down from int64
    'value': 'float32',  # Down from float64
    'category': 'category',  # Convert to categorical
    # Note: timestamp handled separately with parse_dates
}

df_optimized = pd.read_csv(csv_file, dtype=dtypes, parse_dates=['timestamp'])
print(f"Memory usage with optimized dtypes: {df_optimized.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

# Compare with default reading
df_default = pd.read_csv(csv_file, parse_dates=['timestamp'])
print(f"Memory usage with default dtypes: {df_default.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

# 3. Selective column reading
print("\n3. Selective Column Reading")
print("Reading only necessary columns:")

# Read only specific columns
columns_to_read = ['id', 'category', 'timestamp']
df_subset = pd.read_csv(csv_file, usecols=columns_to_read, dtype={'id': 'int32', 'category': 'category'}, parse_dates=['timestamp'])
print(f"Memory usage with subset of columns: {df_subset.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
print(f"Shape: {df_subset.shape}")

# 4. Filtering during reading
print("\n4. Filtering During Reading")
print("Using skiprows to filter data:")

# Skip rows based on condition (simulate filtering)
def row_filter(index):
    if index == 0:  # Keep header
        return False
    # Skip 90% of rows randomly for demonstration
    return np.random.random() > 0.1

df_filtered = pd.read_csv(csv_file, skiprows=lambda x: row_filter(x), dtype=dtypes, parse_dates=['timestamp'])
print(f"Filtered dataset shape: {df_filtered.shape}")
print(f"Memory usage: {df_filtered.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

# 5. Processing large datasets with Dask (if available)
print("\n5. Introduction to Dask for Large Datasets")
try:
    import dask.dataframe as dd
    print("Dask is available - demonstrating lazy evaluation:")

    # Create Dask DataFrame
    ddf = dd.from_pandas(large_df, npartitions=4)
    print(f"Dask DataFrame with {ddf.npartitions} partitions")
    print(f"Shape: {ddf.compute().shape}")  # .compute() triggers actual computation

    # Lazy operations
    result = ddf[ddf['value'] > 0]['category'].value_counts()
    print("Lazy operation defined, computing result:")
    print(result.compute())

except ImportError:
    print("Dask not available. Install with: pip install dask")

# 6. Efficient groupby operations
print("\n6. Efficient GroupBy Operations on Large Data")
print("Using categorical data for faster groupby:")

# Ensure category is categorical
df_optimized['category'] = df_optimized['category'].astype('category')

start_time = time.time()
grouped_result = df_optimized.groupby('category')['value'].agg(['mean', 'std', 'count'])
groupby_time = time.time() - start_time
print(".2f")
print(grouped_result)

# 7. Out-of-memory operations
print("\n7. Out-of-Memory Operations")
print("Simulating operations that would normally cause memory issues:")

# Instead of loading everything at once
print("Processing in batches:")

batch_size = 25000
results = []

for i in range(0, len(df_optimized), batch_size):
    batch = df_optimized.iloc[i:i+batch_size]
    # Perform operation on batch
    batch_result = batch.groupby('category')['value'].mean()
    results.append(batch_result)

# Combine results
final_result = pd.concat(results).groupby(level=0).mean()
print("Batch processing result:")
print(final_result)

# 8. Monitoring and cleanup
print("\n8. Memory Monitoring and Cleanup")
import gc

print("Memory usage before cleanup:")
print(".2f")

# Delete large DataFrames
del large_df, df_default, df_subset, df_filtered
gc.collect()

print("Memory usage after cleanup:")
print(".2f")

# 9. Best practices for large datasets
print("\n9. Best Practices for Large Datasets:")
print("• Use chunked reading with pd.read_csv(chunksize=...)")
print("• Specify dtypes during reading to reduce memory usage")
print("• Read only necessary columns with usecols")
print("• Use categorical data for repetitive strings")
print("• Filter data early in the pipeline")
print("• Use Dask for datasets larger than RAM")
print("• Process data in batches for memory-intensive operations")
print("• Monitor memory usage and clean up unused objects")
print("• Consider using databases for very large datasets")
print("• Use feather or parquet formats for intermediate storage")

print("\nLarge dataset handling completed!")

# Clean up the CSV file
import os
os.remove(csv_file)
print(f"Cleaned up {csv_file}")

Handling Large Datasets in Pandas

1. Chunked Reading with read_csv
Creating sample large CSV file...
Created large_dataset.csv with 100000 rows

Reading in chunks of 10000 rows:
Processed chunk with 10000 rows, total so far: 10000
Processed chunk with 10000 rows, total so far: 20000
Processed chunk with 10000 rows, total so far: 30000
Processed chunk with 10000 rows, total so far: 40000
Processed chunk with 10000 rows, total so far: 50000
Processed chunk with 10000 rows, total so far: 60000
Processed chunk with 10000 rows, total so far: 70000
Processed chunk with 10000 rows, total so far: 80000
Processed chunk with 10000 rows, total so far: 90000
Processed chunk with 10000 rows, total so far: 100000
.2f
Category distribution: {'A': 24954, 'D': 24885, 'B': 25158, 'C': 25003}

2. Memory-Efficient Data Types During Reading
Reading with optimized dtypes:
Memory usage with optimized dtypes: 1.62 MB
Memory usage with default dtypes: 7.06 MB

3. Selective Column Reading
Reading only necessar

  grouped_result = df_optimized.groupby('category')['value'].agg(['mean', 'std', 'count'])
  batch_result = batch.groupby('category')['value'].mean()
  final_result = pd.concat(results).groupby(level=0).mean()


In [33]:
# Integration with Other Libraries (Scikit-learn for ML)
import pandas as pd
import numpy as np

print("Integration with Other Libraries")
print("=" * 35)

# 1. Scikit-learn Integration
print("\n1. Scikit-learn Integration")
print("Preparing data for machine learning:")

# Create sample dataset
np.random.seed(42)
data = {
    'feature1': np.random.randn(1000),
    'feature2': np.random.randn(1000) * 2,
    'feature3': np.random.randint(0, 5, 1000),
    'category': np.random.choice(['A', 'B', 'C'], 1000),
    'target': np.random.choice([0, 1], 1000)
}
df = pd.DataFrame(data)
print("Sample DataFrame:")
print(df.head())
print(f"\nShape: {df.shape}")

# Data preprocessing with pandas
print("\nData preprocessing:")

# Handle categorical variables
df_encoded = pd.get_dummies(df, columns=['category'], drop_first=True)
print("After one-hot encoding:")
print(df_encoded.head())

# Feature scaling preparation
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Separate features and target
X = df_encoded.drop('target', axis=1)
y = df_encoded['target']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Standardize features
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)
print("\nAfter standardization:")
print(X_scaled.head())

# 2. Model training and evaluation
print("\n2. Model Training and Evaluation")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(".2f")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 3. Cross-validation with pandas
print("\n3. Cross-Validation with Pandas")
from sklearn.model_selection import cross_val_score, KFold

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='accuracy')

print("Cross-validation scores:")
for i, score in enumerate(cv_scores, 1):
    print(".2f")
print(".2f")

# 4. Feature importance analysis
print("\n4. Feature Importance Analysis")
# Get feature coefficients
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(model.coef_[0])
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("Feature importance:")
print(feature_importance)

# 5. Pipeline integration
print("\n5. Scikit-learn Pipeline with Pandas")
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Create a pipeline that works directly with pandas
# Separate numeric and categorical columns
numeric_features = ['feature1', 'feature2', 'feature3']
categorical_features = ['category']

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ])

# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

# Prepare data for pipeline (original df without encoding)
X_pipeline = df.drop('target', axis=1)
y_pipeline = df['target']

# Split and train
X_train_pipe, X_test_pipe, y_train_pipe, y_test_pipe = train_test_split(
    X_pipeline, y_pipeline, test_size=0.2, random_state=42, stratify=y_pipeline
)

pipeline.fit(X_train_pipe, y_train_pipe)
pipeline_score = pipeline.score(X_test_pipe, y_test_pipe)
print(".2f")

# 6. Time series with pandas and sklearn
print("\n6. Time Series Analysis")
# Create time series data
dates = pd.date_range('2020-01-01', periods=365, freq='D')
ts_data = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(365).cumsum() + 100,
    'feature1': np.random.randn(365),
    'feature2': np.sin(np.arange(365) * 2 * np.pi / 365) + np.random.randn(365) * 0.1
})

ts_data.set_index('date', inplace=True)
print("Time series data:")
print(ts_data.head())

# Create lag features
ts_data['value_lag1'] = ts_data['value'].shift(1)
ts_data['value_lag7'] = ts_data['value'].shift(7)
ts_data.dropna(inplace=True)

# Prepare for modeling
X_ts = ts_data[['feature1', 'feature2', 'value_lag1', 'value_lag7']]
y_ts = ts_data['value']

# Train simple model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

X_train_ts, X_test_ts, y_train_ts, y_test_ts = train_test_split(
    X_ts, y_ts, test_size=0.2, shuffle=False  # Don't shuffle time series
)

ts_model = LinearRegression()
ts_model.fit(X_train_ts, y_train_ts)
y_pred_ts = ts_model.predict(X_test_ts)

mse = mean_squared_error(y_test_ts, y_pred_ts)
print(".2f")

# 7. Integration with other libraries
print("\n7. Integration with Other Libraries")

# With NumPy
print("NumPy integration:")
numpy_array = df.select_dtypes(include=[np.number]).values
print(f"Pandas DataFrame to NumPy array: {numpy_array.shape}")

# With Matplotlib/Seaborn
try:
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Create correlation plot
    plt.figure(figsize=(8, 6))
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    correlation_matrix = df[numeric_cols].corr()

    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_plot.png', dpi=100, bbox_inches='tight')
    plt.close()
    print("Correlation plot saved as 'correlation_plot.png'")

except ImportError:
    print("Matplotlib/Seaborn not available for plotting")

# With Statsmodels
try:
    import statsmodels.api as sm

    # Add constant for intercept
    X_sm = sm.add_constant(X_scaled)
    sm_model = sm.OLS(y, X_sm).fit()

    print("Statsmodels OLS summary:")
    print(sm_model.summary().tables[1])  # Coefficients table

except ImportError:
    print("Statsmodels not available")

# 8. Best practices for ML integration
print("\n8. Best Practices for ML Integration:")
print("• Use pandas for data exploration and preprocessing")
print("• Convert categorical variables appropriately (one-hot, label encoding)")
print("• Handle missing values before ML")
print("• Scale/normalize features when necessary")
print("• Use pipelines for reproducible preprocessing")
print("• Validate models with cross-validation")
print("• Analyze feature importance")
print("• Consider time series structure for temporal data")
print("• Integrate with visualization libraries for insights")

print("\nLibrary integration completed!")

# Clean up
import os
if os.path.exists('correlation_plot.png'):
    os.remove('correlation_plot.png')
    print("Cleaned up correlation plot")

Integration with Other Libraries

1. Scikit-learn Integration
Preparing data for machine learning:
Sample DataFrame:
   feature1  feature2  feature3 category  target
0  0.496714  2.798711         2        A       1
1 -0.138264  1.849267         2        C       1
2  0.647689  0.119261         2        A       1
3  1.523030 -1.293874         3        C       1
4 -0.234153  1.396447         4        A       0

Shape: (1000, 5)

Data preprocessing:
After one-hot encoding:
   feature1  feature2  feature3  target  category_B  category_C
0  0.496714  2.798711         2       1       False       False
1 -0.138264  1.849267         2       1       False        True
2  0.647689  0.119261         2       1       False       False
3  1.523030 -1.293874         3       1       False        True
4 -0.234153  1.396447         4       0       False       False

Features shape: (1000, 5)
Target shape: (1000,)

After standardization:
   feature1  feature2  feature3  category_B  category_C
0  0.487759  