# Mastering Pandas for Data Manipulation and Analysis

## Introduction

Pandas is a powerful library for data manipulation and analysis in Python. It provides high-performance, easy-to-use data structures and data analysis tools. In this lecture, we'll dive deep into Pandas, covering everything from basic concepts to advanced techniques.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Set display options.
# Full option keys are used throughout: pandas also accepts shorthand such as
# 'max_colwidth', but a shorthand pattern can become ambiguous (and raise)
# if a later release adds another option with a similar name.
pd.set_option('display.max_columns', None)         # never hide columns
pd.set_option('display.expand_frame_repr', False)  # do not wrap wide frames
pd.set_option('display.max_colwidth', None)        # do not truncate cell text

## 1. Pandas Fundamentals

### 1.1 Series and DataFrame objects

The two primary data structures in Pandas are Series and DataFrame.

In [None]:
# A Series is a one-dimensional labelled array; np.nan marks a missing entry
series_values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(series_values)
print("Series:")
print(s)

# A DataFrame is a table whose columns can each carry a different dtype.
# The scalar entries ('B' and 'F') are repeated for every row.
n_rows = 4
column_data = {
    'A': [1, 2, 3, 4],
    'B': pd.Timestamp('20230101'),
    'C': pd.Series(1, index=list(range(n_rows)), dtype='float32'),
    'D': np.full(n_rows, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
}
df = pd.DataFrame(column_data)
print("\nDataFrame:")
print(df)

### 1.2 Creating, reading, and writing DataFrames

In [None]:
# Build the table row by row, then attach the column names
people = [
    ('John', 28, 'New York'),
    ('Anna', 34, 'Paris'),
    ('Peter', 29, 'Berlin'),
    ('Linda', 32, 'London'),
]
df = pd.DataFrame(people, columns=['Name', 'Age', 'City'])

csv_path = 'people.csv'
excel_path = 'people.xlsx'

# CSV round trip (index=False keeps the row labels out of the file)
df.to_csv(csv_path, index=False)
df_csv = pd.read_csv(csv_path)
print("DataFrame from CSV:")
print(df_csv)

# Excel round trip
df.to_excel(excel_path, index=False)
df_excel = pd.read_excel(excel_path)
print("\nDataFrame from Excel:")
print(df_excel)

# Note: For SQL databases, you would typically use pd.read_sql() and df.to_sql()

### 1.3 Understanding and manipulating index objects

In [None]:
# Row labels are supplied explicitly instead of the default 0..n-1
row_labels = ['w', 'x', 'y', 'z']
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'c', 'd']}, index=row_labels)
print("DataFrame with custom index:")
print(df)

# The index is an object in its own right
print("\nIndex:")
print(df.index)

# Promote column 'B' to be the index (rebinding instead of inplace=True)
df = df.set_index('B')
print("\nDataFrame with 'B' as index:")
print(df)

# Move the index back into a regular column and restore default labels
df = df.reset_index()
print("\nDataFrame with reset index:")
print(df)

## 2. Data Cleaning and Preprocessing

### 2.1 Handling missing data

In [None]:
# Sample table: 'A' has one gap, 'B' has two, 'C' is complete
missing = np.nan
df = pd.DataFrame({
    'A': [1, 2, missing, 4],
    'B': [5, missing, missing, 8],
    'C': [9, 10, 11, 12],
})
print("Original DataFrame:")
print(df)

# Strategy 1: replace every missing value with a constant
zero_filled = df.fillna(0)
print("\nFilling NA with 0:")
print(zero_filled)

# Strategy 2: discard every row that contains a missing value
complete_rows = df.dropna()
print("\nDropping rows with NA:")
print(complete_rows)

# Strategy 3: estimate the gaps from neighbouring values
interpolated = df.interpolate()
print("\nInterpolating NA values:")
print(interpolated)

### 2.2 Removing duplicates and handling outliers

In [None]:
# Create a DataFrame with duplicates and outliers
df = pd.DataFrame({
    'A': [1, 2, 2, 3, 1000],  # 1000 is an outlier
    'B': ['a', 'b', 'b', 'c', 'd']
})
print("Original DataFrame:")
print(df)

# Remove duplicates (rows that repeat an earlier row in every column)
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame without duplicates:")
print(df_no_duplicates)

# Handle outliers with the interquartile-range (IQR) rule: keep rows whose 'A'
# lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# A "3 standard deviations from the mean" filter cannot work on a sample this
# small: the outlier inflates the standard deviation itself, and with n points
# the largest possible z-score is (n - 1) / sqrt(n), about 1.79 for n = 5, so
# nothing would ever be removed.
q1 = df['A'].quantile(0.25)
q3 = df['A'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df_no_outliers = df[df['A'].between(lower_bound, upper_bound)]
print("\nDataFrame without outliers:")
print(df_no_outliers)

### 2.3 Data type conversion and category encoding

In [None]:
# Every column starts out as plain strings
raw_columns = {
    'A': ['1', '2', '3', '4'],
    'B': ['low', 'medium', 'high', 'low'],
    'C': ['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04'],
}
df = pd.DataFrame(raw_columns)
print("Original DataFrame:")
print(df.dtypes)

# Cast 'A' to integer and 'B' to category in a single astype call
df = df.astype({'A': int, 'B': 'category'})

# Parse the date strings in 'C' into datetime values
df['C'] = pd.to_datetime(df['C'])

print("\nDataFrame after type conversion:")
print(df.dtypes)

# Each category has an integer code; store those codes as a numeric encoding
category_codes = df['B'].cat.codes
df['B_encoded'] = category_codes
print("\nDataFrame with encoded 'B':")
print(df)

### 2.4 String manipulation with Pandas

In [None]:
# Sample contact list
contacts = {
    'Name': ['John Smith', 'Jane Doe', 'Mike Johnson'],
    'Email': ['john@example.com', 'jane@example.com', 'mike@example.com'],
}
df = pd.DataFrame(contacts)
print("Original DataFrame:")
print(df)

# First name: split on whitespace and take the first piece
name_parts = df['Name'].str.split()
df['First Name'] = name_parts.str.get(0)

# Domain: split on '@' and take the piece after it
email_parts = df['Email'].str.split('@')
df['Domain'] = email_parts.str.get(1)

# Upper-case copy of the full name
df['Name Upper'] = df['Name'].str.upper()

print("\nDataFrame after string operations:")
print(df)

## 3. Advanced Data Transformation

### 3.1 Reshaping data: pivot, melt, stack, and unstack

In [None]:
# Long-format sales records: one row per (date, product) pair
sales_records = {
    'Date': ['2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02'],
    'Product': ['A', 'B', 'A', 'B'],
    'Sales': [100, 150, 120, 180],
}
df = pd.DataFrame(sales_records)
print("Original DataFrame:")
print(df)

# Pivot: long -> wide, one row per date and one column per product
df_pivot = df.pivot(index='Date', columns='Product', values='Sales')
print("\nPivoted DataFrame:")
print(df_pivot)

# Melt: wide -> long again ('Date' must be a regular column first)
wide_with_date = df_pivot.reset_index()
df_melt = wide_with_date.melt(id_vars=['Date'], var_name='Product', value_name='Sales')
print("\nMelted DataFrame:")
print(df_melt)

# Stack: move the column labels into an inner index level
df_stack = df_pivot.stack()
print("\nStacked DataFrame:")
print(df_stack)

# Unstack: move the inner index level back out to the columns
df_unstack = df_stack.unstack()
print("\nUnstacked DataFrame:")
print(df_unstack)

### 3.2 Grouping and aggregation with groupby

In [None]:
# Two numeric measurements per row, each row tagged with a category
df = pd.DataFrame({
    'Category': ['A', 'B'] * 3,
    'Value1': [10, 20, 30, 40, 50, 60],
    'Value2': [100, 200, 300, 400, 500, 600],
})
print("Original DataFrame:")
print(df)

# Mean of every value column within each category
grouped_mean = df.groupby('Category').mean()
print("\nGrouped Mean:")
print(grouped_mean)

# A different list of aggregations for each column
aggregations = {
    'Value1': ['mean', 'sum'],
    'Value2': ['min', 'max'],
}
grouped_agg = df.groupby('Category').agg(aggregations)
print("\nGrouped with multiple aggregations:")
print(grouped_agg)


# Custom aggregation function
def range_func(x):
    """Return the spread of x: its largest value minus its smallest."""
    largest = x.max()
    smallest = x.min()
    return largest - smallest


# Apply the same custom function to both value columns
custom_spec = {column: range_func for column in ('Value1', 'Value2')}
grouped_custom = df.groupby('Category').agg(custom_spec)
print("\nGrouped with custom aggregation:")
print(grouped_custom)

### 3.3 Applying functions with apply, applymap (`DataFrame.map` in pandas ≥ 2.1), and map

In [None]:
# Create a sample DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [10, 20, 30],
    'C': ['a', 'b', 'c']
})
print("Original DataFrame:")
print(df)

# Using apply on a single column: the function is called once per value
df['A_squared'] = df['A'].apply(lambda x: x**2)
print("\nAfter applying square function to column A:")
print(df)

# Using apply on the entire DataFrame: with axis=0 the function is called once
# per column. Select the numeric columns first so that every call returns a
# scalar; returning the untouched column for non-numeric data would mix scalars
# and Series in one result. select_dtypes also avoids comparing against the
# literal 'int64', which is not the default integer dtype on every platform.
numeric_columns = df.select_dtypes(include='number')
df_applied = numeric_columns.apply(lambda column: column.max(), axis=0)
print("\nAfter applying max function to numeric columns:")
print(df_applied)

# Element-wise function application over the entire DataFrame.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated; fall back to applymap only on older pandas versions.
elementwise = df.map if hasattr(df, 'map') else df.applymap
df_applymap = elementwise(lambda x: x.upper() if isinstance(x, str) else x)
print("\nAfter applying uppercase to string values:")
print(df_applymap)

# Using map on a single column: look each value up in a dictionary
mapping = {'a': 'Apple', 'b': 'Banana', 'c': 'Cherry'}
df['C_mapped'] = df['C'].map(mapping)
print("\nAfter mapping values in column C:")
print(df)

### 3.4 Merging, joining, and concatenating DataFrames

In [None]:
# Create sample DataFrames
# df1 and df2 have different columns and partly overlapping row labels;
# df3 has the same columns and row labels as df1.
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=['K0', 'K1', 'K2'])
df2 = pd.DataFrame({'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']}, index=['K0', 'K2', 'K3'])
df3 = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=['K0', 'K1', 'K2'])

print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)
print("\nDataFrame 3:")
print(df3)

# Merging DataFrames on their indexes; how='outer' keeps every row label
# found in either frame and fills the gaps with NaN
merged = pd.merge(df1, df2, left_index=True, right_index=True, how='outer')
print("\nMerged DataFrame:")
print(merged)

# Joining DataFrames: join() aligns on the index by default
joined = df1.join(df2, how='outer')
print("\nJoined DataFrame:")
print(joined)

# Concatenating DataFrames: rows of df3 are appended below those of df1.
# Row labels are kept as-is, so K0..K2 each appear twice in the result.
concatenated = pd.concat([df1, df3])
print("\nConcatenated DataFrame:")
print(concatenated)

## 4. Advanced Indexing and Selection

### 4.1 Boolean indexing and masking

In [None]:
# Small table to filter
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': list('abcde'),
})
print("Original DataFrame:")
print(df)

# A comparison yields a boolean Series; passing it to .loc keeps the True rows
mask = df['A'].gt(2)
print("\nRows where A > 2:")
print(df.loc[mask])

# Combine conditions element-wise with & (logical and)
a_above_two = df['A'].gt(2)
b_below_fifty = df['B'].lt(50)
mask = a_above_two & b_below_fifty
print("\nRows where A > 2 and B < 50:")
print(df.loc[mask])

# isin tests membership in a collection of allowed values
wanted_letters = ['a', 'c', 'e']
mask = df['C'].isin(wanted_letters)
print("\nRows where C is 'a', 'c', or 'e':")
print(df.loc[mask])

### 4.2 Hierarchical indexing (MultiIndex)

In [None]:
# Two parallel label lists: the outer level, then the inner level
outer_labels = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
inner_labels = ['one', 'two'] * 4
arrays = [outer_labels, inner_labels]

# Build the two-level index explicitly, naming both levels up front
row_index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 4), index=row_index)
print("DataFrame with MultiIndex:")
print(df)

# A single label selects on the outer level and returns all its inner rows
bar_rows = df.loc['bar']
print("\nSelecting 'bar' from first level:")
print(bar_rows)

# A tuple gives one label per level and selects a single row
bar_one_row = df.loc[('bar', 'one')]
print("\nSelecting 'bar' and 'one' from first and second levels:")
print(bar_one_row)

# xs selects on an inner level without specifying the outer one
inner_one_rows = df.xs('one', level='second')
print("\nCross-section selection for second level 'one':")
print(inner_one_rows)

### 4.3 Advanced loc and iloc usage

In [None]:
# Create a sample DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': ['a', 'b', 'c', 'd', 'e']
}, index=['row1', 'row2', 'row3', 'row4', 'row5'])
print("Original DataFrame:")
print(df)

# Using loc
print("\nSelecting rows 'row2' to 'row4' and columns 'A' and 'C' using loc:")
print(df.loc['row2':'row4', ['A', 'C']])

# Using iloc
print("\nSelecting rows 1 to 3 and columns 0 and 2 using iloc:")
print(df.iloc[1:4, [0, 2]])

# Mixing loc and iloc
print("\nMixing loc and iloc:")
print(df.iloc[1:4].loc[:, ['A', 'C']])

## 5. Time Series Analysis

### 5.1 Working with datetime data

In [None]:
# Create a DataFrame with datetime index
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print("DataFrame with datetime index:")
print(df)

# Selecting specific dates
print("\nSelecting a specific date:")
print(df.loc['2021-01-03'])

# Selecting date ranges
print("\nSelecting a date range:")
print(df['2021-01-02':'2021-01-04'])

# Date arithmetic
print("\nAdding 2 days to the index:")
print(df.index + pd.Timedelta(days=2))

### 5.2 Resampling and rolling window calculations

In [None]:
# Create a time series DataFrame
dates = pd.date_range('20210101', periods=100, freq='D')
ts = pd.Series(np.random.randn(len(dates)), index=dates)
print("Original time series:")
print(ts.head())

# Resampling to monthly frequency
monthly = ts.resample('M').mean()
print("\nMonthly resampled data:")
print(monthly)

# Rolling window calculations
rolling = ts.rolling(window=7).mean()
print("\n7-day rolling average:")
print(rolling.head(10))

### 5.3 Handling time zones and periods

In [None]:
# Five daily timestamps localized to US Eastern time
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D', tz='US/Eastern')
eastern_values = np.random.randn(rng.size)
ts = pd.Series(eastern_values, index=rng)
print("Time series with time zone:")
print(ts)

# tz_convert re-expresses the same instants in another zone
utc_series = ts.tz_convert('UTC')
print("\nConverted to UTC:")
print(utc_series)

# Periods are spans of time rather than instants; 'Q-NOV' means quarters of
# a year that ends in November
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
quarterly_values = np.random.randn(prng.size)
ts = pd.Series(quarterly_values, index=prng)
print("\nTime series with period index:")
print(ts.head())

## 6. Performance Optimization

### 6.1 Using categorical data types

In [None]:
# Create a DataFrame with categorical data
df = pd.DataFrame({
    'id': range(1000000),
    'value': np.random.randint(0, 100, size=1000000),
    'category': np.random.choice(['A', 'B', 'C', 'D'], size=1000000)
})

print("Memory usage before optimization:")
print(df.memory_usage(deep=True))

# Convert 'category' column to categorical type
df['category'] = df['category'].astype('category')

print("\nMemory usage after optimization:")
print(df.memory_usage(deep=True))

### 6.2 Iterating over rows: itertuples vs. iterrows

In [None]:
import time

# Create a sample DataFrame
df = pd.DataFrame({
    'A': range(10000),
    'B': range(10000, 20000),
    'C': range(20000, 30000)
})

# Using iterrows
start = time.time()
for index, row in df.iterrows():
    _ = row['A'] + row['B'] + row['C']
print(f"Time taken with iterrows: {time.time() - start:.4f} seconds")

# Using itertuples
start = time.time()
for row in df.itertuples(index=False):
    _ = row.A + row.B + row.C
print(f"Time taken with itertuples: {time.time() - start:.4f} seconds")

### 6.3 Vectorization techniques

In [None]:
# Create a sample DataFrame
df = pd.DataFrame({
    'A': np.random.randn(1000000),
    'B': np.random.randn(1000000)
})

# Non-vectorized operation
start = time.time()
result = []
for i in range(len(df)):
    result.append(df.iloc[i]['A'] + df.iloc[i]['B'])
print(f"Time taken with loop: {time.time() - start:.4f} seconds")

# Vectorized operation
start = time.time()
result = df['A'] + df['B']
print(f"Time taken with vectorization: {time.time() - start:.4f} seconds")

### 6.4 Using numba with Pandas for performance boost

In [None]:
from numba import jit
import numpy as np
import time  # imported here so the cell also runs on its own


@jit(nopython=True)
def add_columns_numba(A, B):
    """Return the element-wise sum of the arrays A and B (compiled by Numba)."""
    return A + B


# Create a sample DataFrame
df = pd.DataFrame({
    'A': np.random.randn(1000000),
    'B': np.random.randn(1000000)
})

# Numba operates on NumPy arrays, not on pandas Series, so extract the
# underlying arrays once, outside the timed regions
a_values = df['A'].values
b_values = df['B'].values

# Warm-up call: Numba compiles the function the first time it is called with a
# given argument type. Without this, the measurement below would mostly
# report compilation time rather than execution time.
add_columns_numba(a_values, b_values)

# Pandas operation
start = time.perf_counter()
result_pandas = df['A'] + df['B']
print(f"Time taken with Pandas: {time.perf_counter() - start:.4f} seconds")

# Numba operation (already compiled by the warm-up call)
start = time.perf_counter()
result_numba = add_columns_numba(a_values, b_values)
print(f"Time taken with Numba: {time.perf_counter() - start:.4f} seconds")

# Verify results are the same
print(f"Results are equal: {np.allclose(result_pandas, result_numba)}")

## Conclusion

This comprehensive lecture has covered advanced topics in Pandas, including data manipulation, analysis, and performance optimization. By mastering these concepts, you'll be well-equipped to handle complex data tasks efficiently using Pandas.

Remember to practice these techniques with real-world datasets to solidify your understanding and improve your skills in data analysis with Python and Pandas.