# Pandas Practical Exercises

This notebook contains four practical exercises demonstrating advanced Pandas techniques for data manipulation and analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Exercise 1: Data Cleaning Challenge

Create a messy dataset and clean it comprehensively: handle missing values, outliers, and inconsistent data types. Then write a function that automates the cleaning process for similar datasets.

In [None]:
# Build a deliberately messy dataset: one missing name, one non-numeric
# age, salaries stored as comma-grouped strings, and dates written with
# three different separators.
df = pd.DataFrame(
    data={
        'Name': ['John', 'Jane', 'Bob', 'Alice', np.nan],
        'Age': [25, 30, 'Unknown', 35, 40],
        'Salary': ['50,000', '60,000', '55,000', '1,000,000', '45,000'],
        'Date': ['2021-01-01', '2021-01-02', '2021/01/03', '2021.01.04', '2021-01-05'],
    }
)

print("Original DataFrame:", df, sep="\n")

def clean_dataset(df, *, age_bounds=(18, 100), salary_bounds=(30000, 500000)):
    """Return a cleaned copy of a people/salary DataFrame.

    The input frame is left untouched; all work happens on a copy.

    Cleaning steps:
      * ``Name``: missing values are replaced with ``'Unknown'``.
      * ``Age``: converted to numeric (unparseable entries become NaN),
        then clipped to ``age_bounds``.
      * ``Salary``: thousands separators (``,``) are stripped, the column
        is cast to float and clipped to ``salary_bounds``.
      * ``Date``: parsed to datetime element by element, so rows may use
        different separators.

    Args:
        df: DataFrame with ``Name``, ``Age``, ``Salary`` (strings) and
            ``Date`` columns.
        age_bounds: ``(min, max)`` allowed ages; values outside are
            clipped to the nearest bound.
        salary_bounds: ``(min, max)`` allowed salaries; values outside are
            clipped to the nearest bound.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        ValueError: if a salary is still non-numeric after removing
            commas, or a date string cannot be parsed.
    """
    age_min, age_max = age_bounds
    salary_min, salary_max = salary_bounds

    df_clean = df.copy()

    # Assign the filled column back rather than calling
    # fillna(inplace=True) on the selected column: the chained inplace
    # form only updates a temporary object under pandas Copy-on-Write.
    df_clean['Name'] = df_clean['Name'].fillna('Unknown')

    # Non-numeric ages (e.g. 'Unknown') become NaN; clip() leaves NaN as is.
    ages = pd.to_numeric(df_clean['Age'], errors='coerce')
    df_clean['Age'] = ages.clip(lower=age_min, upper=age_max)

    # regex=False: the comma is a literal separator, not a pattern.
    salaries = df_clean['Salary'].str.replace(',', '', regex=False).astype(float)
    df_clean['Salary'] = salaries.clip(lower=salary_min, upper=salary_max)

    # format='mixed' infers the format of each element individually.
    df_clean['Date'] = pd.to_datetime(df_clean['Date'], format='mixed')

    return df_clean

# Run the cleaning pipeline on the messy frame, then show the result
# together with the dtype each column ended up with.
df_cleaned = clean_dataset(df)

print("\nCleaned DataFrame:", df_cleaned, df_cleaned.dtypes, sep="\n")

## Exercise 2: Advanced Data Transformation Project

Perform complex reshaping operations on a multi-dimensional dataset, implement advanced groupby operations with custom aggregation functions, and merge multiple datasets handling various join scenarios.

In [None]:
# Sample data: ten consecutive days of sales cycling through products
# A, B and C, plus a price list that also contains a product ('D')
# with no sales rows.
df1 = pd.DataFrame({
    'Date': pd.date_range('2021-01-01', periods=10),
    'Product': list('ABCABCABCA'),
    'Sales': np.random.randint(100, 1000, size=10),
    'Quantity': np.random.randint(10, 100, size=10),
})

df2 = pd.DataFrame({
    'Product': list('ABCD'),
    'Price': [100, 200, 150, 300],
})

print("DataFrame 1:", df1, "\nDataFrame 2:", df2, sep="\n")

# Wide layout: one row per date, one (measure, product) column pair for
# each of Sales and Quantity.
df_pivot = df1.pivot(
    index='Date',
    columns='Product',
    values=['Sales', 'Quantity'],
)
print("\nPivoted DataFrame:", df_pivot, sep="\n")

# Custom aggregation function
def sales_per_unit(x):
    """Return total ``Sales`` divided by total ``Quantity`` for the rows in ``x``."""
    total_sales = x['Sales'].sum()
    total_quantity = x['Quantity'].sum()
    return total_sales / total_quantity

# Groupby with custom aggregation
# Built-in aggregations are computed per product with agg(); the custom
# ratio is added with apply(). The apply() call is restricted to the two
# columns sales_per_unit reads, so it never operates on the grouping
# column (that behaviour is deprecated in recent pandas releases).
df_grouped = df1.groupby('Product').agg({
    'Sales': ['sum', 'mean'],
    'Quantity': ['sum', 'mean'],
    'Date': ['min', 'max']
}).assign(
    SalesPerUnit=df1.groupby('Product')[['Sales', 'Quantity']].apply(sales_per_unit)
)

print("\nGrouped DataFrame with custom aggregation:")
print(df_grouped)

# Merge datasets
# An outer join keeps products that appear in only one frame; a product
# listed in df2 without sales rows gets NaN in the df1 columns, and
# therefore NaN in Revenue.
df_merged = pd.merge(df1, df2, on='Product', how='outer')
df_merged['Revenue'] = df_merged['Sales'] * df_merged['Price']

print("\nMerged DataFrame:")
print(df_merged)

## Exercise 3: Time Series Analysis

Analyze a financial dataset with stock prices, implement rolling statistics and technical indicators, and perform resampling to different time frequencies handling business days.

In [None]:
# Synthetic stock data: one row per business day of 2021 with a random
# price in [100, 200) and a random traded volume.
dates = pd.date_range('2021-01-01', '2021-12-31', freq='B')
prices = np.random.randint(100, 200, size=dates.size) + np.random.random(size=dates.size)
volumes = np.random.randint(1000000, 5000000, size=dates.size)

df_stock = pd.DataFrame(
    {'Date': dates, 'Price': prices, 'Volume': volumes}
).set_index('Date')

print("Stock price dataset:", df_stock.head(), sep="\n")

# Rolling statistics: 20- and 50-day simple moving averages of the price.
# The first window-1 rows of each average are NaN.
price_series = df_stock['Price']
df_stock = df_stock.assign(
    MA20=price_series.rolling(window=20).mean(),
    MA50=price_series.rolling(window=50).mean(),
)

# Calculate technical indicator: Relative Strength Index (RSI)
def calculate_rsi(data, periods=14):
    """Compute the Relative Strength Index using simple moving averages.

    Args:
        data: Series of prices.
        periods: number of price changes averaged in each window.

    Returns:
        Series aligned with ``data``. The first ``periods`` entries are
        NaN because a full window needs ``periods`` price changes, i.e.
        ``periods + 1`` prices. A window with gains but no losses yields
        100; a window with neither gains nor losses yields NaN.
    """
    delta = data.diff()

    # clip() keeps the leading NaN produced by diff(), so the first RSI
    # value is only emitted once `periods` real price changes are
    # available. Replacing that NaN with 0 would count a non-existent
    # "no change" step in the first window.
    gain = delta.clip(lower=0).rolling(window=periods).mean()
    loss = (-delta).clip(lower=0).rolling(window=periods).mean()

    rs = gain / loss
    return 100 - (100 / (1 + rs))

df_stock['RSI'] = calculate_rsi(df_stock['Price'])

print("\nDataFrame with technical indicators:")
print(df_stock.head())

# Resample to monthly frequency
# An explicit MonthEnd offset is used instead of the 'M' string alias,
# which newer pandas releases deprecate in favour of 'ME'; the offset
# object is accepted by both older and newer versions.
# Per month: the last price and indicator values, and the total volume.
df_monthly = df_stock.resample(pd.offsets.MonthEnd()).agg({
    'Price': 'last',
    'Volume': 'sum',
    'MA20': 'last',
    'MA50': 'last',
    'RSI': 'last'
})

print("\nMonthly resampled data:")
print(df_monthly)

# Plot the stock price with moving averages
plt.figure(figsize=(12, 6))
plt.plot(df_stock.index, df_stock['Price'], label='Price')
plt.plot(df_stock.index, df_stock['MA20'], label='20-day MA')
plt.plot(df_stock.index, df_stock['MA50'], label='50-day MA')
plt.title('Stock Price with Moving Averages')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.show()

## Exercise 4: Optimization Challenge

Optimize a slow Pandas operation on a large dataset, implement parallel processing for Pandas operations, and profile and compare the performance of different approaches.

In [None]:
import time
from numba import jit
import multiprocessing

# Large benchmark frame: ten million rows, three integer columns with
# values drawn uniformly from [0, 100).
n = 10_000_000
df = pd.DataFrame(
    {column: np.random.randint(0, 100, n) for column in ('A', 'B', 'C')}
)

# Slow operation
def slow_operation(df):
    """Compute ``A * B + C`` one row at a time via ``DataFrame.apply``.

    This is the deliberately slow baseline for the benchmark: the
    per-row function is called once for every row of ``df``.
    """
    def combine(row):
        return row['A'] * row['B'] + row['C']

    return df.apply(combine, axis=1)

# Vectorized operation
def vectorized_operation(df):
    """Compute ``A * B + C`` for every row using whole-column arithmetic."""
    product = df['A'] * df['B']
    return product + df['C']

# Numba optimized operation
@jit(nopython=True)
def numba_operation(A, B, C):
    """Return ``A * B + C``, compiled by Numba in nopython mode.

    The profiling code in this notebook calls it with the ``.values``
    arrays of the DataFrame's ``A``, ``B`` and ``C`` columns.
    """
    return A * B + C

# Parallel processing operation
def parallel_operation(df):
    """Compute ``vectorized_operation`` on row chunks of ``df`` in parallel.

    The frame is cut into contiguous row slices, one per worker process;
    each worker runs ``vectorized_operation`` on its slice and the partial
    results are concatenated in the original row order.

    Args:
        df: DataFrame with the columns ``vectorized_operation`` expects.

    Returns:
        The concatenated per-chunk results, in the same row order as ``df``.
    """
    n_rows = len(df)
    # Never start more workers than there are rows to hand out.
    num_workers = max(1, min(multiprocessing.cpu_count(), n_rows))
    if num_workers == 1:
        # Empty/one-row frame or a single core: a pool would only add overhead.
        return vectorized_operation(df)

    # Slice with iloc instead of np.array_split, which relies on
    # deprecated DataFrame behaviour when handed a DataFrame.
    bounds = np.linspace(0, n_rows, num_workers + 1, dtype=int)
    chunks = [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    # The context manager shuts the worker processes down even if map()
    # raises; the original pool was never closed.
    # NOTE(review): with the "spawn" start method, workers must be able to
    # import vectorized_operation — confirm this works where the notebook runs.
    with multiprocessing.Pool(num_workers) as pool:
        results = pool.map(vectorized_operation, chunks)

    return pd.concat(results)

# Profile and compare performance
def profile_operation(operation, df, name):
    """Run ``operation(df)``, print how long it took, and return its result.

    Args:
        operation: callable taking ``df`` as its only argument.
        df: the object passed to ``operation``.
        name: label used in the printed timing line.

    Returns:
        Whatever ``operation(df)`` returns.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring intervals; time.time() can jump when the system clock is
    # adjusted and may be too coarse for fast operations.
    start_time = time.perf_counter()
    result = operation(df)
    elapsed = time.perf_counter() - start_time
    print(f"{name} took {elapsed:.2f} seconds")
    return result

# Time each implementation on the same frame. The Numba variant works on
# raw arrays, so it is wrapped to pull the column values out first.
slow_result = profile_operation(slow_operation, df, "Slow operation")
vectorized_result = profile_operation(vectorized_operation, df, "Vectorized operation")
numba_result = profile_operation(
    lambda frame: numba_operation(
        frame['A'].values, frame['B'].values, frame['C'].values
    ),
    df,
    "Numba operation",
)
parallel_result = profile_operation(parallel_operation, df, "Parallel operation")

# Every faster implementation must agree with the row-wise baseline.
results_match = all(
    np.allclose(slow_result, candidate)
    for candidate in (vectorized_result, numba_result, parallel_result)
)
print(f"\nAll results are equal: {results_match}")

These exercises demonstrate advanced Pandas techniques for data cleaning, transformation, time series analysis, and performance optimization. They provide practical experience in handling real-world data challenges and improving code efficiency.