# Task 01 Solution: Data Pipeline with Multiple Formats

Complete solutions for building an ETL pipeline working with CSV, JSON, JSONL, and Parquet formats.

In [None]:
import pandas as pd
import numpy as np
import json
import jsonlines
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path
import time
import os

## Task 1.1: Generate Sample E-commerce Transaction Data

Create 100,000 transaction records with realistic patterns.

In [None]:
# Solution: Generate synthetic transaction data
# Seed NumPy's global RNG so every run of this cell yields the same dataset.
np.random.seed(42)

n_transactions = 100_000

# Generate data.
# NOTE: the random columns are drawn in the order written here; reordering
# them would change the values produced for the seed above.
df = pd.DataFrame({
    'transaction_id': range(1, n_transactions + 1),
    'user_id': np.random.randint(1000, 50000, n_transactions),
    'product_id': np.random.randint(1, 1000, n_transactions),
    'category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Books', 'Sports'], n_transactions),
    'amount': np.random.uniform(10, 1000, n_transactions).round(2),
    # randint's upper bound is exclusive, so quantities fall in 1..9.
    'quantity': np.random.randint(1, 10, n_transactions),
    # One transaction every five minutes starting at 2024-01-01 00:00.
    'timestamp': pd.date_range('2024-01-01', periods=n_transactions, freq='5min'),
    'payment_method': np.random.choice(['credit_card', 'debit_card', 'paypal', 'crypto'], n_transactions),
    'country': np.random.choice(['US', 'UK', 'CA', 'DE', 'FR', 'JP'], n_transactions)
})

# Add derived columns (year/month are later used as partition keys).
df['total_amount'] = df['amount'] * df['quantity']
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day

print(f"Generated {len(df):,} transactions")
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nFirst few rows:")
print(df.head())

# Verify
assert len(df) == n_transactions, "Should have 100k transactions"
assert df['amount'].min() >= 10, "Amount should be >= 10"
assert df['amount'].max() <= 1000, "Amount should be <= 1000"
print("\n✅ Data generated successfully!")

## Task 1.2: Save to CSV and Measure Size

Export to CSV and check file size.

In [None]:
# Solution: Save to CSV
csv_path = 'transactions.csv'

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations; time.time() is wall-clock time and can jump if
# the system clock is adjusted mid-measurement.
start_time = time.perf_counter()
df.to_csv(csv_path, index=False)
csv_write_time = time.perf_counter() - start_time

# Size on disk in MiB (bytes / 1024**2); later cells use it as the baseline
# when computing compression ratios.
csv_size_mb = os.path.getsize(csv_path) / (1024**2)

print(f"CSV write time: {csv_write_time:.2f}s")
print(f"CSV file size: {csv_size_mb:.2f} MB")

# Test read speed
start_time = time.perf_counter()
df_csv = pd.read_csv(csv_path)
csv_read_time = time.perf_counter() - start_time

print(f"CSV read time: {csv_read_time:.2f}s")

# Verify
assert os.path.exists(csv_path), "CSV file should exist"
assert len(df_csv) == len(df), "Should read same number of rows"
print("\n✅ CSV operations completed!")

## Task 1.3: Convert to Parquet with Different Compressions

Compare Snappy, Gzip, and Zstd compression algorithms.

In [None]:
# Solution: Benchmark different compressions
# For each codec: write the full DataFrame, read it back entirely, read a
# three-column subset, and record timings, file size, and size ratio vs. CSV.
results = []

for compression in ['snappy', 'gzip', 'zstd']:
    parquet_path = f'transactions_{compression}.parquet'

    # Write (perf_counter is monotonic, so the deltas are reliable timings)
    start_time = time.perf_counter()
    df.to_parquet(parquet_path, compression=compression, index=False)
    write_time = time.perf_counter() - start_time

    # Read all columns
    start_time = time.perf_counter()
    df_read = pd.read_parquet(parquet_path)
    read_all_time = time.perf_counter() - start_time

    # Read subset of columns (column pruning)
    start_time = time.perf_counter()
    df_subset = pd.read_parquet(parquet_path, columns=['user_id', 'amount', 'category'])
    read_subset_time = time.perf_counter() - start_time

    # Verify inside the loop so every codec's round trip is checked; checking
    # after the loop would only cover the last codec.
    assert len(df_read) == len(df), "Should preserve all rows"
    assert len(df_subset.columns) == 3, "Subset should have 3 columns"

    # File size in MiB and how many times smaller it is than the CSV baseline
    file_size_mb = os.path.getsize(parquet_path) / (1024**2)
    compression_ratio = csv_size_mb / file_size_mb

    # Values are stored pre-formatted as strings because the table is only
    # used for display (here and in the final comparison cell).
    results.append({
        'compression': compression,
        'write_time': f"{write_time:.2f}s",
        'read_all_time': f"{read_all_time:.2f}s",
        'read_subset_time': f"{read_subset_time:.2f}s",
        'file_size_mb': f"{file_size_mb:.2f} MB",
        'compression_ratio': f"{compression_ratio:.2f}x"
    })

# Display comparison
results_df = pd.DataFrame(results)
print("\nParquet Compression Benchmark:")
print(results_df.to_string(index=False))

print(f"\nCSV baseline: {csv_size_mb:.2f} MB, read time: {csv_read_time:.2f}s")

print("\n✅ Parquet compression benchmark completed!")

## Task 1.4: Create Partitioned Parquet Dataset

Partition by year and month for faster filtered queries.

In [None]:
# Solution: Write partitioned Parquet
partitioned_path = 'transactions_partitioned/'

# Remove existing directory if present
# (presumably so files left by an earlier run are not mixed into the dataset)
import shutil
if os.path.exists(partitioned_path):
    shutil.rmtree(partitioned_path)

# Write with partitioning: one sub-directory per (year, month) combination
df.to_parquet(
    partitioned_path,
    partition_cols=['year', 'month'],
    compression='snappy',
    index=False
)

# List partition structure
print("Partition structure:")
# Strip the trailing slash so the root directory has a real basename, and
# derive each directory's depth from its path relative to that root. Counting
# separators in a string-replaced path would put the root and its direct
# children at the same depth.
tree_root = os.path.normpath(partitioned_path)
for root, dirs, files in os.walk(tree_root):
    dirs.sort()  # in-place sort: os.walk then descends in a stable order
    rel_path = os.path.relpath(root, tree_root)
    level = 0 if rel_path == os.curdir else rel_path.count(os.sep) + 1
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    sub_indent = ' ' * 2 * (level + 1)
    for file in files[:3]:  # Show first 3 files per directory
        print(f"{sub_indent}{file}")
    if len(files) > 3:
        print(f"{sub_indent}... and {len(files) - 3} more files")

# Test reading with filters: filtering on the partition columns lets the
# reader skip directories belonging to other year/month values.
start_time = time.perf_counter()
df_filtered = pd.read_parquet(
    partitioned_path,
    filters=[('year', '==', 2024), ('month', '==', 1)]
)
filter_time = time.perf_counter() - start_time

print(f"\nFiltered read (year=2024, month=1): {filter_time:.3f}s")
print(f"Rows returned: {len(df_filtered):,}")

# Verify
assert os.path.exists(partitioned_path), "Partitioned directory should exist"
assert len(df_filtered) > 0, "Should return some rows"
# Check the actual values, not just that a single distinct value exists.
# astype(int) covers partition columns that are read back as categoricals.
assert (df_filtered['year'].astype(int) == 2024).all(), "Should only contain year 2024"
assert (df_filtered['month'].astype(int) == 1).all(), "Should only contain month 1"
print("\n✅ Partitioned Parquet created and tested!")

## Task 1.5: Work with JSONL Format

Convert transaction data to JSONL for streaming processing.

In [None]:
# Solution: Write to JSONL
jsonl_path = 'transactions.jsonl'

# Convert to JSONL (sample first 10k for reasonable file size)
df_sample = df.head(10_000).copy()
# Timestamp objects are not JSON-serialisable, so store them as strings.
df_sample['timestamp'] = df_sample['timestamp'].astype(str)

start_time = time.perf_counter()
with jsonlines.open(jsonl_path, mode='w') as writer:
    # to_dict(orient='records') produces one plain dict per row in a single
    # pass; iterating with iterrows() would build a pandas Series for every
    # row first, which is far slower for the same output.
    writer.write_all(df_sample.to_dict(orient='records'))
jsonl_write_time = time.perf_counter() - start_time

jsonl_size_mb = os.path.getsize(jsonl_path) / (1024**2)

print(f"JSONL write time: {jsonl_write_time:.2f}s")
print(f"JSONL file size: {jsonl_size_mb:.2f} MB")

# Read JSONL in streaming fashion: records are consumed one at a time and the
# loop stops early, so the rest of the file is never parsed.
start_time = time.perf_counter()
records = []
with jsonlines.open(jsonl_path) as reader:
    for obj in reader:
        records.append(obj)
        if len(records) >= 1000:  # Stream first 1000
            break
jsonl_read_time = time.perf_counter() - start_time

print(f"JSONL streaming read (1000 records): {jsonl_read_time:.3f}s")

# Show sample record
print("\nSample JSONL record:")
print(json.dumps(records[0], indent=2))

# Verify
assert os.path.exists(jsonl_path), "JSONL file should exist"
assert len(records) == 1000, "Should read 1000 records"
assert 'transaction_id' in records[0], "Should contain transaction_id"
print("\n✅ JSONL operations completed!")

## Task 1.6: Process Nested JSON

Handle hierarchical JSON data with nested structures.

In [None]:
# Solution: Create nested JSON structure
# Build 100 synthetic transactions, each with a nested user (including an
# address), a list of 1-3 line items, and a payment block.
nested_data = []

for txn_number in range(1, 101):
    # The random draws below happen in the same order as the fields appear in
    # the final record, so a given RNG state always yields the same data.
    user = {
        'user_id': np.random.randint(1000, 5000),
        'name': f"User_{np.random.randint(1000, 5000)}",
        'email': f"user{np.random.randint(1000, 5000)}@example.com",
        'address': {
            'city': np.random.choice(['NYC', 'LA', 'Chicago', 'Houston']),
            'country': np.random.choice(['US', 'UK', 'CA'])
        }
    }

    # randint's upper bound is exclusive: between 1 and 3 items per transaction.
    item_count = np.random.randint(1, 4)
    items = []
    for _ in range(item_count):
        items.append({
            'product_id': np.random.randint(1, 100),
            'quantity': np.random.randint(1, 5),
            'price': round(np.random.uniform(10, 500), 2)
        })

    payment = {
        'method': np.random.choice(['credit_card', 'paypal']),
        'status': 'completed'
    }

    nested_data.append({
        'transaction_id': txn_number,
        'user': user,
        'items': items,
        'payment': payment,
        'timestamp': '2024-01-01T10:00:00'
    })

# Save nested JSON
nested_json_path = 'transactions_nested.json'
serialized = json.dumps(nested_data, indent=2)
with open(nested_json_path, 'w') as out_file:
    out_file.write(serialized)

print("Sample nested transaction:")
print(json.dumps(nested_data[0], indent=2))

# Solution: Flatten nested JSON using pandas
# Each element of 'items' becomes one row; the listed meta fields are copied
# from the parent transaction onto every one of its item rows.
meta_fields = [
    'transaction_id',
    ['user', 'user_id'],
    ['user', 'name'],
    ['user', 'address', 'city'],
    ['user', 'address', 'country'],
    ['payment', 'method'],
    'timestamp'
]
df_nested = pd.json_normalize(
    nested_data,
    record_path='items',
    meta=meta_fields,
    meta_prefix='',
    record_prefix='item_'
)

print("\nFlattened DataFrame:")
print(df_nested.head())
print(f"\nColumns: {df_nested.columns.tolist()}")

# Verify
assert len(df_nested) > len(nested_data), "Should have more rows (items expanded)"
assert 'user.user_id' in df_nested.columns, "Should have flattened user columns"
assert 'item_product_id' in df_nested.columns, "Should have item columns"
print("\n✅ Nested JSON processed successfully!")

## Task 1.7: Chunked CSV to Parquet Conversion

Process large CSV files in chunks to avoid memory issues.

In [None]:
# Solution: Chunked conversion function
def csv_to_parquet_chunked(csv_path, parquet_path, chunksize=10_000, compression='snappy'):
    """
    Convert CSV to Parquet in chunks to avoid memory issues.

    The CSV is read `chunksize` rows at a time and each chunk is appended to
    a single Parquet file. The schema of the first chunk defines the file's
    schema; every later chunk is converted to that same schema, because
    pandas infers dtypes per chunk and a differing inference would otherwise
    make the write fail with a schema mismatch.

    Args:
        csv_path: Input CSV file
        parquet_path: Output Parquet file
        chunksize: Rows per chunk
        compression: Parquet compression codec passed to the writer
            (defaults to 'snappy')

    Returns:
        Total number of rows written. If the reader yields no chunks, no
        Parquet file is created and 0 is returned.

    Raises:
        Whatever pandas or pyarrow raise for unreadable input or for a chunk
        whose values cannot be represented in the first chunk's schema.
    """
    writer = None
    schema = None
    total_rows = 0

    # The chunk reader is a context manager (pandas >= 1.2); using it closes
    # the CSV file handle even if a conversion or write fails part-way.
    with pd.read_csv(csv_path, chunksize=chunksize) as chunks:
        try:
            for i, chunk in enumerate(chunks, start=1):
                if writer is None:
                    # First chunk: let Arrow infer the schema and open the
                    # writer with it.
                    table = pa.Table.from_pandas(chunk, preserve_index=False)
                    schema = table.schema
                    writer = pq.ParquetWriter(
                        parquet_path,
                        schema,
                        compression=compression
                    )
                else:
                    # Later chunks: convert to the established schema.
                    table = pa.Table.from_pandas(
                        chunk, schema=schema, preserve_index=False
                    )

                # Write chunk
                writer.write_table(table)
                total_rows += len(chunk)

                # Progress message after every fifth chunk
                if i % 5 == 0:
                    print(f"Processed {total_rows:,} rows...")
        finally:
            # Always finalize the Parquet file once it has been opened.
            if writer is not None:
                writer.close()

    return total_rows

# Test chunked conversion
chunked_parquet_path = 'transactions_chunked.parquet'

print("Converting CSV to Parquet in chunks...")
# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for elapsed-time measurements (time.time() can jump with the system
# clock).
start_time = time.perf_counter()
total_rows = csv_to_parquet_chunked(csv_path, chunked_parquet_path, chunksize=20_000)
conversion_time = time.perf_counter() - start_time

print(f"\nConversion completed in {conversion_time:.2f}s")
print(f"Total rows processed: {total_rows:,}")

# Verify the output by reading the whole Parquet file back
df_verify = pd.read_parquet(chunked_parquet_path)
print(f"Rows in output file: {len(df_verify):,}")

# Verify
assert len(df_verify) == total_rows, "Should preserve all rows"
assert len(df_verify) == len(df), "Should match original data"
print("\n✅ Chunked conversion successful!")

## Task 1.8: Format Comparison Summary

Create a comprehensive comparison of all formats.

In [None]:
# Solution: Compare all formats
# Builds one summary row per format from the measurements taken in the
# earlier cells, then prints the table, key findings, and recommendations.
comparison = []

# CSV
comparison.append({
    'Format': 'CSV',
    'Size (MB)': f"{csv_size_mb:.2f}",
    'Read Time': f"{csv_read_time:.2f}s",
    'Write Time': f"{csv_write_time:.2f}s",
    'Human Readable': 'Yes',
    'Column Pruning': 'No',
    'Compression': 'No'
})

# Parquet: one row for every codec in the compression benchmark, so that no
# benchmarked codec (e.g. Zstd) is silently left out of the summary.
parquet_timings = results_df.set_index('compression')
parquet_sizes_mb = {}
for codec in parquet_timings.index:
    codec_label = codec.capitalize()
    size_mb = os.path.getsize(f'transactions_{codec}.parquet') / (1024**2)
    parquet_sizes_mb[codec] = size_mb
    comparison.append({
        'Format': f'Parquet ({codec_label})',
        'Size (MB)': f"{size_mb:.2f}",
        # Timings were stored as pre-formatted strings by the benchmark cell.
        'Read Time': parquet_timings.loc[codec, 'read_all_time'],
        'Write Time': parquet_timings.loc[codec, 'write_time'],
        'Human Readable': 'No',
        'Column Pruning': 'Yes',
        'Compression': codec_label
    })

# Sizes referenced by the key findings below
snappy_size = parquet_sizes_mb['snappy']
gzip_size = parquet_sizes_mb['gzip']

# JSONL (measured on a sample only, hence the label and the note printed
# under the table)
comparison.append({
    'Format': 'JSONL (sample)',
    'Size (MB)': f"{jsonl_size_mb:.2f}",
    'Read Time': f"{jsonl_read_time:.3f}s",
    'Write Time': f"{jsonl_write_time:.2f}s",
    'Human Readable': 'Yes',
    'Column Pruning': 'No',
    'Compression': 'Optional'
})

comparison_df = pd.DataFrame(comparison)
print("\n" + "="*80)
print("FILE FORMAT COMPARISON")
print("="*80)
print(comparison_df.to_string(index=False))
# The JSONL numbers are not directly comparable with the other rows: they
# cover fewer rows, so say so explicitly.
print(
    f"Note: JSONL size/write time cover a {len(df_sample):,}-row sample and "
    f"its read time covers the first {len(records):,} records; "
    f"CSV and Parquet figures cover all {len(df):,} rows."
)

print("\n" + "="*80)
print("KEY FINDINGS:")
print("="*80)
print(f"1. Parquet (Snappy) is {csv_size_mb/snappy_size:.1f}x smaller than CSV")
print(f"2. Parquet (Gzip) is {csv_size_mb/gzip_size:.1f}x smaller than CSV")
print("3. Parquet supports column pruning (read only needed columns)")
print("4. JSONL is best for streaming line-by-line processing")
print("5. CSV is human-readable but inefficient for large data")

print("\n" + "="*80)
print("RECOMMENDATIONS:")
print("="*80)
print("• Use Parquet (Snappy) for ML training data (fast reads)")
print("• Use Parquet (Gzip) for archival/cold storage (max compression)")
print("• Use JSONL for streaming ETL pipelines")
print("• Use CSV only for small datasets or human review")
print("="*80)

print("\n✅ All tasks completed successfully!")

## Summary

This notebook demonstrated:

1. **CSV**: Simple but large and slow
2. **Parquet**: Columnar, compressed, fast for analytics
3. **Compression**: Snappy (fast) vs Gzip (small) vs Zstd (balanced)
4. **Partitioning**: Faster filtered queries via partition pruning (filters on the partition columns let the reader skip whole directories)
5. **JSONL**: Streaming processing for large files
6. **Nested JSON**: Flattening hierarchical data
7. **Chunked Processing**: Handle files larger than memory

**Key Takeaways:**
- Choose format based on use case (size, speed, readability)
- Parquet is best for ML workflows
- Always benchmark before committing to a format
- Use partitioning for frequently filtered columns