[Reference](https://medium.com/@tubelwj/building-python-etl-data-pipelines-with-five-typical-cases-bcf130c27bfa)

# Cleaning and transforming CSV files

In [1]:
import pandas as pd

def clean_csv_data(input_file, output_file, min_price=0, max_price=1000):
    """
    Cleans and processes a CSV file and saves the cleaned data to a new file.

    Parameters:
    - input_file: str, path to the input CSV file
    - output_file: str, path to save the cleaned CSV file
    - min_price: number, lowest 'total_price' kept (inclusive), default 0
    - max_price: number, highest 'total_price' kept (inclusive), default 1000

    Raises:
    - KeyError: if the input file has no 'total_price' column
    """

    # Read the CSV file, skipping any malformed rows instead of failing on them
    df = pd.read_csv(input_file, on_bad_lines='skip')

    # Remove exact duplicate rows (the first occurrence is kept)
    df = df.drop_duplicates()

    # Fill missing values with per-column defaults.
    # NOTE: the mean is taken over every remaining row, including rows that the
    # range filter below removes afterwards, so extreme prices influence it.
    df = df.fillna({
        'total_price': df['total_price'].mean(),  # Fill missing 'total_price' with the mean value
        'product': 'ordinary product',            # Fill missing 'product' with 'ordinary product'
        'address': 'unknown',                     # Fill missing 'address' with 'unknown'
    })

    # Keep only rows whose price lies inside [min_price, max_price];
    # rows whose price is still missing are dropped here as well
    df = df[df['total_price'].between(min_price, max_price)]

    # Save the cleaned data to the specified output file, without the index column
    df.to_csv(output_file, index=False)

# Integrating Data from Multiple Sources

In [2]:
import requests
import pandas as pd

def integrate_multiple_sources(
    api_url='https://api.example.com/online_order_data',
    excel_path='local_orders_data.xlsx',
    engine=None,
    timeout=30,
):
    """
    Integrates data from multiple sources: API, Excel file, and database.

    Steps:
    1. Fetch data from a web API.
    2. Read data from a local Excel file.
    3. Query data from a database.
    4. Combine the data into a single DataFrame.
    5. Remove duplicate entries based on 'order_id'.

    Parameters:
    - api_url: str, endpoint returning the order data as JSON
    - excel_path: str, path to the Excel workbook to read
    - engine: database connection/engine passed to pandas.read_sql; when None,
      the module-level name `db_engine` is used (it is not defined in the
      visible part of this file and must be provided elsewhere)
    - timeout: number of seconds to wait for the API before giving up

    Returns:
    - DataFrame with the rows of all three sources, keeping the first row seen
      for each 'order_id' (API rows first, then Excel, then database)

    Raises:
    - requests.HTTPError: if the API answers with an error status
    - requests.Timeout: if the API does not answer within `timeout` seconds
    """

    # Fetch data from the web API. Without a timeout requests waits forever,
    # and without the status check an error body would be parsed as order data.
    response = requests.get(api_url, timeout=timeout)
    response.raise_for_status()
    api_data = pd.DataFrame(response.json())

    # Read data from a local Excel file
    excel_data = pd.read_excel(excel_path)

    # Query data from a database
    if engine is None:
        engine = db_engine
    db_data = pd.read_sql('SELECT * FROM orders', engine)

    # Combine all sources into a single DataFrame with a fresh 0..n-1 index
    combined_data = pd.concat(
        [api_data, excel_data, db_data],
        ignore_index=True
    )

    # Remove duplicates based on 'order_id'
    combined_data = combined_data.drop_duplicates(subset=['order_id'])

    return combined_data

# Real-Time Data Stream Processing

In [4]:
!pip install kafka

Collecting kafka
  Downloading kafka-1.3.5-py2.py3-none-any.whl.metadata (6.9 kB)
Downloading kafka-1.3.5-py2.py3-none-any.whl (207 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.2/207.2 kB 11.9 MB/s eta 0:00:00
Installing collected packages: kafka
Successfully installed kafka-1.3.5


In [6]:
from kafka import KafkaConsumer
import json

def process_stream_data():
    """
    Consumes order messages from Kafka, reshapes each one and stores it.

    Steps:
    1. Subscribe to the 'order_topic' topic on the broker at localhost:9092.
    2. Decode every message value as UTF-8 and parse it as JSON.
    3. Rename the fields 'id', 'amt' and 'ts' and convert 'amt' to float.
    4. Hand the resulting record to save_to_database.

    The loop runs for as long as the consumer keeps yielding messages.
    """

    def _parse_json(raw_value):
        # The raw message value is decoded as UTF-8 text, then parsed as JSON
        return json.loads(raw_value.decode('utf-8'))

    order_stream = KafkaConsumer(
        'order_topic',
        bootstrap_servers=['localhost:9092'],
        value_deserializer=_parse_json,
    )

    for record in order_stream:
        payload = record.value  # Already deserialized by _parse_json

        # Pull the fields out in a fixed order: id, amount, timestamp
        order_id = payload['id']
        order_amount = float(payload['amt'])
        timestamp = payload['ts']

        # Persist the record under its target field names
        save_to_database({
            'order_id': order_id,
            'order_amount': order_amount,
            'timestamp': timestamp,
        })

# Database Incremental Sync

In [7]:
from sqlalchemy import create_engine
import pandas as pd

def sync_incremental_data(chunk_size=100):
    """
    Synchronizes incremental data from a source database to a target database.

    Steps:
    1. Connect to the source and target databases.
    2. Retrieve the last synchronization timestamp.
    3. Query only the new or updated data since the last sync.
    4. Process the data in chunks and insert it into the target database.

    Parameters:
    - chunk_size: int, number of rows read and written per batch, default 100
    """

    # Imported here so the function carries its own dependency on `text`
    from sqlalchemy import text

    # Connect to the source database (MySQL)
    source = create_engine('mysql://user:pass@localhost/source_db')

    # Connect to the target database (PostgreSQL)
    target = create_engine('postgresql://user:pass@localhost/target_db')

    try:
        # Retrieve the last synchronization timestamp
        last_sync = get_last_sync_time()

        # Fetch only the incremental data. The timestamp is passed as a bound
        # parameter so the driver handles quoting and type conversion, instead
        # of being formatted into the SQL text.
        query = text("SELECT * FROM orders WHERE update_time > :last_sync")

        # Read data in chunks from the source database so the whole result set
        # is never held in memory at once
        chunks = pd.read_sql(
            query,
            source,
            params={'last_sync': last_sync},
            chunksize=chunk_size,
        )

        for chunk in chunks:
            # Append each chunk to the 'orders' table in the target database
            chunk.to_sql('orders', target, if_exists='append', index=False)
    finally:
        # Release the connection pools of both engines, even after a failure
        source.dispose()
        target.dispose()

# Exporting Data into Reports

In [8]:
import pandas as pd
import matplotlib.pyplot as plt

def generate_report(df):
    """
    Generates a report summarizing sales and quantity data, creates an Excel report,
    and saves a bar chart visualization.

    Parameters:
    - df (DataFrame): Input data with the columns 'category', 'order_amount'
      and 'quantity'.

    Outputs:
    - 'report.xlsx': workbook with one sheet, 'Summary'
    - 'order_amount_chart.png': bar chart of total order amount per category

    Raises:
    - KeyError: if one of the required columns is missing from df
    """

    # Aggregate data: total and average order amount, and total quantity per category.
    # The key must be 'order_amount' so it matches the column read for the chart below.
    summary = (
        df.groupby('category')
        .agg({
            'order_amount': ['sum', 'mean'],  # Calculate total and average sales
            'quantity': 'sum'                 # Calculate total quantity
        })
        .round(2)  # Round values to 2 decimal places
    )

    # Create an Excel report with the summary data
    with pd.ExcelWriter('report.xlsx') as writer:
        summary.to_excel(writer, sheet_name='Summary')  # Write summary to the 'Summary' sheet

    # Generate a bar chart for total sales per category
    fig = plt.figure(figsize=(12, 8))  # Set the figure size
    try:
        # The aggregated columns are a two-level index: (column, statistic)
        totals = summary[('order_amount', 'sum')]
        totals.plot(kind='bar', title='Total order_amount by Category')  # Create a bar chart
        plt.xlabel('Category')  # Label the x-axis
        plt.ylabel('Total Sales')  # Label the y-axis
        plt.tight_layout()  # Adjust layout for better appearance
        plt.savefig('order_amount_chart.png')  # Save the chart as a PNG file
    finally:
        # Close the figure so repeated calls do not accumulate open figures
        plt.close(fig)