[Reference](https://medium.com/@samzamany/unit-testing-in-data-engineering-a-practical-guide-91196afdf32a)

In [1]:
import pandas as pd
from sqlalchemy import create_engine

# Function to load the CSV
def load_data(file_name):
    """Read the CSV at ``file_name`` and return it as a pandas DataFrame.

    Args:
        file_name: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The parsed DataFrame, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_name)

# Function to clean the data
def clean_data(data):
    """Return ``data`` without the rows that contain any missing value.

    The input frame is left untouched; ``dropna`` produces a new DataFrame
    that keeps the original index labels of the surviving rows.
    """
    return data.dropna()

# Function to save the data to a SQL database
def save_data(data, db_string, table_name):
    """Write ``data`` to the table ``table_name`` of the database at ``db_string``.

    Args:
        data: DataFrame to persist. Its index is written as well, because
            ``to_sql`` is called with the pandas default ``index=True``.
        db_string: SQLAlchemy database URL, e.g. ``'sqlite:///database.db'``.
        table_name: Destination table; an existing table of that name is
            replaced.
    """
    engine = create_engine(db_string)
    try:
        data.to_sql(table_name, engine, if_exists='replace')
    finally:
        # Release the engine's pooled connections even if the write fails,
        # so the database file is not left open by this call.
        engine.dispose()

# Run pipeline: load the CSV, drop incomplete rows, then persist to SQLite
data = clean_data(load_data('data.csv'))
save_data(data, db_string='sqlite:///database.db', table_name='my_table')

In [2]:
import os
import pandas as pd
import pytest
from sqlalchemy import create_engine, inspect

# Use pytest fixtures to set up a temporary CSV file and SQLite database
@pytest.fixture
def csv_file(tmp_path):
    """Write a three-row sample CSV into ``tmp_path`` and return its path.

    The sample has ``name`` and ``age`` columns; one age is missing so that
    the cleaning step has a row to drop.
    """
    target = tmp_path / "data.csv"
    sample = pd.DataFrame({
        'name': ['John', 'Jane', 'Doe'],
        'age': [34, None, 56],  # Jane's age is missing
    })
    sample.to_csv(target, index=False)
    return target


@pytest.fixture
def sqlite_db(tmp_path):
    """Return a SQLite URL pointing at ``database.db`` inside ``tmp_path``."""
    db_path = tmp_path / "database.db"
    return f'sqlite:///{db_path}'


def test_load_data(csv_file):
    """Loading the sample CSV yields both columns and all three rows."""
    loaded = load_data(csv_file)

    for column in ('name', 'age'):
        assert column in loaded.columns
    assert len(loaded) == 3


def test_clean_data(csv_file):
    """Cleaning the sample data removes the one row with a missing age."""
    cleaned = clean_data(load_data(csv_file))

    assert not cleaned['age'].isna().any()
    assert len(cleaned) == 2  # Jane's record should be removed


def test_save_data(csv_file, sqlite_db):
    """Saving the cleaned sample data creates ``my_table`` with the two complete rows."""
    data = clean_data(load_data(csv_file))
    save_data(data, sqlite_db, 'my_table')

    # Read the table back through a separate engine to check what was persisted.
    engine = create_engine(sqlite_db)
    try:
        assert 'my_table' in inspect(engine).get_table_names()
        loaded_data = pd.read_sql('my_table', engine)
    finally:
        # Close pooled connections so the temporary database file is released.
        engine.dispose()

    # Only John and Doe's records should be present
    assert len(loaded_data) == 2
    assert set(loaded_data['name']) == {'John', 'Doe'}

In [3]:
import pandas as pd
import pytest

def test_convert_date():
    """convert_date yields a datetime column for valid dates and raises ValueError for invalid ones."""
    # Valid dates: the column must come back as datetimes with matching values
    valid_dates = ['2021-01-01', '2021-01-02']
    frame = pd.DataFrame({'date': valid_dates})

    result = convert_date(frame.copy(), 'date')

    assert pd.api.types.is_datetime64_any_dtype(result['date'])
    for row, raw in enumerate(valid_dates):
        assert result.loc[row, 'date'] == pd.Timestamp(raw)

    # Invalid date: there is no 13th month, so conversion must fail
    bad_frame = pd.DataFrame({'date': ['2021-13-01']})

    with pytest.raises(ValueError):
        convert_date(bad_frame, 'date')

In [4]:
import pandas as pd
import pytest

def test_aggregate_sales():
    """aggregate_sales returns one summed ``sales`` value per ``region``."""

    def sales_for(frame, region):
        # First 'sales' value among the rows belonging to the given region.
        return frame.loc[frame['region'] == region, 'sales'].values[0]

    # Two sales per region: each total is the sum of the pair
    sales = pd.DataFrame({
        'region': ['North', 'North', 'South', 'South', 'East', 'East', 'West', 'West'],
        'sales': [100, 200, 300, 400, 500, 600, 700, 800],
    })
    expected_totals = {'North': 300, 'South': 700, 'East': 1100, 'West': 1500}

    totals = aggregate_sales(sales)

    for region, expected in expected_totals.items():
        assert sales_for(totals, region) == expected

    # Regions with zero sales must still be reported, with a total of 0
    regions = ['North', 'South', 'East', 'West']
    no_sales = pd.DataFrame({
        'region': regions,
        'sales': [0, 0, 0, 0],
    })

    totals = aggregate_sales(no_sales)

    for region in regions:
        assert sales_for(totals, region) == 0