### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
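1. Define the column metadata (expected type, required flag, allowed value range)
2. Load the data into a DataFrame
3. Validate each row of the data against the metadata rules
4. Show the rows that pass validation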


In [1]:
import pandas as pd
# Per-column quality rules: the expected type, whether a value must be
# present, and (for numeric columns) the inclusive bounds it must fall in.
metadata = dict(
    columns=dict(
        name=dict(type='string', required=True),
        age=dict(type='integer', required=True, min=0, max=120),
        salary=dict(type='float', required=True, min=0.0),
    ),
)

# Sample records, stored column-wise. Several entries deliberately break
# the rules above: a missing name, a missing salary, and two ages
# (-5 and 150) that fall outside the 0..120 range.
data = dict(
    name=['Alice', 'Bob', 'Charlie', None, 'David'],
    age=[30, 25, -5, 22, 150],
    salary=[50000.0, 60000.0, None, 45000.0, 70000.0],
)

df = pd.DataFrame(data=data)
def validate_data(df, metadata):
    """Return the rows of ``df`` that satisfy every column rule in ``metadata``.

    Args:
        df: DataFrame to validate. It must contain every column named in
            ``metadata['columns']``; extra columns are kept but not checked.
        metadata: Mapping with a ``'columns'`` key whose value maps column
            names to rule dicts. Recognised rule keys:

            * ``'type'``: ``'string'``, ``'integer'`` or ``'float'``; any
              other (or missing) type skips the type check.
            * ``'required'``: when true, null values are invalid
              (defaults to ``False``).
            * ``'min'`` / ``'max'``: inclusive bounds, applied to both
              ``'integer'`` and ``'float'`` columns.

    Returns:
        A DataFrame containing only the valid rows, keeping the original
        index labels, columns and dtypes. When no row is valid the result
        is empty but still carries the original columns.

    Raises:
        KeyError: If a column listed in the metadata is missing from ``df``.
    """
    import numbers  # local import: nothing else in the file needs it

    def _is_valid(value, rules):
        """Check one cell value against one column's rule dict."""
        if pd.isnull(value):
            # A missing value is a problem only for required columns;
            # type and range checks are meaningless for it.
            return not rules.get('required', False)

        expected = rules.get('type')
        if expected == 'string':
            return isinstance(value, str)
        if expected == 'integer':
            type_ok = isinstance(value, numbers.Integral)
        elif expected == 'float':
            type_ok = (isinstance(value, numbers.Real)
                       and not isinstance(value, numbers.Integral))
        else:
            return True

        # bool is a subclass of int, but True/False are not numeric data.
        if not type_ok or isinstance(value, bool):
            return False

        lower = rules.get('min', float('-inf'))
        upper = rules.get('max', float('inf'))
        return lower <= value <= upper

    # Validate column by column rather than with iterrows(): iterrows()
    # builds one Series per row, which upcasts mixed numeric columns to
    # float and would make every integer fail its type check.
    keep = [True] * len(df)
    for column, rules in metadata['columns'].items():
        for pos, value in enumerate(df[column]):
            if keep[pos] and not _is_valid(value, rules):
                keep[pos] = False

    # Positional selection keeps columns and dtypes even when nothing passes,
    # and is unaffected by duplicate index labels.
    return df.iloc[[pos for pos, ok in enumerate(keep) if ok]]
# Filter the sample frame down to the rows that satisfy the metadata rules,
# then print them under a heading.
valid_data = validate_data(df=df, metadata=metadata)
print("Valid Data:", valid_data, sep="\n")


Valid Data:
    name  age   salary
0  Alice   30  50000.0
1    Bob   25  60000.0
