[Reference](https://python.plainenglish.io/how-i-automated-data-cleaning-in-python-using-functions-and-pipelines-95b8ad0f6ea5)

# Step 1: Automating Missing Values Handling with Python Functions

In [1]:
import pandas as pd

# Define a reusable function to handle missing values
def handle_missing_values(df, method='mean', fill_value=None):
    """Return a new DataFrame with the missing values of ``df`` handled.

    The input frame is never modified; every branch returns a new object.

    Args:
        df: DataFrame to clean.
        method: Strategy to apply.
            'drop' removes every row that contains a missing value.
            'fill' replaces every missing value with ``fill_value``.
            'mean' fills gaps in numeric columns with that column's mean;
            non-numeric columns are left untouched.
        fill_value: Replacement used by ``method='fill'``; ignored otherwise.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: If ``method`` is not one of the strategies above, or if
            ``method='fill'`` is requested without a ``fill_value``.
    """
    if method == 'drop':
        return df.dropna()

    if method == 'fill':
        if fill_value is None:
            # Fail with a message that names the real problem instead of
            # relying on whatever pandas reports for a missing fill value.
            raise ValueError("fill_value must be provided when method='fill'")
        return df.fillna(fill_value)

    if method == 'mean':
        # Copy first: assigning into `df` itself would silently rewrite the
        # caller's data, unlike the 'drop' and 'fill' branches.
        result = df.copy()
        numeric_cols = result.select_dtypes(include='number').columns
        if len(numeric_cols) > 0:
            column_means = result[numeric_cols].mean()
            result[numeric_cols] = result[numeric_cols].fillna(column_means)
        return result

    raise ValueError("Invalid method provided")

# Sample records with one gap in each of the three columns
data = dict(
    Name=['Alice', 'Bob', None, 'David'],
    Age=[25, None, 30, 22],
    Salary=[50000, 60000, None, 45000],
)
df = pd.DataFrame(data)

# Impute the numeric gaps with their column means and show the result
cleaned_df = handle_missing_values(df, method='mean')
print(cleaned_df)

    Name        Age        Salary
0  Alice  25.000000  50000.000000
1    Bob  25.666667  60000.000000
2   None  30.000000  51666.666667
3  David  22.000000  45000.000000


# Step 2: Removing Duplicates Efficiently

In [2]:
# Define a function to remove duplicates based on specific columns
def remove_duplicates(df, subset=None):
    """Return the rows of ``df`` with repeated rows removed.

    ``subset`` optionally restricts the comparison to the given column
    label(s); with ``None`` all columns are compared. The first occurrence
    of each repeated row is the one that is kept.
    """
    # Mark every row that repeats an earlier one, then keep the rest.
    is_repeat = df.duplicated(subset=subset)
    return df[~is_repeat]

# Sample records in which the 'Alice' row appears twice
data = dict(
    Name=['Alice', 'Bob', 'Alice', 'David'],
    Age=[25, 30, 25, 22],
    Salary=[50000, 60000, 50000, 45000],
)
df = pd.DataFrame(data)

# De-duplicate by comparing the 'Name' column only, then show the result
cleaned_df = remove_duplicates(df, subset=['Name'])
print(cleaned_df)

    Name  Age  Salary
0  Alice   25   50000
1    Bob   30   60000
3  David   22   45000


# Step 3: Transforming Data Types in a Pipeline

In [3]:
# Define a function to transform data types
def transform_data_types(df, col_types):
    """Return a new DataFrame with the listed columns cast to new dtypes.

    The input frame is left untouched, and the conversion is all-or-nothing:
    if any cast fails, no partially converted frame is produced.

    Args:
        df: DataFrame whose columns should be converted.
        col_types: Mapping of column label -> target dtype, e.g.
            ``{'Age': 'int', 'Salary': 'float'}``. Columns that are not
            listed keep their current dtype.

    Returns:
        A new DataFrame with the requested dtypes applied.

    Raises:
        KeyError: If a key of ``col_types`` is not a column of ``df``.
        ValueError / TypeError: Propagated from pandas when a value cannot
            be converted to the requested dtype.
    """
    # DataFrame.astype accepts the column -> dtype mapping directly and
    # returns a new frame, so the caller's data is never written to.
    return df.astype(col_types)

# Sample records whose numeric fields arrive as text
data = dict(
    Name=['Alice', 'Bob', 'David'],
    Age=['25', '30', '22'],
    Salary=['50000', '60000', '45000'],
)
df = pd.DataFrame(data)

# Target dtype for each column that needs converting
col_types = dict(Age='int', Salary='float')

# Convert, then confirm the resulting column dtypes
cleaned_df = transform_data_types(df, col_types)
print(cleaned_df.dtypes)

Name       object
Age         int64
Salary    float64
dtype: object


# Step 4: Building an Automated Data Cleaning Pipeline

In [5]:
# Build a complete data cleaning pipeline
def data_cleaning_pipeline(df, missing_values_method='mean', fill_value=None, subset=None, col_types=None):
    """Clean ``df`` in three steps: missing values, duplicates, dtype casts.

    The pipeline works on a copy, so the caller's frame is not modified.

    Columns whose target dtype in ``col_types`` is an integer or float type
    are parsed to numbers *before* the missing-value step. Without this, a
    numeric column stored as text (e.g. ``['25', None, '30']``) is invisible
    to mean imputation and the final cast to ``int`` fails on the remaining
    ``None``.

    Note: because parsing happens first, duplicate detection compares such
    columns by numeric value, and a mean-imputed value is truncated when the
    column is finally cast to an integer dtype.

    Args:
        df: DataFrame to clean.
        missing_values_method: Strategy passed to ``handle_missing_values``
            ('drop', 'fill' or 'mean').
        fill_value: Replacement value used when the strategy is 'fill'.
        subset: Column label(s) used to detect duplicate rows; ``None``
            compares all columns.
        col_types: Optional mapping of column label -> target dtype. When
            empty or ``None`` no dtype conversion is performed.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If ``col_types`` names a column that is not in ``df``.
        ValueError: If the missing-value strategy is invalid, or a value
            cannot be parsed or cast to the requested dtype.
    """
    # Private copy: none of the steps below may write into the caller's data.
    df = df.copy()

    # Parse numeric-target columns up front so their gaps become NaN and the
    # missing-value step can see and handle them.
    if col_types:
        for col, dtype in col_types.items():
            wants_number = (
                pd.api.types.is_integer_dtype(dtype)
                or pd.api.types.is_float_dtype(dtype)
            )
            if wants_number:
                df[col] = pd.to_numeric(df[col])

    # Handle missing values
    df = handle_missing_values(df, method=missing_values_method, fill_value=fill_value)

    # Remove duplicates
    df = remove_duplicates(df, subset=subset)

    # Apply the exact requested dtypes last, once no gaps remain to block
    # an integer cast.
    if col_types:
        df = transform_data_types(df, col_types)

    return df

# Sample records combining all three problems: a gap in every column,
# Age stored as text, and 'Alice' listed twice
data = dict(
    Name=['Alice', 'Bob', None, 'Alice'],
    Age=['25', None, '30', '22'],
    Salary=[50000, 60000, None, 50000],
)

df = pd.DataFrame(data)

# Target dtypes, then a single pipeline call: mean strategy for missing
# values, duplicates detected on 'Name', and the dtype mapping below
col_types = dict(Age='int', Salary='float')
cleaned_df = data_cleaning_pipeline(
    df,
    missing_values_method='mean',
    subset=['Name'],
    col_types=col_types,
)
print(cleaned_df)