**Imports**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import simplejson
import matplotlib.pyplot as plt
from pathlib import Path
from typing import List, Dict, Tuple
from sklearn.linear_model import LinearRegression

**Code**

In [5]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run every cleaning stage on a copy of ``df`` and return the result.

    The caller's frame is copied first. The copy is then passed, in order,
    through ``clean_types``, ``clean_missing``, ``handle_outliers`` and
    ``clean_strings_and_dates``, and finally handed to ``validate_cleaned``.
    """
    cleaned = df.copy()
    for stage in (clean_types, clean_missing, handle_outliers, clean_strings_and_dates):
        cleaned = stage(cleaned)
    validate_cleaned(cleaned)

    # Progress message so an interactive run shows the pipeline finished.
    print("Data cleaning complete!")
    return cleaned

# Placeholder stages: each one hands its input straight back so that
# clean_data can be executed before the real logic is written.
def clean_types(df):
    """Placeholder: return ``df`` unchanged."""
    return df


def clean_missing(df):
    """Placeholder: return ``df`` unchanged."""
    return df


def handle_outliers(df):
    """Placeholder: return ``df`` unchanged."""
    return df


def clean_strings_and_dates(df):
    """Placeholder: return ``df`` unchanged."""
    return df


def validate_cleaned(df):
    """Placeholder: accept anything and return ``None``."""
    return None

# Example usage
sample_df = pd.DataFrame(
    data={
        "user_id": [1, 2, 3],
        "age": [25, None, 30],           # one missing age
        "income": [50000, 60000, None],  # one missing income
    }
)

# Run the pipeline, then show the result under a heading preceded by a blank line.
cleaned_df = clean_data(sample_df)
print("\nCleaned DataFrame:", cleaned_df, sep="\n")

Data cleaning complete!

Cleaned DataFrame:
   user_id   age   income
0        1  25.0  50000.0
1        2   NaN  60000.0
2        3  30.0      NaN


In [6]:
def clean_types(df):
    """
    Return a copy of ``df`` with the 'age' and 'income' columns made numeric.

    Values that cannot be parsed become NaN (``errors="coerce"``).
    The input frame is left unmodified.
    """
    converted = df.copy()
    for column in ("age", "income"):
        converted[column] = pd.to_numeric(converted[column], errors="coerce")
    return converted

def clean_missing(df):
    """
    Return a copy of ``df`` in which missing 'age' values are replaced
    by the median of the 'age' column. The input frame is not modified.
    """
    ages = df["age"]
    return df.assign(age=ages.fillna(ages.median()))

# Example usage
sample_df = pd.DataFrame(
    data={
        "user_id": [1, 2, 3],
        "age": [25, None, 30],
        "income": ["50000", "60000", None],  # strings, so clean_types has something to convert
    }
)

# Chain the two steps, keeping the intermediate frame for display.
df_types = clean_types(sample_df)
df_cleaned = clean_missing(df_types)

print("After clean_types:", df_types, sep="\n")
print("\nAfter clean_missing:", df_cleaned, sep="\n")

After clean_types:
   user_id   age   income
0        1  25.0  50000.0
1        2   NaN  60000.0
2        3  30.0      NaN

After clean_missing:
   user_id   age   income
0        1  25.0  50000.0
1        2  27.5  60000.0
2        3  30.0      NaN


In [7]:
import pandas as pd

# --------------------------
# Helper functions (outlier, string/date and feature steps are placeholders)
# --------------------------
def clean_types(df):
    """Return a new frame with 'age' and 'income' coerced to numeric.

    Unparseable values become NaN; the input frame is not modified.
    """
    return df.assign(
        age=pd.to_numeric(df["age"], errors="coerce"),
        income=pd.to_numeric(df["income"], errors="coerce"),
    )

def clean_missing(df):
    """Return a copy of ``df`` with NaN ages replaced by the median age."""
    filled = df.copy()
    median_age = filled["age"].median()
    filled["age"] = filled["age"].fillna(median_age)
    return filled

def handle_outliers(df):
    """Placeholder outlier step: print a progress line and return ``df`` as-is."""
    progress_line = "Handling outliers..."
    print(progress_line)
    return df

def clean_strings_and_dates(df):
    """Placeholder string/date step: print a progress line and return ``df`` as-is."""
    progress_line = "Cleaning strings and dates..."
    print(progress_line)
    return df

def add_features(df):
    """Placeholder feature step: print a progress line and return ``df`` as-is."""
    progress_line = "Adding derived features..."
    print(progress_line)
    return df

# --------------------------
# Main cleaning pipeline
# --------------------------
def clean_data(df):
    """
    Full data cleaning pipeline.

    Applies the stages below one after another, feeding each stage the
    frame returned by the previous one, and returns the final frame:

    1. clean_types              - types first
    2. clean_missing            - then missing values
    3. handle_outliers          - then outliers
    4. clean_strings_and_dates  - then strings/dates
    5. add_features             - finally derived features
    """
    stages = (
        clean_types,
        clean_missing,
        handle_outliers,
        clean_strings_and_dates,
        add_features,
    )
    result = df
    for stage in stages:
        result = stage(result)
    return result

# --------------------------
# Example usage
# --------------------------
sample_df = pd.DataFrame(
    data={
        "user_id": [1, 2, 3],
        "age": [25, None, 30],
        "income": ["50000", "60000", None],
        "city": ["new york", "NYC", "san francisco"],
    }
)

# Run the full pipeline and show the result under a heading preceded by a blank line.
cleaned_df = clean_data(sample_df)
print("\nCleaned DataFrame:", cleaned_df, sep="\n")

Handling outliers...
Cleaning strings and dates...
Adding derived features...

Cleaned DataFrame:
   user_id   age   income           city
0        1  25.0  50000.0       new york
1        2  27.5  60000.0            NYC
2        3  30.0      NaN  san francisco


In [8]:
# --------------------------
# Validation function
# --------------------------
def validate_cleaned(df):
    """
    Validate that the cleaned DataFrame meets basic quality checks.

    Checks:
    - no value in the 'age' column is negative (missing ages are not
      flagged by this check)
    - the 'income' column contains no missing values

    Parameters:
    - df: pd.DataFrame with 'age' and 'income' columns

    Raises:
    - AssertionError with a descriptive message when a check fails

    Prints a confirmation line when every check passes.
    """
    # Look for negatives directly rather than testing `min() >= 0`: on an
    # empty or all-NaN column min() is NaN and `NaN >= 0` is False, which
    # reported "Negative ages found" although no negative age existed.
    assert not (df["age"] < 0).any(), "Negative ages found"
    assert df["income"].notna().all(), "Income still has NaN"
    print("Validation passed ✅")

# --------------------------
# Logging function
# --------------------------
def log_summary(df, step_name):
    """
    Print a short report for one pipeline step: a header containing
    ``step_name``, the number of rows in ``df``, and how many values are
    missing in the 'age' and 'income' columns.
    """
    print(f"--- {step_name} ---")
    print(f"Rows: {len(df)}")
    for column in ("age", "income"):
        n_missing = df[column].isna().sum()
        print(f"Missing {column}: {n_missing}")

# --------------------------
# Example usage
# --------------------------
sample_df = pd.DataFrame({
    "age": [25, None, 30, -1],
    "income": [50000, 60000, None, 70000]
})

# Log before cleaning
log_summary(sample_df, "Before Cleaning")

# Floor negative ages at 0. clip() leaves NaN untouched, whereas the earlier
# `x if x >= 0 else 0` lambda also turned NaN into 0 (NaN >= 0 is False), so
# the median fill below never had a missing age left to fill.
sample_df["age"] = sample_df["age"].clip(lower=0)

# Fill missing age and income with the column median for demonstration
sample_df["age"] = sample_df["age"].fillna(sample_df["age"].median())
sample_df["income"] = sample_df["income"].fillna(sample_df["income"].median())

# Validate and log after cleaning
validate_cleaned(sample_df)
log_summary(sample_df, "After Cleaning")

--- Before Cleaning ---
Rows: 4
Missing age: 1
Missing income: 1
Validation passed ✅
--- After Cleaning ---
Rows: 4
Missing age: 0
Missing income: 0


In [9]:
def safe_to_numeric(s, col_name):
    """
    Convert a Series to numeric, coercing unparseable entries to NaN and
    printing a warning when any such entries exist.

    Parameters:
    - s: pd.Series to convert
    - col_name: str, column name shown in the warning message

    Returns:
    - pd.Series of numeric values (NaN where conversion failed)
    """
    numeric = pd.to_numeric(s, errors="coerce")

    # An entry is invalid when it was present in the input but is NaN after
    # coercion; values that were already missing are not counted.
    newly_missing = numeric.isna() & s.notna()
    n_invalid = newly_missing.sum()

    if n_invalid > 0:
        print(f"[WARN] {n_invalid} invalid values in '{col_name}'")

    return numeric

# --------------------------
# Example usage
# --------------------------
# One unparseable entry ("abc") and one missing entry (None).
sample_series = pd.Series(data=["10", "20", "abc", "30", None])
converted_series = safe_to_numeric(sample_series, "test_column")

print("\nConverted Series:", converted_series, sep="\n")

[WARN] 1 invalid values in 'test_column'

Converted Series:
0    10.0
1    20.0
2     NaN
3    30.0
4     NaN
dtype: float64
