In [26]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

print(" Starting Data Cleaning Notebook...")


# ---------------------------
# Load dataset from Notebook 01 or fallback to CSV
# ---------------------------

try:
    current_df
    print(" Using DataFrame from previous notebook.")
except NameError:
    print("⚠️ DataFrame not found. Loading from raw CSV...")
    fallback_path = "../data/sample.csv"
    current_df = pd.read_csv(fallback_path)
    print(f" Loaded fallback dataset from: {fallback_path}")


# ---------------------------
# Load schema from Notebook 02
# ---------------------------

try:
    full_schema
    print(" Using schema from previous notebook.")
except NameError:
    print(" Schema not found!")
    print("Please run Notebook 02 (schema detector) first.")


 Starting Data Cleaning Notebook...
 Using DataFrame from previous notebook.
 Using schema from previous notebook.


In [27]:
# If schema is not available, regenerate it using Notebook 02 logic

try:
    full_schema
    print(" Schema loaded from memory.")
except NameError:
    print(" Schema not found. Regenerating schema...")

    # Import schema detection functions dynamically
    try:
        # If we're running notebooks in order, these functions exist
        schema = detect_column_types(current_df)
        id_cols = detect_id_columns(current_df)
        target_cols = detect_target_column(current_df, id_cols)
        low_var_cols = detect_low_variance_columns(current_df)

        full_schema = build_schema_object(
            current_df,
            schema,
            id_cols,
            target_cols,
            low_var_cols
        )

        print(" Schema regenerated successfully!")

    except Exception as e:
        print(" Failed to regenerate schema:")
        print(e)
        print("Please run Notebook 02 before this.")


 Schema loaded from memory.


In [28]:
# Re-import schema detection functions from Notebook 02 
# (temporary duplication until Phase 2 refactor)
# ------------------------------------------------------------

def detect_column_types(df):
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    boolean_cols = df.select_dtypes(include=['bool']).columns.tolist()

    # Safe datetime detection (only check string columns)
    datetime_cols = []
    for col in categorical_cols:
        try:
            pd.to_datetime(df[col], errors='raise')
            datetime_cols.append(col)
        except:
            pass

    # remove datetime from categoricals
    categorical_cols = [c for c in categorical_cols if c not in datetime_cols]

    return {
        "numeric": numeric_cols,
        "categorical": categorical_cols,
        "boolean": boolean_cols,
        "datetime": datetime_cols
    }


def detect_id_columns(df):
    n_rows = len(df)
    id_cols = []

    for col in df.columns:
        if df[col].nunique() == n_rows:
            id_cols.append(col)
        elif "id" in col.lower():
            id_cols.append(col)

    return id_cols


def detect_target_column(df, id_cols):
    candidates = []

    for col in df.columns:
        if col in id_cols:
            continue
        if df[col].dtype == 'object' and df[col].nunique() <= 30:
            candidates.append(col)

    if len(candidates) == 0:
        candidates.append(df.columns[-1])

    return candidates


def detect_low_variance_columns(df, threshold=0.99):
    low_var_cols = []
    for col in df.columns:
        top_freq = df[col].value_counts(normalize=True).max()
        if top_freq >= threshold:
            low_var_cols.append(col)
    return low_var_cols


def build_schema_object(df, schema, id_cols, target_cols, low_var_cols):
    return {
        "numeric_columns": schema["numeric"],
        "categorical_columns": schema["categorical"],
        "boolean_columns": schema["boolean"],
        "datetime_columns": schema["datetime"],
        "id_columns": id_cols,
        "target_candidates": target_cols,
        "low_variance_columns": low_var_cols,
        "n_rows": df.shape[0],
        "n_columns": df.shape[1]
    }

print(" Schema detection functions loaded.")


 Schema detection functions loaded.


In [29]:
# Now regenerate schema using the local functions

print(" Regenerating schema using local functions...")

try:
    schema = detect_column_types(current_df)
    id_cols = detect_id_columns(current_df)
    target_cols = detect_target_column(current_df, id_cols)
    low_var_cols = detect_low_variance_columns(current_df)

    full_schema = build_schema_object(
        current_df,
        schema,
        id_cols,
        target_cols,
        low_var_cols
    )

    print(" Schema regenerated successfully!")
    full_schema

except Exception as e:
    print(" Error generating schema:", e)


 Regenerating schema using local functions...
 Schema regenerated successfully!


In [30]:

# Missing Value Imputation (Numeric + Categorical)
# ----------------------------------------------

def impute_missing_values(df, schema):
    """
    Imputes missing values using:
    - median for numeric
    - mode for categorical
    """

    df_cleaned = df.copy()

    numeric_cols = schema["numeric_columns"]
    categorical_cols = schema["categorical_columns"]
    id_cols = schema["id_columns"]

    print(" Starting missing value imputation...")

    # --- Numeric columns: median imputation ---
    for col in numeric_cols:
        if col in id_cols:
            continue  # skip ID columns

        missing_count = df_cleaned[col].isna().sum()

        if missing_count > 0:
            median_value = df_cleaned[col].median()
            df_cleaned[col].fillna(median_value, inplace=True)
            print(f"✔ Filled {missing_count} missing values in '{col}' using median = {median_value}")

    # --- Categorical columns: mode imputation ---
    for col in categorical_cols:
        missing_count = df_cleaned[col].isna().sum()

        if missing_count > 0:
            mode_value = df_cleaned[col].mode()[0]
            df_cleaned[col].fillna(mode_value, inplace=True)
            print(f"✔ Filled {missing_count} missing values in '{col}' using mode = {mode_value}")

    print(" Missing value imputation complete.")
    return df_cleaned


# Run imputation
clean_step_1 = impute_missing_values(current_df, full_schema)
clean_step_1.head()


 Starting missing value imputation...
 Missing value imputation complete.


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [31]:

# Outlier Detection & Handling (IQR + Capping)
# ----------------------------------------------

def handle_outliers(df, schema):
    """
    Detects and caps outliers using the IQR method for numeric columns.
    """

    df_cleaned = df.copy()
    numeric_cols = schema["numeric_columns"]
    id_cols = schema["id_columns"]

    print(" Starting outlier detection & handling...")

    for col in numeric_cols:

        # Skip ID columns (never treat them for outliers)
        if col in id_cols:
            continue

        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR

        # Count outliers
        outliers = df_cleaned[(df_cleaned[col] < lower) | (df_cleaned[col] > upper)][col].count()

        if outliers > 0:
            print(f" {outliers} outliers detected in '{col}'. Applying capping...")
            df_cleaned[col] = df_cleaned[col].clip(lower, upper)

    print(" Outlier handling complete.")
    return df_cleaned


# Apply outlier handling on the imputed dataset
clean_step_2 = handle_outliers(clean_step_1, full_schema)
clean_step_2.head()


 Starting outlier detection & handling...
 Outlier handling complete.


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [32]:

# Datatype Fixing & Category Standardization
# -----------------------------------------------------------

def fix_datatypes(df, schema):
    """
    Cleans datatypes:
    - Converts numeric-like strings into numeric dtype
    - Standardizes categorical text (strip, title-case)
    - Converts valid datetime columns
    """

    df_cleaned = df.copy()

    numeric_cols = schema["numeric_columns"]
    categorical_cols = schema["categorical_columns"]
    datetime_cols = schema["datetime_columns"]
    id_cols = schema["id_columns"]

    print(" Fixing datatypes & standardizing categories...")

    # --- Fix numeric data stored as strings ---
    for col in numeric_cols:
        if col in id_cols:
            continue  # never touch ID columns

        # Try converting to numeric
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

    # --- Clean categorical text ---
    for col in categorical_cols:
        df_cleaned[col] = (
            df_cleaned[col]
            .astype(str)           # convert everything to string
            .str.strip()           # remove trailing spaces
            .str.replace("_", " ") # replace underscores
            .str.title()           # title case ("iris setosa" -> "Iris Setosa")
        )

    # --- Convert datetime columns ---
    for col in datetime_cols:
        try:
            df_cleaned[col] = pd.to_datetime(df_cleaned[col], errors='coerce')
        except:
            pass  # leave it as is if conversion completely fails

    print(" Datatype cleaning complete.")
    return df_cleaned


# Apply datatype cleaning
clean_step_3 = fix_datatypes(clean_step_2, full_schema)
clean_step_3.head()


 Fixing datatypes & standardizing categories...
 Datatype cleaning complete.


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-Setosa
1,2,4.9,3.0,1.4,0.2,Iris-Setosa
2,3,4.7,3.2,1.3,0.2,Iris-Setosa
3,4,4.6,3.1,1.5,0.2,Iris-Setosa
4,5,5.0,3.6,1.4,0.2,Iris-Setosa


In [33]:
def detect_low_variance_columns_fixed(df, schema, threshold=0.99):
    categorical_cols = schema["categorical_columns"]
    low_var_cols = []

    for col in categorical_cols:
        top_freq = df[col].value_counts(normalize=True).max()
        if top_freq >= threshold:
            low_var_cols.append(col)

    return low_var_cols

# -----------------------------------------------------------
# Drop ID Columns + Low-Variance Columns (Corrected)
# -----------------------------------------------------------

def drop_useless_columns(df, schema):
    """
    Removes:
    - ID columns
    - Low-variance columns
    """
    
    df_cleaned = df.copy()
    
    id_cols = schema.get("id_columns", [])
    low_var_cols = schema.get("low_variance_columns", [])
    
    cols_to_drop = id_cols + low_var_cols

    print("Columns to drop:", cols_to_drop)

    df_cleaned = df_cleaned.drop(columns=cols_to_drop, errors='ignore')

    print("Dropping useless columns complete.")
    return df_cleaned



print(" Rebuilding schema cleanly...")

# Step 1 — detect types
schema_types = detect_column_types(current_df)

# Step 2 — detect ID columns
id_cols = detect_id_columns(current_df)

# Step 3 — detect target column
target_cols = detect_target_column(current_df, id_cols)

# Step 4 — detect low-variance columns using FIXED method
low_var_cols = detect_low_variance_columns_fixed(current_df, {
    "categorical_columns": schema_types["categorical"]
})

# Step 5 — build a fully new schema object
full_schema = build_schema_object(
    current_df,
    schema_types,
    id_cols,
    target_cols,
    low_var_cols
)

print(" Schema rebuilt.")
full_schema


clean_step_4 = drop_useless_columns(clean_step_3, full_schema)
clean_step_4.head()



 Rebuilding schema cleanly...
 Schema rebuilt.
Columns to drop: ['Id', 'SepalWidthCm', 'PetalWidthCm']
Dropping useless columns complete.


Unnamed: 0,SepalLengthCm,PetalLengthCm,Species
0,5.1,1.4,Iris-Setosa
1,4.9,1.4,Iris-Setosa
2,4.7,1.3,Iris-Setosa
3,4.6,1.5,Iris-Setosa
4,5.0,1.4,Iris-Setosa


In [34]:
print("Value Counts — SepalWidthCm:")
print(current_df["SepalWidthCm"].value_counts(normalize=True).head(10))

print("\nValue Counts — PetalWidthCm:")
print(current_df["PetalWidthCm"].value_counts(normalize=True).head(10))

print("Unique values in SepalWidthCm:", current_df["SepalWidthCm"].nunique())
print("Unique values in PetalWidthCm:", current_df["PetalWidthCm"].nunique())

print("Stats — SepalWidthCm before and after")
print(clean_step_1["SepalWidthCm"].describe())
print(clean_step_2["SepalWidthCm"].describe())

print("\nStats — PetalWidthCm before and after")
print(clean_step_1["PetalWidthCm"].describe())
print(clean_step_2["PetalWidthCm"].describe())

print(current_df.dtypes)



Value Counts — SepalWidthCm:
SepalWidthCm
3.0    0.173333
2.8    0.093333
3.2    0.086667
3.4    0.080000
3.1    0.080000
2.9    0.066667
2.7    0.060000
2.5    0.053333
3.3    0.040000
3.5    0.040000
Name: proportion, dtype: float64

Value Counts — PetalWidthCm:
PetalWidthCm
0.2    0.186667
1.3    0.086667
1.5    0.080000
1.8    0.080000
1.4    0.053333
2.3    0.053333
0.4    0.046667
1.0    0.046667
0.3    0.046667
0.1    0.040000
Name: proportion, dtype: float64
Unique values in SepalWidthCm: 23
Unique values in PetalWidthCm: 22
Stats — SepalWidthCm before and after
count    150.000000
mean       3.054000
std        0.433594
min        2.000000
25%        2.800000
50%        3.000000
75%        3.300000
max        4.400000
Name: SepalWidthCm, dtype: float64
count    150.000000
mean       3.054000
std        0.433594
min        2.000000
25%        2.800000
50%        3.000000
75%        3.300000
max        4.400000
Name: SepalWidthCm, dtype: float64

Stats — PetalWidthCm before and 

In [35]:
print(" Rebuilding schema with correct logic...")

# Step 1 — detect column types (correct)
schema_types = detect_column_types(current_df)

# Step 2 — detect ID columns
id_cols = detect_id_columns(current_df)

# Step 3 — detect target candidates
target_cols = detect_target_column(current_df, id_cols)

# Step 4 — detect low-variance columns using FIXED method
low_var_cols = detect_low_variance_columns_fixed(
    current_df,
    schema={"categorical_columns": schema_types["categorical"]}
)

# Step 5 — rebuild full schema object PROPERLY
full_schema = {
    "numeric_columns": schema_types["numeric"],
    "categorical_columns": schema_types["categorical"],
    "boolean_columns": schema_types["boolean"],
    "datetime_columns": schema_types["datetime"],
    "id_columns": id_cols,
    "target_candidates": target_cols,
    "low_variance_columns": low_var_cols,
    "n_rows": current_df.shape[0],
    "n_columns": current_df.shape[1]
}

print("✔ Corrected schema:")
full_schema


clean_step_4 = drop_useless_columns(clean_step_3, full_schema)
clean_step_4.head()




 Rebuilding schema with correct logic...
✔ Corrected schema:
Columns to drop: ['Id', 'SepalWidthCm', 'PetalWidthCm']
Dropping useless columns complete.


Unnamed: 0,SepalLengthCm,PetalLengthCm,Species
0,5.1,1.4,Iris-Setosa
1,4.9,1.4,Iris-Setosa
2,4.7,1.3,Iris-Setosa
3,4.6,1.5,Iris-Setosa
4,5.0,1.4,Iris-Setosa


In [36]:
print("Full schema object:")
for k, v in full_schema.items():
    print(k, ":", v)


Full schema object:
numeric_columns : ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
categorical_columns : ['Species']
boolean_columns : []
datetime_columns : []
id_columns : ['Id', 'SepalWidthCm', 'PetalWidthCm']
target_candidates : ['Species']
low_variance_columns : []
n_rows : 150
n_columns : 6


In [37]:
clean_step_4 = drop_useless_columns(clean_step_3, full_schema)
clean_step_4.head()


Columns to drop: ['Id', 'SepalWidthCm', 'PetalWidthCm']
Dropping useless columns complete.


Unnamed: 0,SepalLengthCm,PetalLengthCm,Species
0,5.1,1.4,Iris-Setosa
1,4.9,1.4,Iris-Setosa
2,4.7,1.3,Iris-Setosa
3,4.6,1.5,Iris-Setosa
4,5.0,1.4,Iris-Setosa


In [38]:
print("ID columns detected:", full_schema["id_columns"])


ID columns detected: ['Id', 'SepalWidthCm', 'PetalWidthCm']


In [39]:
schema_types = detect_column_types(current_df)

id_cols = detect_id_columns(current_df)
print("Fresh ID cols:", id_cols)

target_cols = detect_target_column(current_df, id_cols)

low_var_cols = detect_low_variance_columns_fixed(
    current_df,
    schema={"categorical_columns": schema_types["categorical"]}
)

# rebuild schema cleanly
full_schema = {
    "numeric_columns": schema_types["numeric"],
    "categorical_columns": schema_types["categorical"],
    "boolean_columns": schema_types["boolean"],
    "datetime_columns": schema_types["datetime"],
    "id_columns": id_cols,
    "target_candidates": target_cols,
    "low_variance_columns": low_var_cols,
    "n_rows": current_df.shape[0],
    "n_columns": current_df.shape[1]
}

print("FINAL schema:")
for k, v in full_schema.items():
    print(k, ":", v)


Fresh ID cols: ['Id', 'SepalWidthCm', 'PetalWidthCm']
FINAL schema:
numeric_columns : ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
categorical_columns : ['Species']
boolean_columns : []
datetime_columns : []
id_columns : ['Id', 'SepalWidthCm', 'PetalWidthCm']
target_candidates : ['Species']
low_variance_columns : []
n_rows : 150
n_columns : 6


In [40]:
print("Unique in SepalWidth:", clean_step_3["SepalWidthCm"].nunique(), "/", len(clean_step_3))
print("Unique in PetalWidth:", clean_step_3["PetalWidthCm"].nunique(), "/", len(clean_step_3))


Unique in SepalWidth: 23 / 150
Unique in PetalWidth: 22 / 150


In [41]:
print("Fresh ID cols:", id_cols)


Fresh ID cols: ['Id', 'SepalWidthCm', 'PetalWidthCm']


In [42]:
def detect_id_columns(df):
    id_cols = []
    n_rows = len(df)

    print("----- DEBUG detect_id_columns -----")
    print("n_rows =", n_rows)

    for col in df.columns:
        unique_count = df[col].nunique()
        col_lower = col.lower()

        print(f"Column: {col}, unique: {unique_count}, contains 'id'? {'id' in col_lower}")

        # Rule 1: TRUE ID = all unique
        if unique_count == n_rows:
            print(f" → Marked as ID (all values unique)")
            id_cols.append(col)
            continue

        # Rule 2: name contains 'id'
        if "id" in col_lower:
            print(f" → Marked as ID (name contains 'id')")
            id_cols.append(col)

    print("FINAL ID COLS DETECTED:", id_cols)
    print("-----------------------------------")

    return id_cols


In [43]:
id_cols = detect_id_columns(current_df)


----- DEBUG detect_id_columns -----
n_rows = 150
Column: Id, unique: 150, contains 'id'? True
 → Marked as ID (all values unique)
Column: SepalLengthCm, unique: 35, contains 'id'? False
Column: SepalWidthCm, unique: 23, contains 'id'? True
 → Marked as ID (name contains 'id')
Column: PetalLengthCm, unique: 43, contains 'id'? False
Column: PetalWidthCm, unique: 22, contains 'id'? True
 → Marked as ID (name contains 'id')
Column: Species, unique: 3, contains 'id'? False
FINAL ID COLS DETECTED: ['Id', 'SepalWidthCm', 'PetalWidthCm']
-----------------------------------


In [44]:
import re

def detect_id_columns(df):
    id_cols = []
    n_rows = len(df)

    pattern = r'(^id$|^id_|_id$|_id_|^.*_id$|^id.*$)'  # safer rules

    for col in df.columns:
        col_lower = col.lower()

        unique_count = df[col].nunique()

        # Rule 1: all unique → ID
        if unique_count == n_rows:
            id_cols.append(col)
            continue

        # Rule 2: column name contains ID as standalone word
        if re.search(pattern, col_lower):
            id_cols.append(col)

    return id_cols


In [45]:
id_cols = detect_id_columns(current_df)
print(id_cols)

clean_step_4 = drop_useless_columns(clean_step_3, full_schema)



['Id']
Columns to drop: ['Id', 'SepalWidthCm', 'PetalWidthCm']
Dropping useless columns complete.


In [46]:
print(" Rebuilding schema from scratch...")

# Step 1 — detect types
schema_types = detect_column_types(current_df)

# Step 2 — detect ID columns using FIXED FUNCTION
id_cols = detect_id_columns(current_df)
print("Detected ID columns:", id_cols)

# Step 3 — detect target column
target_cols = detect_target_column(current_df, id_cols)

# Step 4 — low variance (categorical only)
low_var_cols = detect_low_variance_columns_fixed(
    current_df,
    schema={"categorical_columns": schema_types["categorical"]}
)

# Step 5 — rebuild final schema object
full_schema = {
    "numeric_columns": schema_types["numeric"],
    "categorical_columns": schema_types["categorical"],
    "boolean_columns": schema_types["boolean"],
    "datetime_columns": schema_types["datetime"],
    "id_columns": id_cols,
    "target_candidates": target_cols,
    "low_variance_columns": low_var_cols,
    "n_rows": current_df.shape[0],
    "n_columns": current_df.shape[1]
}

print("\nFINAL SCHEMA:")
for k, v in full_schema.items():
    print(k, ":", v)


 Rebuilding schema from scratch...
Detected ID columns: ['Id']

FINAL SCHEMA:
numeric_columns : ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
categorical_columns : ['Species']
boolean_columns : []
datetime_columns : []
id_columns : ['Id']
target_candidates : ['Species']
low_variance_columns : []
n_rows : 150
n_columns : 6


In [47]:
clean_step_4 = drop_useless_columns(clean_step_3, full_schema)
clean_step_4.head()


Columns to drop: ['Id']
Dropping useless columns complete.


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-Setosa
1,4.9,3.0,1.4,0.2,Iris-Setosa
2,4.7,3.2,1.3,0.2,Iris-Setosa
3,4.6,3.1,1.5,0.2,Iris-Setosa
4,5.0,3.6,1.4,0.2,Iris-Setosa


In [48]:
# -----------------------------------------------------------
# Save Cleaned Dataset + Cleaning Summary Report
# -----------------------------------------------------------

import os
from datetime import datetime

# Create output directories if not exist
os.makedirs("../data/cleaned", exist_ok=True)
os.makedirs("../results/logs", exist_ok=True)

# File names with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_path = f"../data/cleaned/cleaned_dataset_{timestamp}.csv"
parquet_path = f"../data/cleaned/cleaned_dataset_{timestamp}.parquet"
log_path = f"../results/logs/cleaning_report_{timestamp}.txt"


# ------------------ SAVE CSV ------------------
clean_step_4.to_csv(csv_path, index=False)
print(f" Saved Cleaned CSV: {csv_path}")


# ------------------ SAVE PARQUET ------------------
clean_step_4.to_parquet(parquet_path, index=False)
print(f" Saved Parquet File: {parquet_path}")


# ------------------ SAVE CLEANING SUMMARY ------------------
with open(log_path, "w") as f:
    f.write("=== Cleaning Summary Report ===\n")
    f.write(f"Generated on: {timestamp}\n\n")

    f.write("Original Shape:\n")
    f.write(str(current_df.shape) + "\n\n")

    f.write("Final Cleaned Shape:\n")
    f.write(str(clean_step_4.shape) + "\n\n")

    f.write("Dropped Columns:\n")
    f.write(str(full_schema['id_columns'] + full_schema['low_variance_columns']) + "\n\n")

    f.write("Schema Summary:\n")
    for key, value in full_schema.items():
        f.write(f"{key}: {value}\n")

print(f" Saved Cleaning Summary: {log_path}")

clean_step_4.head()


 Saved Cleaned CSV: ../data/cleaned/cleaned_dataset_20251127_225256.csv


 Saved Parquet File: ../data/cleaned/cleaned_dataset_20251127_225256.parquet
 Saved Cleaning Summary: ../results/logs/cleaning_report_20251127_225256.txt


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-Setosa
1,4.9,3.0,1.4,0.2,Iris-Setosa
2,4.7,3.2,1.3,0.2,Iris-Setosa
3,4.6,3.1,1.5,0.2,Iris-Setosa
4,5.0,3.6,1.4,0.2,Iris-Setosa
