In [1]:
import pandas as pd
import os

def clean_csv(input_path, required_cols):
    """Write a copy of a CSV without the rows that lack a required value.

    A cell in a required column counts as empty when it is missing after
    parsing, or when its text (with surrounding whitespace stripped) is
    '', 'nan', 'NaN' or 'None'. Rows with an empty cell in any required
    column are dropped. In the rows that are kept, required columns are
    written as their whitespace-stripped text; other columns are written
    as parsed.

    The result is saved next to the input as ``<name>_no_na<ext>`` and a
    short summary is printed.

    Args:
        input_path: Path to the CSV file to read.
        required_cols: Iterable of column names that must hold a value.

    Returns:
        The path of the CSV file that was written.

    Raises:
        ValueError: If a required column is not present in the CSV.
    """
    required_cols = list(required_cols)  # allow one-shot iterables; we loop twice
    empty_tokens = ['', 'nan', 'NaN', 'None']

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, so large files do not end up with
    # mixed-type columns (and the DtypeWarning that accompanies them).
    df = pd.read_csv(input_path, low_memory=False)

    # Fail fast: check every required column before touching any data.
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in CSV")

    incomplete = pd.Series(False, index=df.index)
    for col in required_cols:
        # Capture real missing values before stringifying: whether astype(str)
        # turns them into the text 'nan' depends on the column's dtype.
        absent = df[col].isna()
        text = df[col].astype(str).str.strip()
        df[col] = text
        incomplete |= absent | text.isin(empty_tokens)

    cleaned_df = df.loc[~incomplete]

    dir_name, file_name = os.path.split(input_path)
    base, ext = os.path.splitext(file_name)
    output_path = os.path.join(dir_name, f"{base}_no_na{ext}")

    cleaned_df.to_csv(output_path, index=False)
    print(f"Saved cleaned CSV to: {output_path}")
    print(f"Dropped rows: {len(df) - len(cleaned_df)} / {len(df)}")
    return output_path

# Example call (replace with your actual path and columns).
# The data directory is resolved under the current user's home directory
# rather than spelling out one specific machine account in the path.
DATA_DIR = os.path.expanduser("~/Documents/GitHub/engage/data")
input_path = os.path.join(DATA_DIR, "vr_blocks_export.csv")
required_cols = ["first_name", "last_name"]

cleaned_file_path = clean_csv(input_path, required_cols)

  df = pd.read_csv(input_path)


Saved cleaned CSV to: /Users/borismartinez/Documents/GitHub/engage/data/vr_blocks_export_no_na.csv
Dropped rows: 33607 / 84616
