In [16]:
# Setup: imports, settings, and helpers
import os
import sys
import json
import math
import numpy as np
import pandas as pd
import pandera as pa
from pandera import Column, DataFrameSchema, Check
import warnings

In [17]:
# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Display settings: compact float printing for numpy, wide tables for pandas
np.set_printoptions(suppress=True, precision=4)
pd.options.display.max_columns = 120
pd.options.display.width = 160

# Location of the input dataset (relative to the working directory)
DATA_PATH = 'lending_club_dataset.csv'

In [18]:
# Utility: read a CSV, failing fast with a clear message when the file is missing
def read_csv_safely(path: str) -> pd.DataFrame:
    """Load the CSV at `path` into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed file, with dtypes inferred by pandas.

    Raises
    ------
    FileNotFoundError
        If `path` does not point to an existing regular file (this includes
        the case where `path` is a directory).
    """
    # isfile rather than exists: a directory at `path` must be rejected here,
    # otherwise pandas fails later with a platform-dependent OSError.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Dataset not found at {path}. Please ensure the CSV is in the workspace root.")
    # low_memory=False turns off pandas' chunked parsing, so each column's dtype
    # is inferred from the whole column instead of chunk by chunk.
    return pd.read_csv(path, low_memory=False)

In [20]:
# Read the raw dataset; on failure, report the reason and let the error propagate
try:
    df_raw = read_csv_safely(DATA_PATH)
except Exception as load_error:
    print("Failed to load dataset:", load_error)
    raise
else:
    print("Loaded dataset with shape:", df_raw.shape)

Loaded dataset with shape: (10000, 28)


In [22]:
# Build a schema/profile summary
def top5_summary(s: pd.Series) -> str:
    """Describe the five most frequent values of `s` as one line of text.

    Each entry is rendered as "value:count" and entries are joined with "; ".
    Missing values are counted as well and shown as the literal 'NaN'. A value
    whose text form exceeds 60 characters is cut to 60 characters followed by
    an ellipsis.
    """
    max_chars = 60
    leading_counts = s.value_counts(dropna=False).head(5)

    parts = []
    for value, count in leading_counts.items():
        if pd.isna(value):
            label = 'NaN'
        else:
            label = str(value)
            # keep long free-text values readable in the profile table
            if len(label) > max_chars:
                label = label[:max_chars] + '…'
        parts.append(f"{label}:{int(count)}")
    return "; ".join(parts)

# One row per raw column: dtype name, distinct non-null values, missing count,
# and a text summary of the five most frequent values.
schema_profile = pd.DataFrame({
    'column': df_raw.columns,
    'dtype': [col_dtype.name for col_dtype in df_raw.dtypes],
    'n_unique': df_raw.nunique(dropna=True).to_list(),
    'n_missing': [int(n_null) for n_null in df_raw.isna().sum()],
    'top_5_values': [top5_summary(values) for _, values in df_raw.items()],
})

schema_profile

Unnamed: 0,column,dtype,n_unique,n_missing,top_5_values
0,Id,int64,10000,0,1:1; 6671:1; 6664:1; 6665:1; 6666:1
1,is_bad,int64,2,0,0:8705; 1:1295
2,emp_title,object,8183,592,NaN:592; US Army:37; Bank of America:23; IBM:2...
3,emp_length,object,14,0,10:2160; 1:2083; 2:1183; 3:1010; 4:889
4,home_ownership,object,5,0,RENT:4745; MORTGAGE:4445; OWN:775; OTHER:34; N...
5,annual_inc,float64,1901,1,60000.0:381; 50000.0:267; 40000.0:222; 75000.0...
6,verification_status,object,3,0,not verified:4367; VERIFIED - income:3214; VER...
7,pymnt_plan,object,2,0,n:9998; y:2
8,Notes,object,6760,3231,NaN:3231; Personal Loan:3; Debt Consolidation:...
9,purpose_cat,object,27,0,debt consolidation:4454; credit card:1273; oth...


In [26]:
def summarize_near_constant_features(
    df: pd.DataFrame,
    *,
    dominant_thresh: float = 0.98,   # flag if top value covers ≥ this fraction of non-null values
    max_unique_for_flag: int | None = None,  # optionally also require unique values ≤ this number
    min_non_null: int = 5,           # skip columns with too few non-nulls
    treat_bool_as_categorical: bool = True
) -> pd.DataFrame:
    """
    Analyze columns for 'near-constant' behavior and return a summary DataFrame.

    Columns are flagged when the dominant (most frequent) value's share among non-null values
    is ≥ dominant_thresh. Optionally (if max_unique_for_flag is set), we also require that the
    number of unique non-null values ≤ max_unique_for_flag.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    dominant_thresh : float, default 0.98
        Threshold for dominant value proportion to flag near-constant.
    max_unique_for_flag : int or None, default None
        If set, near-constant flag additionally requires unique_count ≤ this.
        (E.g., set to 2 to flag only nearly-all-0/1 columns.)
    min_non_null : int, default 5
        Skip columns with fewer than this many non-null observations.
    treat_bool_as_categorical : bool, default True
        If True, boolean columns are summarized as categoricals.

    Returns
    -------
    pd.DataFrame
        Columns:
        - column
        - dtype
        - non_null_count
        - missing_rate
        - unique_count
        - dominant_value
        - dominant_count
        - dominant_share
        - minority_count
        - variance (numeric only; else NaN)
        - std (numeric only; else NaN)
        - entropy_bits (Shannon entropy base-2 on value distribution)
        - is_binary_like (unique_count == 2)
        - near_constant_flag (boolean)
    """
    summaries = []

    for col in df.columns:
        s = df[col]
        non_null = s.dropna()
        non_null_count = int(non_null.shape[0])
        missing_rate = 1.0 - (non_null_count / max(1, s.shape[0]))

        if non_null_count < min_non_null:
            # Not enough data to assess; still record minimal info
            summaries.append({
                "column": col,
                "dtype": s.dtype.name,
                "non_null_count": non_null_count,
                "missing_rate": missing_rate,
                "unique_count": non_null.nunique(dropna=True),
                "dominant_value": np.nan,
                "dominant_count": 0,
                "dominant_share": np.nan,
                "minority_count": 0,
                "variance": np.nan,
                "std": np.nan,
                "entropy_bits": np.nan,
                "is_binary_like": False,
                "near_constant_flag": False
            })
            continue

        # Handle dtype classification
        is_bool = pd.api.types.is_bool_dtype(s)
        is_numeric = pd.api.types.is_numeric_dtype(s) and not (is_bool and treat_bool_as_categorical)

        # Value counts for frequencies
        vc = non_null.value_counts(dropna=False)
        dominant_value = vc.index[0]
        dominant_count = int(vc.iloc[0])
        unique_count = int(vc.shape[0])
        dominant_share = dominant_count / non_null_count
        minority_count = non_null_count - dominant_count

        # Entropy (base-2)
        probs = (vc / non_null_count).to_numpy()
        entropy_bits = float(-np.sum(probs * np.log2(probs))) if unique_count > 1 else 0.0

        # Numeric variance/std if applicable
        if is_numeric:
            variance = float(non_null.astype(float).var(ddof=1)) if non_null_count > 1 else 0.0
            std = float(np.sqrt(variance))
        else:
            variance = np.nan
            std = np.nan

        # Binary-like flag
        is_binary_like = (unique_count == 2)

        # Near-constant logic
        meets_share = dominant_share >= dominant_thresh
        meets_unique = True if max_unique_for_flag is None else (unique_count <= max_unique_for_flag)
        near_constant_flag = bool(meets_share and meets_unique)

        summaries.append({
            "column": col,
            "dtype": s.dtype.name,
            "non_null_count": non_null_count,
            "missing_rate": missing_rate,
            "unique_count": unique_count,
            "dominant_value": dominant_value,
            "dominant_count": dominant_count,
            "dominant_share": dominant_share,
            "minority_count": minority_count,
            "variance": variance,
            "std": std,
            "entropy_bits": entropy_bits,
            "is_binary_like": is_binary_like,
            "near_constant_flag": near_constant_flag
        })

    out = pd.DataFrame(summaries)
    # Order: flags first, then by dominant_share desc, then low entropy
    out = out.sort_values(
        by=["near_constant_flag", "dominant_share", "entropy_bits"],
        ascending=[False, False, True],
        kind="mergesort"
    ).reset_index(drop=True)
    return out

# Profile every raw column with the default settings (dominant_thresh=0.98,
# min_non_null=5); flagged near-constant columns are sorted to the top.
df_summary = summarize_near_constant_features(df_raw)
df_summary

Unnamed: 0,column,dtype,non_null_count,missing_rate,unique_count,dominant_value,dominant_count,dominant_share,minority_count,variance,std,entropy_bits,is_binary_like,near_constant_flag
0,collections_12_mths_ex_med,float64,9968,0.0032,1,0.0,9968,1.0,0,0.0,0.0,0.0,False,True
1,pymnt_plan,object,10000,0.0,2,n,9998,0.9998,2,,,0.002746,True,True
2,initial_list_status,object,10000,0.0,2,f,9983,0.9983,17,,,0.018091,True,True
3,pub_rec,float64,9995,0.0005,4,0.0,9422,0.942671,573,0.06312406,0.251245,0.332413,False,False
4,delinq_2yrs,float64,9995,0.0005,10,0.0,8910,0.891446,1085,0.2563092,0.50627,0.617712,False,False
5,is_bad,int64,10000,0.0,2,0,8705,0.8705,1295,0.112741,0.335769,0.556065,True,False
6,home_ownership,object,10000,0.0,5,RENT,4745,0.4745,5255,,,1.345444,False,False
7,inq_last_6mths,float64,9995,0.0005,20,0.0,4602,0.46043,5393,2.178729,1.476052,2.054252,False,False
8,purpose_cat,object,10000,0.0,27,debt consolidation,4454,0.4454,5546,,,2.827752,False,False
9,verification_status,object,10000,0.0,3,not verified,4367,0.4367,5633,,,1.543587,False,False
