In [1]:
"""
Check if the order of the dataframes
stay same after imputation.
"""

'\nCheck if the order of the dataframes\nstay same after imputation.\n'

In [2]:
import csv
import pandas as pd
import numpy as np

In [12]:
# Read both CSVs with identical parser settings in a single pass:
#   impute_df  <- file under "imputed/"  (presumably the post-imputation validation split)
#   missing_df <- file under "original/" (presumably the same split before imputation)
# low_memory=False makes pandas parse each file in one chunk, so column dtypes are
# inferred from the whole column rather than chunk by chunk.
impute_df, missing_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        r"CSV/Imports/imputed/o4_X_validate_rank1.csv",
        r"CSV/Imports/original/o4_X_validate.csv",
    )
)

In [13]:
# Sanity check: the imputed frame should contain no NaNs at all.
# Count NaNs per column once and keep only the columns that actually have gaps.
# NOTE: the printed messages say "temp_impute_df"; the frame inspected here is impute_df.
null_counts = impute_df.isnull().sum()
cols_with_gaps = null_counts[null_counts > 0]

if cols_with_gaps.empty:
    print("No missing values found in temp_impute_df.")
else:
    total_missing = cols_with_gaps.sum()
    print(f"There are {total_missing} missing values in temp_impute_df.")

    # Per-column breakdown, restricted to the affected columns
    print("\nMissing values per column:")
    print(cols_with_gaps)

No missing values found in temp_impute_df.


In [14]:
# Purpose: verify that every cell that was already observed (non-NaN) in missing_df
# still holds the same value in impute_df. Any difference means the imputation step
# changed observed data or shuffled the rows.

# The comparison below is cell-by-cell on matching row labels, so both frames must
# carry the same index. Fail early with a clear message instead of an obscure
# indexing/broadcast error (or a silent partial comparison) further down.
if not missing_df.index.equals(impute_df.index):
    raise ValueError(
        f"Row index mismatch: missing_df has {len(missing_df)} rows and impute_df has "
        f"{len(impute_df)} rows (or their index labels differ); cannot compare cell by cell."
    )

# Restrict both frames to the columns they share
common_cols = missing_df.columns.intersection(impute_df.columns)
missing_al = missing_df[common_cols].copy()
impute_al  = impute_df[common_cols].copy()

# Columns present in only one frame cannot be compared; report them so that an
# empty diff is not mistaken for a full pass.
unmatched_cols = (
    [c for c in missing_df.columns if c not in impute_df.columns]
    + [c for c in impute_df.columns if c not in missing_df.columns]
)
if unmatched_cols:
    print(f"WARNING: {len(unmatched_cols)} column(s) are not present in both dataframes "
          f"and were NOT compared: {unmatched_cols}")

# Mask: only cells that are non-NaN in the original missing_df
non_missing_mask = missing_al.notna()

# Prepare an empty mask for differences
diff_mask = pd.DataFrame(False, index=missing_al.index, columns=missing_al.columns)

# Split numeric vs non-numeric columns. A column is compared with a tolerance only
# when it is numeric in BOTH frames; np.isclose raises TypeError on object arrays,
# so columns whose dtype differs between the frames fall back to exact comparison.
num_cols = missing_al.select_dtypes(include=[np.number]).columns.intersection(
    impute_al.select_dtypes(include=[np.number]).columns
)
non_num_cols = [c for c in missing_al.columns if c not in num_cols]

# --- Numeric columns: use tolerance ---
for c in num_cols:
    a = missing_al[c]
    b = impute_al[c]
    mask = non_missing_mask[c]
    # np.isclose: True if close -> so we invert for "different".
    # A NaN in the imputed frame where the original had a value counts as different.
    diff_mask.loc[mask, c] = ~np.isclose(
        a[mask].to_numpy(),
        b[mask].to_numpy(),
        rtol=1e-07, atol=1e-07, equal_nan=True
    )

# --- Non-numeric columns: exact match ---
for c in non_num_cols:
    a = missing_al[c]
    b = impute_al[c]
    mask = non_missing_mask[c]
    diff_mask.loc[mask, c] = (a[mask] != b[mask])

# Build long table of differences: one record per changed cell
rows, cols = np.where(diff_mask.values)
records = [
    {
        "row": missing_al.index[r],
        "column": missing_al.columns[c],
        "original": missing_al.iat[r, c],
        "imputed": impute_al.iat[r, c],
    }
    for r, c in zip(rows, cols)
]

# Fix the column set explicitly so the table keeps its schema even when empty
diff_df = pd.DataFrame(records, columns=["row", "column", "original", "imputed"])

print(f"Found {len(diff_df)} cells that were non-NaN in missing_df but changed in impute_df "
      f"(with tolerance).")
display(diff_df.head(50))

Found 0 cells that were non-NaN in missing_df but changed in impute_df (with tolerance).
