In [1]:
"""
Check if the order of the dataframes
stays the same after imputation.
"""

'\nCheck if the order of the dataframes\nstays the same after imputation.\n'

In [2]:
import csv
import pandas as pd
import numpy as np

In [3]:
# Original test split. Later cells treat its NaN cells as the "missing" entries
# and its non-NaN cells as the reference values.
missing_df = pd.read_csv(r"CSV/Imports/o1_X_test.csv", low_memory=False)

# Counterpart file that is compared against missing_df cell by cell
# (presumably the output of an imputation run over the same split — TODO confirm).
impute_df = pd.read_csv(r"CSV/Imports/o1_X_test_rank1.csv", low_memory=False)

In [4]:
# Sanity check: report any NaN left anywhere in impute_df.
# The per-column NaN counts are computed once and reused for the total and
# for the per-column report, instead of rescanning the frame for each message.
missing_per_column = impute_df.isnull().sum()
total_missing = missing_per_column.sum()

if total_missing > 0:
    # The messages name impute_df, which is the frame actually inspected here.
    print(f"There are {total_missing} missing values in impute_df.")

    # Show counts per column (only those with missing values)
    print("\nMissing values per column:")
    print(missing_per_column[missing_per_column > 0])
else:
    print("No missing values found in impute_df.")

No missing values found in temp_impute_df.


In [5]:
# Narrow both frames to the 'age' column for a focused spot check.
# The double brackets keep each result a one-column DataFrame (not a Series),
# which is what the `.at[idx, 'age']` lookups in the following cells expect.
temp_impute_df, temp_missing_df = (
    frame[['age']] for frame in (impute_df, missing_df)
)

In [6]:
# Compare 'age' before and after imputation, allowing for floating-point noise.
original_age = temp_missing_df['age']
imputed_age = temp_impute_df['age']

# Only rows whose original value is present are checked. np.isclose(nan, x) is
# False, so without this mask every originally-missing age would be reported
# as a mismatch even though imputation is expected to replace it.
originally_present = original_age.notna().to_numpy()

# Compare with tolerance
mask_diff = originally_present & ~np.isclose(
    original_age.to_numpy(),
    imputed_age.to_numpy(),
    rtol=1e-5,
    atol=1e-8
)

# If there are differences, print them
if mask_diff.any():
    print(f"{mask_diff.sum()} rows fail the np.allclose check for 'age':\n")
    for idx in original_age.index[mask_diff]:
        original_val = original_age.at[idx]
        imputed_val = imputed_age.at[idx]
        print(f"Row {idx}: original={original_val}, imputed={imputed_val}, "
              f"diff={abs(original_val - imputed_val)}")
else:
    print("All 'age' values match within the given tolerance.")

All 'age' values match within the given tolerance.


In [7]:
# Exact (bit-for-bit) comparison of 'age' before and after imputation.
original_age = temp_missing_df['age']
imputed_age = temp_impute_df['age']

# NaN != x is always True, so originally-missing rows are excluded: they are
# expected to change under imputation and must not be reported as mismatches.
mask_diff = (original_age != imputed_age) & original_age.notna()

if mask_diff.any():
    print(f"{mask_diff.sum()} rows differ exactly in 'age':\n")
    for idx in original_age.index[mask_diff]:
        print(f"Row {idx}: original={original_age.at[idx]}, "
              f"imputed={imputed_age.at[idx]}")
else:
    print("All 'age' values match exactly.")

# Side-by-side view of both columns for eyeballing; built once
# (the frame was previously constructed twice with identical contents).
comparison_df = pd.DataFrame({
    'original_age': original_age,
    'imputed_age': imputed_age
})
display(comparison_df)

All 'age' values match exactly.


Unnamed: 0,original_age,imputed_age
0,38,38.0
1,38,38.0
2,38,38.0
3,38,38.0
4,38,38.0
...,...,...
15307,66,66.0
15308,66,66.0
15309,66,66.0
15310,66,66.0


In [8]:
# Find every cell that was present (non-NaN) in missing_df but holds a different
# value in impute_df. Numeric columns are compared with a tolerance, all other
# columns by exact equality. The result is a long table, one row per changed cell.

# Tolerances for the numeric comparison.
# NOTE(review): RTOL is roughly the size of float32 machine epsilon (~1.2e-07), and
# the cells reported by this check differ only around the 7th significant digit,
# so they look like float32 round-off rather than real changes. The 'age' check
# in In [6] uses rtol=1e-5 — confirm which tolerance is intended.
RTOL = 1e-07
ATOL = 1e-08

# The masks below are built on missing_df's rows and applied to impute_df, so both
# frames must carry the same row labels in the same order. Fail early with a clear
# message instead of an indexing error or a positional mismatch further down.
if not missing_df.index.equals(impute_df.index):
    raise ValueError(
        "missing_df and impute_df do not share the same row index "
        f"({len(missing_df)} vs {len(impute_df)} rows); cannot compare cell by cell."
    )

# Restrict both frames to the columns they share, in the same column order
common_cols = missing_df.columns.intersection(impute_df.columns)
missing_al = missing_df[common_cols].copy()
impute_al  = impute_df[common_cols].copy()

# Mask: only cells that are non-NaN in the original missing_df
non_missing_mask = missing_al.notna()

# Prepare an empty mask for differences
diff_mask = pd.DataFrame(False, index=missing_al.index, columns=missing_al.columns)

# Split numeric vs non-numeric columns (decided by missing_df's dtypes)
num_cols = missing_al.select_dtypes(include=[np.number]).columns
non_num_cols = [c for c in missing_al.columns if c not in num_cols]

# --- Numeric columns: use tolerance ---
for c in num_cols:
    a = missing_al[c]
    b = impute_al[c]
    mask = non_missing_mask[c]
    # np.isclose: True if close -> so we invert for "different"
    diff_mask.loc[mask, c] = ~np.isclose(
        a[mask].to_numpy(),
        b[mask].to_numpy(),
        rtol=RTOL, atol=ATOL, equal_nan=True
    )

# --- Non-numeric columns: exact match ---
for c in non_num_cols:
    a = missing_al[c]
    b = impute_al[c]
    mask = non_missing_mask[c]
    diff_mask.loc[mask, c] = (a[mask] != b[mask])

# Build long table of differences from the positions flagged in diff_mask
rows, cols = np.where(diff_mask.to_numpy())
records = [
    {
        "row": missing_al.index[r],
        "column": missing_al.columns[c],
        "original": missing_al.iat[r, c],
        "imputed": impute_al.iat[r, c],
    }
    for r, c in zip(rows, cols)
]

# Fixing the columns keeps the table's shape stable even when nothing differs.
diff_df = pd.DataFrame(records, columns=["row", "column", "original", "imputed"])

print(f"Found {len(diff_df)} cells that were non-NaN in missing_df but changed in impute_df "
      f"(with tolerance).")
display(diff_df.head(50))

Found 5 cells that were non-NaN in missing_df but changed in impute_df (with tolerance).


Unnamed: 0,row,column,original,imputed
0,698,Pain_Level_(Mean),4.071429,4.071429
1,12033,GCS_(Mean),8.142857,8.142858
2,12496,Respiratory_Rate_(insp/min)_(Mean),8.142857,8.142858
3,13571,Heart_Rate_(bpm)_(Mean),68.292683,68.29269
4,13574,Heart_Rate_(bpm)_(Mean),68.292683,68.29269
