In [1]:
import pandas as pd
import os
from calculate_all_features_drift import calculate_all_features_drift
from calculate_feature_drift import calculate_feature_drift


# Make sure the folder that receives every CSV written below exists.
# exist_ok=True keeps a re-run from raising when the folder is already there.
os.makedirs(name="test_results", exist_ok=True)


In [2]:
# ------------------------------
# TEST 1: Standard domain binning test
# ------------------------------
# Reference sample: ten rows with an integer 'age' column and a string 'education' column.
reference_df_1 = pd.DataFrame(dict(
    age=[23, 34, 45, 52, 29, 41, 36, 48, 51, 26],
    education=[
        'Bachelors', 'HS-grad', 'Masters', 'Some-college', 'HS-grad',
        'Masters', 'Bachelors', 'HS-grad', 'Some-college', 'Assoc-acdm',
    ],
))

# New sample with the same two columns; it contains 'Assoc-voc', which the reference does not.
new_df_1 = pd.DataFrame(dict(
    age=[24, 32, 46, 54, 28, 42, 39, 50, 53, 27],
    education=[
        'HS-grad', 'Bachelors', 'Some-college', 'Masters', 'HS-grad',
        'Bachelors', 'Assoc-voc', 'HS-grad', 'Some-college', 'Masters',
    ],
))

# Per-feature bin specification shared by every test in this notebook:
# a list of numbers for 'age' (presumably bin edges - confirm against the drift helper)
# and the list of category labels for 'education'.
domain_bins = dict(
    age=[0, 25, 35, 45, 55, 100],
    education=['Bachelors', 'HS-grad', 'Masters', 'Some-college', 'Assoc-acdm', 'Assoc-voc'],
)

# NOTE(review): the recorded run of this cell ended with
# "TypeError: '<' not supported between instances of 'int' and 'dict'".
# Presumably the whole `bins` dict reaches a numeric comparison inside the helper -
# confirm the `bins` format that calculate_all_features_drift expects for method='domain'.
csi_df_1, details_1 = calculate_all_features_drift(
    reference_df_1,
    new_df_1,
    bins=domain_bins,
    method='domain',
)

# Persist the summary table (without its index) and one details file per feature.
csi_df_1.to_csv("test_results/CSI_summary_test1.csv", index=False)
for feature_name, detail_df in details_1.items():
    detail_df.to_csv(f"test_results/{feature_name}_details_test1.csv")



TypeError: '<' not supported between instances of 'int' and 'dict'

In [None]:
# ------------------------------
# TEST 2: New data missing some bins
# ------------------------------
# Start from the TEST 1 new sample and collapse 'education' to a single label,
# so every other category listed in domain_bins has no rows in the new data.
new_df_2 = new_df_1.copy()
new_df_2['education'] = ['HS-grad'] * 10  # No other categories (one label per row)

csi_df_2, details_2 = calculate_all_features_drift(
    reference_df_1,
    new_df_2,
    bins=domain_bins,
    method='domain',
)

# Persist the summary table (without its index) and one details file per feature.
csi_df_2.to_csv("test_results/CSI_summary_test2_missing_bins.csv", index=False)
for feature_name, detail_df in details_2.items():
    detail_df.to_csv(f"test_results/{feature_name}_details_test2.csv")

# ------------------------------
# TEST 3: Completely stable data (identical distributions)
# ------------------------------
# The "new" sample is an independent copy of the reference sample,
# so both inputs to the drift calculation hold exactly the same values.
new_df_3 = reference_df_1.copy(deep=True)

csi_df_3, details_3 = calculate_all_features_drift(
    reference_df_1,
    new_df_3,
    bins=domain_bins,
    method='domain',
)

# Persist the summary table (without its index) and one details file per feature.
csi_df_3.to_csv("test_results/CSI_summary_test3_identical.csv", index=False)
for feature_name, detail_df in details_3.items():
    detail_df.to_csv(f"test_results/{feature_name}_details_test3.csv")

In [None]:
# ------------------------------
# TEST 4: Drift in age only
# ------------------------------
# Same rows as the reference sample, except that 'age' is replaced by much larger
# values (older population); 'education' is carried over unchanged.
new_df_4 = reference_df_1.assign(
    age=[90, 85, 80, 75, 70, 65, 60, 95, 100, 88],  # Older population
)

csi_df_4, details_4 = calculate_all_features_drift(
    reference_df_1,
    new_df_4,
    bins=domain_bins,
    method='domain',
)

# Persist the summary table (without its index) and one details file per feature.
csi_df_4.to_csv("test_results/CSI_summary_test4_drift_age.csv", index=False)
for feature_name, detail_df in details_4.items():
    detail_df.to_csv(f"test_results/{feature_name}_details_test4.csv")

# ------------------------------
# TEST 5: Unseen categories in new data (edge case)
# ------------------------------
# Copy the TEST 1 new sample and overwrite the first row's 'education' with a label
# that is not listed in domain_bins['education'].
new_df_5 = new_df_1.copy()
new_df_5.at[0, 'education'] = 'PhD'  # Category not in domain

csi_df_5, details_5 = calculate_all_features_drift(
    reference_df_1,
    new_df_5,
    bins=domain_bins,
    method='domain',
)

# Persist the summary table (without its index) and one details file per feature.
csi_df_5.to_csv("test_results/CSI_summary_test5_unseen_category.csv", index=False)
for feature_name, detail_df in details_5.items():
    detail_df.to_csv(f"test_results/{feature_name}_details_test5.csv")

# Final confirmation once this cell has run to the end.
print("✅ All test outputs saved to 'test_results/' folder.")
