In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

# Read the unprocessed test split into memory.
test_df = pd.read_csv("test.csv")

# Columns to discard before further processing; per the original note, the
# training pipeline removes the same ones.
columns_to_drop = [
    "Patient Id",
    "Patient First Name",
    "Family Name",
    "Father's name",
    "Institute Name",
    "Location of Institute",
    "Disorder Subclass",
]

# Drop only the labels that are actually present, so a column missing from
# this file never raises KeyError.
test_df = test_df.drop(
    columns=[name for name in columns_to_drop if name in test_df.columns]
)


# Impute missing numeric values with each column's median.
# NOTE(review): the medians and modes below are computed from the test data
# itself; if training used its own statistics, the two may differ -- confirm.
num_cols = test_df.select_dtypes(include=['float64', 'int']).columns
if not num_cols.empty:
    test_df[num_cols] = test_df[num_cols].fillna(test_df[num_cols].median())

# Impute missing categorical (object-dtype) values with each column's mode.
cat_cols = test_df.select_dtypes(include=['object']).columns
if not cat_cols.empty:
    cat_modes = test_df[cat_cols].mode()
    # mode() returns zero rows when the frame is empty or every selected
    # column is entirely NaN; .iloc[0] would then raise IndexError, so skip
    # the fill because there is no value to fill with.
    if not cat_modes.empty:
        # Row 0 of mode() holds the first mode of every column.
        test_df[cat_cols] = test_df[cat_cols].fillna(cat_modes.iloc[0])

# Label-encode the categorical columns.
#
# Prefer encoders persisted by the training run so every category maps to the
# same integer code the model was trained on. Re-fitting on the test data can
# silently assign different codes to the same category.
# Assumes "label_encoders.pkl" (the file name suggested in the original notes)
# holds a dict of {column name: fitted LabelEncoder} -- TODO confirm against
# the training script.
# NOTE(security): joblib.load unpickles the file; only load trusted artifacts.
try:
    label_encoders = joblib.load("label_encoders.pkl")
except FileNotFoundError:
    # No saved encoders available: fall back to fitting fresh ones below.
    label_encoders = {}

for col in cat_cols:
    le = label_encoders.get(col)
    if le is not None:
        # Reuse the training-time mapping. LabelEncoder.transform raises
        # ValueError for categories it was not fitted on, which surfaces a
        # train/test mismatch instead of hiding it.
        test_df[col] = le.transform(test_df[col])
        continue

    # Fallback (original behavior): fit on the test data. The resulting codes
    # are only consistent with training if the category sets are identical.
    print(f"Warning: no saved encoder for '{col}'; fitting a new one on test data")
    le = LabelEncoder()
    test_df[col] = le.fit_transform(test_df[col])
    label_encoders[col] = le

# Persist the preprocessed frame as CSV (without the row index) and report
# the output file name.
test_df.to_csv(path_or_buf="cleaned_test_data.csv", index=False)
print("✅ Cleaned test dataset saved as cleaned_test_data.csv")


✅ Cleaned test dataset saved as cleaned_test_data.csv
