In [11]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
import numpy as np

# Read the evaluation dataset from CSV into a DataFrame.
new_dataset_path = "Test_Dataset.csv"  # Replace with your new dataset path
new_data = pd.read_csv(new_dataset_path)

# Restore the fitted artifacts in a fixed order: classifier, feature scaler,
# and the dict of per-column label encoders.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load artifact files that come from a trusted source.
model, scaler, label_encoders = (
    joblib.load(artifact_file)
    for artifact_file in (
        "random_forest_model.pkl",
        "scaler.pkl",
        "label_encoders.pkl",
    )
)

# Preprocess the new dataset: replace every categorical value with the integer
# code that the matching fitted encoder in `label_encoders` assigns to it.
categorical_features = ['Gender', 'Blood Type', 'Medical Condition', 'Medication']
for feature in categorical_features:
    encoder = label_encoders[feature]
    # A LabelEncoder encodes a class as its position in `classes_`, so a plain
    # dict lookup gives the same codes as transform() without calling
    # transform() and np.isin() once per row.
    code_by_class = {cls: code for code, cls in enumerate(encoder.classes_)}
    # Values the encoder never saw (including NaN) fall back to the 'Unknown'
    # class when the encoder has one, otherwise to code 0, i.e. classes_[0].
    # NOTE: classes_[0] is simply the first class in the encoder's ordering,
    # NOT the most frequent training class; the encoder does not keep
    # frequency information, so a true mode fallback needs the training data.
    fallback_code = code_by_class.get('Unknown', 0)
    new_data[feature] = (
        new_data[feature]
        .map(code_by_class)      # unseen values come back as NaN
        .fillna(fallback_code)   # ...and are replaced by the fallback code
        .astype('int64')         # undo the float upcast caused by NaN
    )

# Columns the model consumes, in the order used for X_new below.
features = ['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Medication']

# Validate processed data
print(f"Rows before preprocessing: {len(new_data)}")
# Only nulls in the model's input columns make a row unusable; nulls in
# columns the model never reads should not shrink the evaluation set.
new_data = new_data.dropna(subset=features)
print(f"Rows after preprocessing: {len(new_data)}")

# Encode the target if it exists in the new dataset
if 'Test Results' in new_data.columns:
    target_encoder = label_encoders['Test Results']
    # Keep only rows whose label is one the encoder knows (NaN counts as
    # unknown). transform() raises ValueError on unseen labels, so the filter
    # has to be applied to the frame itself before encoding.
    has_valid_target = new_data['Test Results'].isin(target_encoder.classes_)
    n_invalid_targets = int((~has_valid_target).sum())
    if n_invalid_targets:
        print(f"Dropped {n_invalid_targets} rows with missing or unknown 'Test Results' labels")
    new_data = new_data[has_valid_target]
    # Convert valid target labels to numeric format
    y_new = target_encoder.transform(new_data['Test Results'])
else:
    y_new = None  # Ground truth not available

# Prepare features only after all row filtering, so that X_new and y_new are
# taken from exactly the same rows in the same order.
X_new = new_data[features]

# Scale the features, predict, and either evaluate against the ground truth
# (when y_new is available) or print the decoded predictions.
if len(X_new) > 0:  # Ensure dataset is not empty
    # The scaler is applied to every column of X_new, encoded categoricals
    # included; presumably this mirrors how it was fitted — TODO confirm
    # against the training notebook.
    X_new_scaled = scaler.transform(X_new)

    # Predict
    y_pred = model.predict(X_new_scaled)

    if y_new is not None:
        # Evaluate the model if ground truth is available
        target_classes = label_encoders['Test Results'].classes_
        # Pin the label set to every class the encoder knows. Without
        # `labels`, classification_report raises ValueError when a class is
        # absent from both y_new and y_pred (its inferred label count no
        # longer matches target_names), and the confusion matrix changes
        # shape depending on the data.
        label_codes = list(range(len(target_classes)))

        print("Classification Report:")
        print(classification_report(
            y_new,
            y_pred,
            labels=label_codes,
            target_names=[str(cls) for cls in target_classes],
        ))
        print(f"Accuracy: {accuracy_score(y_new, y_pred):.2f}")

        # Confusion Matrix (rows: true class, columns: predicted class,
        # both in encoder order)
        cm = confusion_matrix(y_new, y_pred, labels=label_codes)
        print("Confusion Matrix:")
        print(cm)
    else:
        # Print predictions if ground truth is not available
        print("Predictions for the new dataset:")
        print(pd.DataFrame({'Predicted': label_encoders['Test Results'].inverse_transform(y_pred)}))
else:
    print("No valid rows in the new dataset after preprocessing.")


Rows before preprocessing: 1000
Rows after preprocessing: 1000
Classification Report:
              precision    recall  f1-score   support

    Abnormal       0.32      0.19      0.24       319
Inconclusive       0.29      0.09      0.14       360
      Normal       0.33      0.72      0.45       321

    accuracy                           0.32      1000
   macro avg       0.31      0.33      0.28      1000
weighted avg       0.31      0.32      0.27      1000

Accuracy: 0.32
Confusion Matrix:
[[ 60  39 220]
 [ 76  33 251]
 [ 51  40 230]]
