In [None]:
import os
import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
def load_models(models_folder="Models"):
    """Load every ``.joblib`` model stored in ``models_folder``.

    File names are expected to follow ``<model_name>_<cancer_type>.joblib``.
    The name is split on its LAST underscore, so a model name may itself
    contain underscores (``random_forest_lung.joblib``).  Files that end in
    ``.joblib`` but contain no underscore are reported and skipped instead of
    aborting the whole load with an unpacking error.

    Parameters
    ----------
    models_folder : str
        Directory that is scanned (non-recursively) for model files.

    Returns
    -------
    dict
        Maps ``(model_name, cancer_type)`` to the loaded object.  Note that
        ``cancer_type`` is the part of the file name after the last
        underscore and therefore still carries the ``.joblib`` suffix
        (e.g. ``"lung.joblib"``); this key format is kept unchanged for
        existing callers.

    Raises
    ------
    FileNotFoundError
        If ``models_folder`` does not exist (raised by ``os.listdir``).
    """
    models = {}
    # os.listdir returns entries in arbitrary order; sorting makes the load
    # order (and therefore the dict order seen by callers) reproducible.
    for file_name in sorted(os.listdir(models_folder)):
        if not file_name.endswith(".joblib"):
            continue

        # Extract model name and cancer type from the filename.
        model_name, separator, cancer_type = file_name.rpartition("_")
        if not separator:
            print(f"Skipping {file_name}: expected <model_name>_<cancer_type>.joblib")
            continue

        model_path = os.path.join(models_folder, file_name)
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only point this at model files from a trusted source.
        models[(model_name, cancer_type)] = joblib.load(model_path)
        print(f"Loaded model: {model_name} for cancer type: {cancer_type}")
    return models

In [None]:
def preprocess_data(new_data_path, scaler=None):
    """Read a CSV file and return its standardized feature matrix.

    The label columns ``cancer_type`` and ``type`` are removed when present
    (adjust the list to the dataset at hand).  When ``scaler`` is given, its
    ``transform`` is applied; otherwise a fresh ``StandardScaler`` is fitted
    on the loaded features themselves.

    Returns a ``(scaled_features, raw_frame)`` tuple, where ``raw_frame`` is
    the untouched DataFrame as read from ``new_data_path``.
    """
    raw_frame = pd.read_csv(new_data_path)

    # Everything except the label columns is treated as a feature.
    label_columns = ['cancer_type', 'type']
    features = raw_frame.drop(columns=label_columns, errors='ignore')

    if scaler is not None:
        # Reuse the statistics of an already fitted scaler.
        return scaler.transform(features), raw_frame

    # No scaler supplied: fit one on this data.
    fresh_scaler = StandardScaler()
    return fresh_scaler.fit_transform(features), raw_frame

In [None]:
def run_inference(models, test_data_path, scalers_folder="Scalers", confidence_threshold=0.5):
    """Score every row of a CSV file with every model and pick one label per row.

    Parameters
    ----------
    models : dict
        Maps ``(model_name, cancer_type)`` to a fitted classifier exposing
        ``predict`` and ``predict_proba``.  ``cancer_type`` may carry a file
        extension (e.g. ``"lung.joblib"``); it is stripped before use.
    test_data_path : str
        CSV file to score.  The label columns ``cancer_type`` and ``type``
        are dropped when present; every other column is used as a feature.
    scalers_folder : str
        Directory holding one ``<cancer_type>_scaler.joblib`` per cancer
        type.  Models whose scaler file is missing are reported and skipped.
    confidence_threshold : float
        A model's confidence must be strictly greater than this value for its
        cancer type to be chosen for a row.

    Returns
    -------
    list of dict
        One entry per input row, in file order, with the keys ``index``,
        ``cancer_type`` (the cancer type of the most confident model above
        the threshold, or ``"normal"`` when there is none), ``confidence``
        (that model's confidence, or ``-1.0`` for ``"normal"``) and
        ``predictions`` (a dict keyed by ``"<model_name>-<cancer_type>"``
        holding each model's ``cancer_type``, ``predicted_class``,
        ``confidence`` and ``probabilities``).
    """
    test_df = pd.read_csv(test_data_path)
    # errors='ignore': unlabeled data (no label columns) must not raise KeyError.
    test_features = test_df.drop(columns=['cancer_type', 'type'], errors='ignore')

    if len(test_df) == 0:
        # Nothing to score; scalers and models would reject a zero-row matrix.
        return []

    scaler_files = {f for f in os.listdir(scalers_folder) if f.endswith('.joblib')}

    # Plain values without column names, the same form the scalers receive
    # when rows are passed one at a time as a list.
    feature_matrix = test_features.to_numpy()

    # Scale the whole matrix once per cancer type and score each model once on
    # all rows, instead of re-loading the scaler from disk for every row.
    scaled_by_type = {}  # cancer type -> scaled matrix, or None if its scaler is missing
    model_outputs = []   # (prediction key, cancer type, probabilities, predicted classes)
    for (model_name, cancer_type_key), model in models.items():
        # Strip a trailing extension such as ".joblib" from the key.
        cancer_type = os.path.splitext(cancer_type_key)[0]

        if cancer_type not in scaled_by_type:
            scaler_filename = f"{cancer_type}_scaler.joblib"
            if scaler_filename in scaler_files:
                scaler = joblib.load(os.path.join(scalers_folder, scaler_filename))
                scaled_by_type[cancer_type] = scaler.transform(feature_matrix)
            else:
                print(f"couldn't find {cancer_type}_scaler.joblib")
                scaled_by_type[cancer_type] = None

        scaled_features = scaled_by_type[cancer_type]
        if scaled_features is None:
            continue

        model_outputs.append((
            f"{model_name}-{cancer_type}",
            cancer_type,
            model.predict_proba(scaled_features),
            model.predict(scaled_features),
        ))

    results = []
    for position, index in enumerate(test_df.index):
        row_predictions = {}  # Store model predictions and their confidence
        for key, cancer_type, probabilities, predicted_classes in model_outputs:
            row_predictions[key] = {
                'cancer_type': cancer_type,
                'predicted_class': predicted_classes[position],
                # Column 1 is taken as the confidence; assumes binary models
                # whose second class is the positive (cancer) one.
                'confidence': probabilities[position][1],
                'probabilities': probabilities[position].tolist()
            }

        # Keep the most confident model above the threshold; on ties the
        # first model (in `models` order) wins because the comparison is strict.
        best_type = None
        best_confidence = confidence_threshold
        for prediction_info in row_predictions.values():
            if prediction_info['confidence'] > best_confidence:
                best_confidence = prediction_info['confidence']
                best_type = prediction_info['cancer_type']

        results.append({
            'index': index,
            'cancer_type': best_type if best_type is not None else "normal",
            'confidence': best_confidence if best_type is not None else -1.0,
            'predictions': row_predictions
        })

    return results


In [None]:
def save_inference_results(results, output_file="inference_results.csv"):
    """Write ``results`` to ``output_file`` as CSV (no index column) and report the path.

    ``results`` is anything ``pd.DataFrame`` accepts, e.g. a list of dicts
    with one dict per row.
    """
    pd.DataFrame(results).to_csv(output_file, index=False)
    print(f"Inference results saved to {output_file}")



In [None]:
# End-to-end run: load the saved models, score the test set, write the results.
# Both paths are relative to the notebook's working directory.
models_folder = "Models"
new_data_path = "TestDataset/test_data.csv"

# Load models: dict keyed by (model_name, cancer_type) built from the file names
models = load_models(models_folder=models_folder)


# Run inference: one result dict per row of the test CSV
# (scalers are read from the default "Scalers" folder)
inference_results = run_inference(models, new_data_path)

# Save results: this CSV is read back by the accuracy cell below
save_inference_results(inference_results, output_file="inference_results.csv")

In [39]:
# Ground-truth labels: the same test CSV that was scored above.
test_df = pd.read_csv("TestDataset/test_data.csv")

# Predictions written by save_inference_results.
predictions_df = pd.read_csv("inference_results.csv")


# Row-by-row comparison of predicted vs. true label; the mean of the boolean
# Series is the fraction of matching rows.
# NOTE(review): assumes both files list the same rows in the same order, which
# holds when inference_results.csv was just produced from this test file.
accuracy = (predictions_df["cancer_type"] == test_df["cancer_type"]).mean()
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 88.24%
