# Model Evaluation

Evaluate a trained LSTM model on holdout data with baseline comparison and feature importance analysis.


In [1]:
import sys
import os
import warnings

sys.path.insert(0, os.path.dirname(os.getcwd()))

import numpy as np
import pandas as pd
import joblib

from storage import ModelPersistence
from evaluation import ModelEvaluator, FeatureImportanceAnalyzer

warnings.filterwarnings("ignore")


  if not hasattr(np, "object"):


## 1. Configuration


In [None]:
MODELS_DIR = os.path.join(os.getcwd(), "..", "models_lstm")
DATA_PATH = os.path.join(os.getcwd(), "..", "data", "processed", "dp_split_datasets_full_simple.joblib")

DECISION_POINT = "DP 1"  # Change this to evaluate different decision points


## 2. List Available Models


In [3]:
available = [d.replace("_", " ") for d in os.listdir(MODELS_DIR) if os.path.isdir(os.path.join(MODELS_DIR, d))]
print(f"Available models: {len(available)}")
print(", ".join(sorted(available, key=lambda x: int(x.split()[1]))))


Available models: 43
DP 1, DP 3, DP 4, DP 5, DP 7, DP 8, DP 9, DP 10, DP 11, DP 12, DP 13, DP 14, DP 15, DP 16, DP 17, DP 19, DP 20, DP 21, DP 22, DP 23, DP 25, DP 26, DP 27, DP 28, DP 29, DP 30, DP 31, DP 32, DP 33, DP 34, DP 35, DP 36, DP 37, DP 39, DP 40, DP 41, DP 42, DP 43, DP 44, DP 46, DP 47, DP 48, DP 49


## 3. Load Model and Data


In [4]:
model_path = os.path.join(MODELS_DIR, DECISION_POINT.replace(" ", "_"))
bundle = ModelPersistence.load(model_path)

print(f"Loaded model for {DECISION_POINT}")
print(f"  Activities: {len(bundle['activity_encoder'].classes_)}")
print(f"  Resources: {len(bundle['resource_encoder'].classes_)}")
print(f"  Classes: {list(bundle['label_encoder'].classes_)}")
print(f"  Max sequence length: {bundle['max_seq_len']}")


Loaded model for DP 1
  Activities: 2
  Resources: 112
  Classes: ['A_Concept', 'A_Submitted', 'W_Complete application']
  Max sequence length: 1


In [6]:
splits = joblib.load(DATA_PATH)
df_holdout = splits[DECISION_POINT]["holdout"]

print(f"Holdout set: {len(df_holdout)} samples")
print(f"Label distribution:")
print(df_holdout["label"].value_counts())


EOFError: Ran out of input

## 4. Evaluate Model


In [None]:
evaluator = ModelEvaluator(bundle)
X_test, y_test = evaluator.prepare_holdout(df_holdout)

report = evaluator.evaluate(X_test, y_test, print_report=True)


## 5. Compare with Baseline


In [None]:
print(f"\n=== Comparison for {DECISION_POINT} ===")
comparison = evaluator.compare_with_baseline(df_holdout, print_result=True)


## 6. Feature Importance


In [None]:
analyzer = FeatureImportanceAnalyzer(bundle["model"])

importances, feature_names, baseline_acc = analyzer.all_features_importance(
    X_test, y_test,
    context_feature_names=bundle["context_keys"],
    n_repeats=3
)

print(f"\nBaseline accuracy: {baseline_acc:.3f}")
print("\nFeature importances:")
for name, imp in sorted(zip(feature_names, importances), key=lambda x: -x[1]):
    print(f"  {name}: {imp:.4f}")


In [None]:
FeatureImportanceAnalyzer.plot(
    feature_names, importances,
    title=f"Feature Importance - {DECISION_POINT}"
)


## 7. Evaluate All Models


In [None]:
results = []

for dp in splits.keys():
    dp_path = os.path.join(MODELS_DIR, dp.replace(" ", "_"))
    if not os.path.exists(dp_path):
        continue
    
    bundle = ModelPersistence.load(dp_path)
    df = splits[dp]["holdout"]
    
    if df.empty:
        continue
    
    evaluator = ModelEvaluator(bundle)
    comparison = evaluator.compare_with_baseline(df, print_result=False)
    
    if comparison:
        results.append({
            "decision_point": dp,
            "f1_lstm": comparison["f1_lstm"],
            "f1_baseline": comparison["f1_baseline"],
            "improvement": comparison["relative_improvement"]
        })

df_results = pd.DataFrame(results)
df_results = df_results.sort_values("improvement", ascending=False)

print(f"\n=== All Models Summary ===")
print(f"Average F1 (LSTM): {df_results['f1_lstm'].mean():.3f}")
print(f"Average F1 (Baseline): {df_results['f1_baseline'].mean():.3f}")
print(f"Average Improvement: {df_results['improvement'].mean():.2f}%")
print("\nTop 5 improved:")
print(df_results.head())
