# Import the Required Libraries

In [1]:
import pandas as pd

# Load the Data

In [3]:
# Read the German credit CSV and discard its 'Unnamed: 0' column in one chained step.
credit_data = pd.read_csv('GermanCredit.csv').drop(columns=['Unnamed: 0'])

# Split the Data

# Load the Inference Pipeline

# Re-create Inference Pipeline

In [None]:
# Train the full pipeline on the training data and persist the fitted object.
#
# Names expected from earlier cells (not defined here):
#   X_train_full      - training DataFrame, with or without the target column
#   y_train_full      - training labels; only required when the target column
#                       is NOT present in X_train_full
#   training_pipeline - the unfitted pipeline object exposing .fit()
#   model_folder      - directory the fitted pipeline is written to
import joblib  # Imported here so this cell does not rely on a later cell having run.

# Separate the target from the features if that has not been done already.
target = 'default'
if target in X_train_full.columns:
    # Bind the labels to the name that fit() receives below. The original bound
    # only `y_train`, so fit() used an undefined or stale `y_train_full`.
    # `y_train` is still assigned for any later cell that reads it.
    y_train = y_train_full = X_train_full[target]
    X_train_full_features = X_train_full.drop(columns=[target])
else:
    # Target is assumed to be held separately in y_train_full already.
    X_train_full_features = X_train_full

# --- Train the Pipeline ---
print("\n--- Starting Pipeline Training ---")
training_pipeline.fit(X_train_full_features, y_train_full)
print("--- Pipeline Training Complete ---")

# --- Save the Fitted Pipeline ---
pipeline_filename = 'full_credit_pipeline.joblib'
pipeline_path = f"{model_folder}/{pipeline_filename}"
joblib.dump(training_pipeline, pipeline_path)
print(f"Fitted pipeline saved to {pipeline_filename}")

In [None]:
import pandas as pd
import joblib # Or however you load your pipeline

# Run the preprocessing part of the fitted pipeline on the raw evaluation data,
# recover the processed feature names where possible, and hand everything to
# the qualitative analysis function.
#
# The handler chain below names NotFittedError, so it must be in scope;
# otherwise matching any non-AttributeError exception raises NameError.
from sklearn.exceptions import NotFittedError

# --- Assume these are loaded or available ---
# loaded_pipeline: Your full, fitted training pipeline object (including the classifier)
# X_test: Your raw validation features DataFrame (the input BEFORE any processing)
# y_test: Your true validation labels for the rows of X_test
# perform_qualitative_model_analysis: the analysis function, defined elsewhere

# --- Obtain X_val_processed ---
try:
    # 1. Select all steps EXCEPT the last one (the classifier).
    #    Slicing with `[:-1]` yields a pipeline holding only the transformer steps.
    preprocessing_pipeline = loaded_pipeline[:-1]

    # 2. Apply the transform methods of all preprocessing steps to the raw validation data
    print("Applying preprocessing steps to validation data...")
    X_val_processed = preprocessing_pipeline.transform(X_test)
    # Transformers may return a sparse matrix, which pd.DataFrame cannot turn
    # into a usable frame. Densify it; ndarrays and DataFrames have no
    # `toarray` attribute and pass through untouched.
    if hasattr(X_val_processed, 'toarray'):
        X_val_processed = X_val_processed.toarray()
    print("Preprocessing complete. X_val_processed generated.")

    # 3. (Optional but Recommended) Get feature names after transformation.
    #    Often needed for SHAP plots or detailed analysis. Requires the last
    #    preprocessing step to support get_feature_names_out.
    feature_names_processed = None
    try:
        # Access the last step of the *preprocessing part* of the pipeline
        last_preprocessor_step_name = preprocessing_pipeline.steps[-1][0]
        last_preprocessor_object = preprocessing_pipeline.named_steps[last_preprocessor_step_name]

        if hasattr(last_preprocessor_object, 'get_feature_names_out'):
            feature_names_processed = last_preprocessor_object.get_feature_names_out()
            print(f"Successfully retrieved {len(feature_names_processed)} feature names after processing.")
        elif isinstance(X_val_processed, pd.DataFrame):
            # The last step cannot report names, but the output kept its columns.
            feature_names_processed = X_val_processed.columns.tolist()
            print("Retrieved feature names from DataFrame columns.")
        else:
            print("Warning: Could not automatically retrieve feature names after processing.")
            print("The last preprocessing step doesn't have 'get_feature_names_out'.")
            # Names may need to be constructed manually from the pipeline steps.

    except Exception as e:
        # Best effort only: missing names must not abort the analysis.
        print(f"Warning: Error retrieving feature names: {e}")

    # 4. Make sure the processed data is a DataFrame aligned with X_test's index.
    if isinstance(X_val_processed, pd.DataFrame):
        X_val_processed_df = X_val_processed  # It's already a DataFrame
        if feature_names_processed is None:  # Try to get names if not already found
            feature_names_processed = X_val_processed_df.columns.tolist()
    elif feature_names_processed is not None:
        X_val_processed_df = pd.DataFrame(X_val_processed,
                                          columns=feature_names_processed,
                                          index=X_test.index)  # Preserve original index!
        print("Converted processed data to DataFrame.")
    else:
        # Array output and no names available: fall back to positional columns.
        X_val_processed_df = pd.DataFrame(X_val_processed, index=X_test.index)
        print("Warning: Processed data is likely a NumPy array, creating DataFrame without column names.")

    # --- Now you can call the analysis function ---
    print("\nCalling qualitative analysis function...")
    perform_qualitative_model_analysis(
        model=loaded_pipeline.named_steps['xgb_classifier'],  # Get model from full pipeline
        X_val_processed=X_val_processed_df,  # Pass the processed data (ideally DataFrame)
        y_val=y_test,  # True labels for validation set
        X_val_raw=X_test,
        threshold=0.4,  # Your chosen threshold
        num_samples_to_show=7,
        class_names=['No Default', 'Default'],
        subgroup_col='job',
        feature_names=feature_names_processed  # Pass the retrieved feature names
    )

# NotFittedError subclasses both ValueError and AttributeError, so it has to be
# tested BEFORE AttributeError or its dedicated message can never be reached.
except NotFittedError:
    print("\nError: The pipeline must be fitted before calling transform.")
except AttributeError as e:
    print("\nError: Could not slice the pipeline. Is it a fitted Scikit-learn Pipeline object?")
    # The AttributeError may also come from a later step, so show its message.
    print(f"Details: {e}")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")