In [1]:
import sys  
sys.path.insert(1, '/Users/asifahmed/Documents/Codes/MyRecourseProject')

from models.model_trainer import ModelTrainer
from evaluation.evaluator import Evaluator
from visualization.visualizer import Visualizer
from data_handling.crdit_data import Credit
from explainability.explainer_factory import get_explainer
import pandas as pd
import numpy as np
from data_handling.dataset import Dataset

# Settings for this run, gathered in one place so the steps below read as a recipe.
TARGET_COLUMN = 'D'
FEATURE_COLUMNS = ['age', 'X2']
DATA_CSV = '/Users/asifahmed/Documents/Codes/MyRecourseProject/synthetic_data/out/R1.csv'
SAMPLE_SIZE = 300  # passed to balanced_sample; presumably rows per class -- TODO confirm

# --- Data preparation -------------------------------------------------------
# The return values of these calls are not reassigned, so each step is assumed
# to update data_instance in place; keep the order as written.
data_instance = Dataset(target_column=TARGET_COLUMN)
data_instance.load_csv(DATA_CSV)
data_instance.encode_categorical_columns()
data_instance.remove_outliers()
data_instance.balanced_sample(SAMPLE_SIZE)
# Keep the target together with the two features used for the 2-D analysis.
data_instance.select_features([TARGET_COLUMN, *FEATURE_COLUMNS])

# --- Model training and evaluation ------------------------------------------
trainer = ModelTrainer()
X_train, X_test, y_train, y_test = trainer.split_data(
    data_instance.data,
    target_column=TARGET_COLUMN,
)
# The model is fit on the scaled features; the unscaled X_test is kept because
# later cells use it to show samples in their original units.
X_train_scaled, X_test_scaled = trainer.scale_features(X_train, X_test)
model = trainer.train('logistic_regression', X_train_scaled, y_train)

evaluator = Evaluator(model, X_test_scaled, y_test)
evaluator.report()

# feature_names = ('Age', 'X2')
# visualizer = Visualizer(model, X_test_scaled, y_test, X_original=X_test)
# visualizer.plot_decision_boundary_interactive(feature_names=feature_names)

CSV file loaded successfully with delimiter: ','
Encoded G
Encoded job
Encoded D
Removed 70 outliers. New dataset size: 4930
Selected features are now active: ['D', 'age', 'X2']
Data split into train and test sets.
logistic_regression model trained successfully.
Accuracy: 0.5416666666666666
Precision: 0.5689655172413793
Recall: 0.5238095238095238
F1 Score: 0.5454545454545454
Confusion Matrix:
 [[32 25]
 [30 33]]
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.56      0.54        57
           1       0.57      0.52      0.55        63

    accuracy                           0.54       120
   macro avg       0.54      0.54      0.54       120
weighted avg       0.54      0.54      0.54       120



In [4]:
# The model was trained on scaled features, so predict on the scaled test set.
y_pred = model.predict(X_test_scaled)

# Boolean mask of wrong predictions, then the positional (0-based) row numbers
# of those test samples.
misclassified = y_test != y_pred
misclassified_indices = np.flatnonzero(misclassified.to_numpy())

# Pull out the misclassified rows. X_test_scaled is indexed directly by
# position, while the pandas objects need .iloc for positional selection.
X_misclassified = X_test_scaled[misclassified_indices]
y_misclassified = y_test.iloc[misclassified_indices]
X_misclassified_original = X_test.iloc[misclassified_indices]

# Plot only the misclassified samples against the decision boundary.
feature_names = ('Age', 'X2')
visualizer = Visualizer(
    model,
    X_misclassified,
    y_misclassified,
    X_original=X_misclassified_original,
)
visualizer.plot_decision_boundary_interactive(feature_names=feature_names)

In [8]:
# Distance from one misclassified test sample to the decision boundary.
# The third misclassified sample is used as the example; misclassified_indices
# must already be defined by an earlier cell.
sample_position = misclassified_indices[2]
test_point = X_test_scaled[sample_position]

# Show the same sample in its original (unscaled) units for reference.
print(X_test.iloc[sample_position])

distance = trainer.distance_to_hyperplane(test_point)
print("Distance to the decision boundary:", distance)

age    41.798764
X2      1.678000
Name: 2277, dtype: float64
Distance to the decision boundary: 1.515105985032788


In [3]:
# Predict on the scaled test set and locate the wrong predictions by position.
y_pred = model.predict(X_test_scaled)
misclassified = y_test != y_pred
misclassified_indices = np.flatnonzero(misclassified.to_numpy())

# Restrict features and true labels to the misclassified samples.
X_misclassified_scaled = X_test_scaled[misclassified_indices]
y_misclassified = y_test.iloc[misclassified_indices]

# Positions, relative to the misclassified subset, whose true label is class 0.
class_0_misclassified_indices = np.flatnonzero((y_misclassified == 0).to_numpy())

# Scaled feature rows of those class 0 misclassified samples.
X_class_0_misclassified_scaled = X_misclassified_scaled[class_0_misclassified_indices]

# Map the subset-relative positions back to positions in the full test set so
# the printed index refers to X_test_scaled / X_test.
class_0_test_positions = misclassified_indices[class_0_misclassified_indices]

# Distance to the decision boundary for each class 0 misclassified sample.
distances = []
for original_index, sample in zip(class_0_test_positions, X_class_0_misclassified_scaled):
    distance = trainer.distance_to_hyperplane(sample)
    distances.append(distance)
    print(f"Distance for test sample index {original_index}: {distance}")

# Report the mean distance, or say so when there was nothing to average.
if not distances:
    print("No class 0 misclassified points to calculate distance.")
else:
    average_distance = np.mean(distances)
    print(f"Average distance to the decision boundary for class 0 misclassified points: {average_distance}")

Distance for test sample index 1: 1.1181615399546132
Distance for test sample index 2: 2.2596008254440605
Distance for test sample index 6: 0.5125539899801671
Distance for test sample index 7: 1.71245089904298
Distance for test sample index 12: 0.7337416385119664
Distance for test sample index 19: 0.14445693736995197
Distance for test sample index 21: 1.703425414026684
Distance for test sample index 25: 1.0422712229214635
Distance for test sample index 37: 0.883433229886918
Distance for test sample index 45: 0.2863288087411626
Distance for test sample index 46: 0.814778211741922
Distance for test sample index 47: 0.7481851615095548
Distance for test sample index 53: 0.196671751207442
Distance for test sample index 58: 0.6369043492502922
Distance for test sample index 60: 1.1321523560844347
Distance for test sample index 65: 0.7673055598141791
Distance for test sample index 68: 0.5930044291658708
Distance for test sample index 74: 0.21906174726461008
Distance for test sample index 93: 0

In [None]:
# Generate counterfactual explanations using DiCE.
#
# The column names must match the frame handed to the explainer. After
# select_features(['D', 'age', 'X2']) that frame holds the target 'D' and the
# two float features 'age' and 'X2'. The previous names ('A2', 'A14', 'A15')
# are not columns of this dataset and appear to be left over from another one.
continuous_features = ['age', 'X2']
explainer = get_explainer('dice', model=model, data=data_instance.data,
                          continuous_features=continuous_features,
                          outcome_name='D')

# NOTE(review): `model` was fit on scaled features, while data_instance.data
# and the query rows below are unscaled. Whether the explainer applies the
# scaling itself is not visible from this notebook -- confirm before trusting
# the counterfactuals.
# NOTE(review): this assumes the trainer keeps the test split as `X_test`;
# the `X_test` returned by split_data is the known-good alternative.
query_instance = trainer.X_test.iloc[0:5]  # First five test instances
counterfactuals = explainer.generate_explanation(query_instance, total_CFs=5)