In [2]:
import sys  
sys.path.insert(1, '/Users/asifahmed/Documents/Codes/MyRecourseProject')

from models.model_trainer import ModelTrainer
from evaluation.evaluator import Evaluator
from visualization.visualizer import Visualizer
from data_handling.crdit_data import Credit
from explainability.explainer_factory import get_explainer
import pandas as pd
import numpy as np
from data_handling.dataset import Dataset

# Build the working dataset; 'D' is the target (label) column.
data_instance = Dataset(target_column='D')
# NOTE(review): absolute, machine-specific path -- this cell only runs where this file exists.
data_instance.load_csv('/Users/asifahmed/Documents/Codes/MyRecourseProject/synthetic_data/out/R1.csv')
# The return values of the calls below are discarded, so each one presumably
# updates data_instance in place; keep them in this order.
data_instance.encode_categorical_columns()
data_instance.remove_outliers()
# presumably 300 rows per class (the recorded report shows a 120-row test set
# split 60/60) -- TODO confirm against Dataset.balanced_sample
data_instance.balanced_sample(300)
# Keep only the target and the two predictors used for the 2-D analysis below.
data_instance.select_features(['D', 'age', 'X2'])

# Split, scale, and fit a logistic regression on the scaled training features.
# trainer, model, X_test, X_test_scaled and y_test are reused by later cells.
trainer = ModelTrainer()
X_train, X_test, y_train, y_test = trainer.split_data(data_instance.data, target_column='D')
X_train_scaled, X_test_scaled = trainer.scale_features(X_train, X_test)
model = trainer.train('logistic_regression', X_train_scaled, y_train)
# Evaluate on the scaled test split and print the metric report.
evaluator = Evaluator(model, X_test_scaled, y_test)
evaluator.report()

# Disabled: decision-boundary plot over the full test set.
# feature_names = ('Age', 'X2')
# visualizer = Visualizer(model, X_test_scaled, y_test, X_original=X_test)
# visualizer.plot_decision_boundary_interactive(feature_names=feature_names)

CSV file loaded successfully with delimiter: ','
Removed 70 outliers. New dataset size: 4930
Selected features are now active: ['D', 'age', 'X2']
Data split into train and test sets.
logistic_regression model trained successfully.
Accuracy: 0.5083333333333333
Precision: 0.5079365079365079
Recall: 0.5333333333333333
F1 Score: 0.5203252032520326
Confusion Matrix:
 [[29 31]
 [28 32]]
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.48      0.50        60
           1       0.51      0.53      0.52        60

    accuracy                           0.51       120
   macro avg       0.51      0.51      0.51       120
weighted avg       0.51      0.51      0.51       120



In [3]:
# Predict on the scaled test features (the model was trained on scaled inputs).
y_pred = model.predict(X_test_scaled)

# Boolean mask of wrong predictions, then the positions where the mask is True.
misclassified = y_test != y_pred
misclassified_indices = np.flatnonzero(misclassified)

# Misclassified subset: scaled features (array indexing) and true labels (.iloc).
X_misclassified = X_test_scaled[misclassified_indices]
y_misclassified = y_test.iloc[misclassified_indices]

# Interactive decision-boundary plot restricted to the misclassified samples;
# the unscaled rows are passed alongside as X_original.
feature_names = ('Age', 'X2')
visualizer = Visualizer(
    model,
    X_misclassified,
    y_misclassified,
    X_original=X_test.iloc[misclassified_indices],
)
visualizer.plot_decision_boundary_interactive(feature_names=feature_names)

In [4]:
# Distance to the decision boundary for one example: the third misclassified sample.
example_position = misclassified_indices[2]
test_point = X_test_scaled[example_position]
# Show the unscaled feature values of that sample before reporting its distance.
print(X_test.iloc[example_position])
distance = trainer.distance_to_hyperplane(test_point)
print("Distance to the decision boundary:", distance)

age    42.588712
X2      0.155000
Name: 4553, dtype: float64
Distance to the decision boundary: 0.5301639610992255


In [5]:
# Recompute the predictions and the positions of the misclassified test samples.
y_pred = model.predict(X_test_scaled)
misclassified = y_test != y_pred
misclassified_indices = np.where(misclassified)[0]

# Scaled features and true labels of the misclassified samples.
X_misclassified_scaled = X_test_scaled[misclassified_indices]
y_misclassified = y_test.iloc[misclassified_indices]

# Positions, relative to the misclassified subset, whose true label is class 0,
# and the matching rows of the scaled misclassified features.
class_0_misclassified_indices = np.where(y_misclassified == 0)[0]
X_class_0_misclassified_scaled = X_misclassified_scaled[class_0_misclassified_indices]

# Map the relative positions back to test-set positions and measure each
# sample's distance to the decision boundary.
distances = []
for original_index in misclassified_indices[class_0_misclassified_indices]:
    distance = trainer.distance_to_hyperplane(X_test_scaled[original_index])
    distances.append(distance)
    print(f"Distance for test sample index {original_index}: {distance}")

# Report the mean distance, or say so when there was nothing to average.
if not distances:
    print("No class 0 misclassified points to calculate distance.")
else:
    average_distance = np.mean(distances)
    print(f"Average distance to the decision boundary for class 0 misclassified points: {average_distance}")

Distance for test sample index 0: 1.2035318959077668
Distance for test sample index 2: 0.12542107097379307
Distance for test sample index 6: 1.0041188127997596
Distance for test sample index 8: 0.9500465585516884
Distance for test sample index 10: 0.40509591598488587
Distance for test sample index 12: 1.1338949570445633
Distance for test sample index 13: 0.19839332571624924
Distance for test sample index 16: 0.9774921155992353
Distance for test sample index 18: 0.23982325269928126
Distance for test sample index 22: 1.9043002202008847
Distance for test sample index 23: 1.626073793411014
Distance for test sample index 24: 0.6829724349681369
Distance for test sample index 26: 0.20231545815133387
Distance for test sample index 28: 0.7640102509057096
Distance for test sample index 31: 0.6499383760773305
Distance for test sample index 32: 0.18207744397343953
Distance for test sample index 33: 0.6408686076070573
Distance for test sample index 34: 0.9886483515756695
Distance for test sample in

In [None]:
# Generate counterfactual explanations with DiCE for the first five test rows.
# The feature and outcome names must be columns of data_instance.data, which was
# reduced to ['D', 'age', 'X2'] when the dataset was prepared; the previous
# names ('A2', 'A14', 'A15') are not columns of that data.
continuous_features = ['age', 'X2']
explainer = get_explainer('dice', model=model, data=data_instance.data,
                          continuous_features=continuous_features,
                          outcome_name='D')
# NOTE(review): the model was trained on scaled features -- confirm that the
# explainer scales these query rows before asking the model for predictions.
query_instance = trainer.X_test.iloc[0:5]  # first five test rows
counterfactuals = explainer.generate_explanation(query_instance, total_CFs=5)

In [None]:

import sys  
sys.path.insert(1, '/Users/asifahmed/Documents/Codes/MyRecourseProject')

import pandas as pd
import itertools
from models.model_trainer import ModelTrainer
from data_handling.dataset import Dataset
from evaluation.evaluator import Evaluator

def is_numerical(data, column, target_column, threshold=20):
    """
    Decide whether `column` should be treated as a numerical feature.

    A column qualifies when it holds more than `threshold` distinct values
    and is not the target column.
    """
    distinct_values = data[column].nunique()
    if not distinct_values > threshold:
        return False
    return column != target_column


def automated_evaluation(file_path, target_column, model_type='svm', threshold=0.6, sample_size=300,
                         output_path='evaluation_results.txt'):
    """
    Train and evaluate one model per pair of numerical features and record the pairs that pass.

    For every 2-combination of the columns accepted by `is_numerical`, the CSV is
    reloaded, encoded, balanced-sampled and reduced to the two features plus the
    target; a model of `model_type` is trained on the scaled training split and
    evaluated on the scaled test split. Pairs whose accuracy reaches `threshold`
    are printed (in green), reported in full, and written to `output_path`.

    Parameters:
        file_path: Path of the CSV file to evaluate; its last '/'-separated
            component is used as the dataset name in the results.
        target_column: Name of the label column.
        model_type: Model identifier passed to `ModelTrainer.train`.
        threshold: Minimum accuracy a feature pair needs in order to pass.
        sample_size: Value passed to `Dataset.balanced_sample`.
        output_path: Text file that receives the passed pairs; it is overwritten
            on every call. Defaults to 'evaluation_results.txt'.

    Returns:
        None. Results are printed and written to `output_path`.
    """
    trainer = ModelTrainer()
    dataset_name = file_path.split("/")[-1]
    passed_pairs = []

    # Load the data once up front only to decide which columns count as numerical.
    # (Outlier removal is intentionally left disabled in this workflow.)
    original_data_instance = Dataset(target_column=target_column)
    original_data_instance.load_csv(file_path=file_path)
    original_data_instance.encode_categorical_columns()
    original_data_instance.balanced_sample(sample_size)

    numerical_columns = [
        col for col in original_data_instance.data.columns
        if is_numerical(original_data_instance.data, col, target_column)
    ]

    for feature1, feature2 in itertools.combinations(numerical_columns, 2):
        # Start from a freshly loaded dataset for each pair before narrowing it
        # down to the two features and the target.
        data_instance = Dataset(target_column=target_column)
        data_instance.load_csv(file_path)
        data_instance.encode_categorical_columns()
        data_instance.balanced_sample(sample_size)
        data_instance.select_features([feature1, feature2, target_column])

        X_train, X_test, y_train, y_test = trainer.split_data(data_instance.data, target_column=target_column)

        # Nothing to train or evaluate on: skip this pair silently.
        if X_train.empty or X_test.empty:
            continue

        X_train_scaled, X_test_scaled = trainer.scale_features(X_train, X_test)
        model = trainer.train(model_type, X_train_scaled, y_train)

        evaluator = Evaluator(model, X_test_scaled, y_test)
        metrics = evaluator.get_evaluation_metrics()
        accuracy = metrics['Accuracy']

        # Compare against None explicitly so that a legitimate accuracy of 0.0
        # is not mistaken for a missing value.
        if accuracy is not None and accuracy >= threshold:
            print(f"\033[1;32mPassed: {feature1}, {feature2} with accuracy {accuracy:.2f}\033[0m")
            evaluator.report()
            passed_pairs.append({
                'Dataset': dataset_name,
                'Feature1': feature1,
                'Feature2': feature2,
                'Accuracy': accuracy,
                'Precision': metrics['Precision'],
                'Recall': metrics['Recall'],
                'F1 Score': metrics['F1 Score'],
                'Confusion Matrix': metrics['Confusion Matrix'].tolist(),
                'Classification Report': metrics['Classification Report'],
            })
        else:
            # A missing accuracy cannot be formatted with ':.2f'; show a placeholder
            # instead of raising TypeError in the middle of the sweep.
            accuracy_text = "N/A" if accuracy is None else f"{accuracy:.2f}"
            print(f"Failed: {feature1}, {feature2} with accuracy {accuracy_text}")

        # Visual separator between feature pairs.
        print("\n" + "-" * 50 + "\n")

    # Persist the passed pairs as a plain-text report.
    print(f"Saving passed pairs information to '{output_path}'...")
    with open(output_path, 'w', encoding='utf-8') as file:
        for pair in passed_pairs:
            file.write(f"Dataset: {pair['Dataset']}\n")
            file.write(f"Features: {pair['Feature1']}, {pair['Feature2']}\n")
            file.write(f"Accuracy: {pair['Accuracy']:.2f}\n")
            file.write(f"Precision: {pair['Precision']:.2f}\n")
            file.write(f"Recall: {pair['Recall']:.2f}\n")
            file.write(f"F1 Score: {pair['F1 Score']:.2f}\n")
            file.write(f"Confusion Matrix: {pair['Confusion Matrix']}\n")
            file.write("Classification Report:\n")
            file.write(f"{pair['Classification Report']}\n")
            file.write("-" * 50 + "\n")

# Sweep every numerical feature pair of the processed credit dataset, keeping
# the pairs whose accuracy reaches 0.65 on a balanced sample.
automated_evaluation(
    file_path='/Users/asifahmed/Documents/Codes/MyRecourseProject/datasets/processed/credit_processed.csv',
    target_column='NoDefaultNextMonth',
    threshold=0.65,
    sample_size=1000,
)