In [None]:
import numpy as np
import pandas as pd
#https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction
#https://www.kaggle.com/datasets/alexisbcook/synthetic-credit-card-approval
import os
import kagglehub
import warnings

import os
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Load the credit-card approval CSV from a path relative to the working directory.
df = pd.read_csv("../dataset/credit-card-approval/credit_card_approval.csv")
# Bare last expression: renders the frame as the cell output.
df

In [None]:
# Expose the label column under the name used throughout the notebook.
df = df.rename(columns={'TARGET': 'Approved'})
# Preview the top rows to verify the new column name.
df.head()

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import colorama
from colorama import Fore

# Columns with a text/category dtype are categorical by construction. The
# target is filtered out here too, so the list really matches the
# "excluding 'Approved'" message even when the target is stored as text.
categorical_cols = [
    col
    for col in df.select_dtypes(include=['object', 'category']).columns
    if col != 'Approved'
]

# Numeric columns with only a handful of distinct values (flags, small codes)
# are treated as categorical as well. The cut-off of 10 is an arbitrary heuristic.
for col in df.columns:
    if col == 'Approved' or col in categorical_cols:
        continue
    if df[col].nunique() < 10:
        categorical_cols.append(col)

# Display identified categorical columns
print(f"Identified Categorical Columns (excluding 'Approved'): {categorical_cols}\n")


In [None]:
# Columns whose association with the target is not statistically significant.
bad_predictors = []

# Significance level: a column with p above this value is a "bad predictor".
p_value_threshold = 0.05  # Adjust this if needed

# Chi-squared test of independence between each categorical column and the target.
for col in categorical_cols:
    print(f"{col}:")

    # Contingency table: target classes as rows, the column's levels as columns.
    contingency = pd.crosstab(df['Approved'], df[col]).to_numpy()
    _, p, _, _ = chi2_contingency(contingency, correction=False)

    if p > p_value_threshold:
        print(Fore.RED + f"'{col}' is a 'bad Predictor'")
        bad_predictors.append(col)
    else:
        print(Fore.GREEN + f"'{col}' is a 'Good Predictor'")

    # Reset the colour after the p-value so it does not bleed into the next
    # column's header or into any later output.
    print(f"p_val = {p}\n" + colorama.Style.RESET_ALL)

In [None]:
# Remove the columns flagged by the chi-squared screen.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone, and without it pandas raises KeyError.
df = df.drop(columns=bad_predictors, errors='ignore')

# Report what was removed and show the resulting frame.
print(f"Dropped bad predictors: {bad_predictors}")
df

In [None]:
# Normalise the headers first: trim whitespace around every column name.
df.columns = df.columns.str.strip()

# Partition the columns by dtype: text columns vs. 64-bit numeric columns.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Show both lists exactly as detected.
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# The target is not a feature: take it out of whichever list holds it.
for column_group in (categorical_cols, numerical_cols):
    try:
        column_group.remove('Approved')
    except ValueError:
        pass  # 'Approved' was not in this list

# Show the lists again after the target has been removed.
print("Categorical columns after removal:", categorical_cols)
print("Numerical columns after removal:", numerical_cols)

In [None]:
# Heading and dtype listing, one per line.
print("Data types of the columns:", df.dtypes, sep="\n")

# Heading (preceded by a blank line) and the current categorical column list.
print("\nCategorical columns identified:", categorical_cols, sep="\n")

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# One fitted LabelEncoder per categorical feature, keyed by column name so the
# integer codes can be mapped back later (e.g. with inverse_transform).
label_encoders = {}
for name in categorical_cols:
    if name == 'Approved':
        continue  # never encode the target column
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    label_encoders[name] = encoder

# Report the integer codes now present in each encoded column.
print("Unique values in categorical columns after encoding:")
for name in categorical_cols:
    if name == 'Approved':
        continue
    print(f"{name}: {df[name].unique()}")

print("\nData types of the columns after encoding:")
print(df.dtypes)


In [None]:
# Bare expression: renders the current state of df as the cell output.
df

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 'Approved' is the target column; every other column of df is a feature.

# Numeric feature columns to standardise (the target itself is never scaled).
# NOTE(review): this also covers the label-encoded categorical columns and any
# numeric identifier column still present in df - confirm that is intended.
numerical_cols = [
    col for col in df.select_dtypes(include=np.number).columns if col != 'Approved'
]

# Split BEFORE fitting the scaler, so the test rows do not influence the
# mean/variance used for standardisation (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Approved', axis=1), df['Approved'], test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])

# Apply the training-set statistics to both splits. The copies make sure the
# assignments below write to independent frames rather than slices of df.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Later cells read the full scaled frame through df / X / y, so scale df with
# the same training-set statistics and rebuild X and y from it.
df[numerical_cols] = scaler.transform(df[numerical_cols])
X = df.drop('Approved', axis=1)  # Drop 'Approved' from features
y = df['Approved']  # Target variable

# Print the shapes of the resulting splits
print("Training set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)


In [None]:
# Print how many rows fall into each class of the target column.
print(df['Approved'].value_counts())  # Ensure the target has values like 0/1 or yes/no
# Bare last expression: renders df as the cell output.
df

In [None]:
# Bare expression: renders the training feature frame as the cell output.
X_train

In [None]:
# Optional: shrink the dataset with a stratified 10% sample before training.
# X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.1, stratify=y, random_state=42)

# Store both feature splits as 32-bit floats.
X_train, X_test = (split.astype(np.float32) for split in (X_train, X_test))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_auc_score, f1_score

# Shuffled 5-fold CV with a fixed seed, so every model is scored on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate models, each with explicit regularisation / capacity limits.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42, ccp_alpha=0.01, max_depth=6, min_samples_split=20),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=50, max_depth=6, n_jobs=-1, min_samples_split=10, max_features='sqrt'),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000, C=0.5, n_jobs=-1),  # Added regularization with C=0.5
    # tree_method='hist' runs on any machine. The previous 'gpu_hist' needs a
    # CUDA-enabled XGBoost build and is deprecated in recent releases.
    # NOTE(review): to train on a GPU with XGBoost >= 2.0, add device='cuda'.
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss', tree_method='hist', n_estimators=50, max_depth=6, subsample=0.8, colsample_bytree=0.8, reg_lambda=10)  # Added L2 regularization with reg_lambda=10
}

# Metrics computed in a single cross-validation pass per model.
# NOTE(review): for 0/1 labels MAE is just the misclassification rate
# (1 - accuracy), so it adds no information beyond accuracy.
cv_scoring = ['accuracy', 'f1', 'roc_auc', 'neg_mean_absolute_error']

# Dictionary to store evaluation results
results = {}


def _format_metric(value):
    """Format a metric to four decimals; a missing metric (None) is shown as 'n/a'."""
    return "n/a" if value is None else f"{value:.4f}"


for model_name, model in models.items():
    # One cross_validate call fits each fold once and scores all metrics,
    # instead of refitting the folds separately for every metric.
    cv_scores = cross_validate(model, X_train, y_train, cv=kf, scoring=cv_scoring)
    cv_accuracy = cv_scores['test_accuracy'].mean()
    cv_f1 = cv_scores['test_f1'].mean()
    cv_auc = cv_scores['test_roc_auc'].mean()
    cv_mae = -cv_scores['test_neg_mean_absolute_error'].mean()  # Convert negative MAE to positive

    # Train the model on the full training data
    model.fit(X_train, y_train)

    # Hard predictions and, when available, positive-class probabilities for AUC.
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Evaluate the model on the test set
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None else None

    # A perfect test metric is treated as a sign of overfitting; skip such models.
    if test_accuracy == 1.0 or test_auc == 1.0 or test_f1 == 1.0 or test_mae == 0.0:
        print(f"{model_name} is overfitting with a metric of 1.0, skipping this model.")
        continue

    # Store the evaluation results
    results[model_name] = {
        "CV Accuracy": cv_accuracy,
        "CV AUC": cv_auc,
        "CV MAE": cv_mae,
        "CV F1": cv_f1,
        "Test Accuracy": test_accuracy,
        "Test AUC": test_auc,
        "Test MAE": test_mae,
        "Test F1": test_f1
    }

# Display the results
print("\nModel Evaluation Results:")
for model_name, metrics in results.items():
    print(f"\n{model_name} Evaluation:")
    for metric, value in metrics.items():
        print(f"{metric}: {_format_metric(value)}")

# Also drop any model whose stored metrics (CV ones included) contain an exact 1.0.
filtered_results = {model: metrics for model, metrics in results.items()
                    if all(metric != 1.0 for metric in metrics.values())}

# If no models remain after filtering, display a message
if not filtered_results:
    print("\nNo models remain after filtering out overfitting models with metrics of 1.0.")
else:
    # Rank by Accuracy + AUC + F1 - MAE on the test set; a missing AUC counts as 0.
    # NOTE(review): choosing the model on test-set metrics makes the reported
    # test scores optimistic; ranking on the CV metrics would avoid that.
    best_model = max(filtered_results, key=lambda x: (
        filtered_results[x]["Test Accuracy"] +
        (filtered_results[x]["Test AUC"] or 0.0) +
        filtered_results[x]["Test F1"] -
        filtered_results[x]["Test MAE"]
    ))  # Higher Accuracy, AUC, F1 and lower MAE are preferred

    print(f"\nBest Model: {best_model}")
    print(f"Best Model Metrics: {filtered_results[best_model]}")

    # Rebind best_model from the winning name to the fitted estimator itself.
    best_model_name = best_model
    best_model = models[best_model_name]  # This is the trained best model


In [None]:
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import pandas as pd
import warnings

# Suppress FutureWarning noise emitted while LIME runs (optional)
warnings.filterwarnings("ignore", category=FutureWarning)

# Build the explainer once from the training features.
# random_state makes LIME's random perturbation sampling, and therefore the
# explanation, reproducible between runs.
lime_explainer = LimeTabularExplainer(
    training_data=X_train.to_numpy(),
    feature_names=list(X_train.columns),
    class_names=['Not Approved', 'Approved'],  # Adjust according to your dataset
    mode='classification',
    random_state=42,
)

# Pick a single instance from the test set to explain for testing
i = 0  # Index of the instance to explain

# Pass the row as a plain 1-D array: LIME indexes the row by integer position,
# which is deprecated for a label-indexed pandas Series.
exp = lime_explainer.explain_instance(X_test.iloc[i].to_numpy(), best_model.predict_proba)

# Display the explanation for the instance
exp.show_in_notebook(show_table=True)


In [None]:
import re

# Suppress specific warnings from LIME (optional): ignore every FutureWarning from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

def _looks_numeric(text):
    """Return True when `text` parses as a float (i.e. it is a bound, not a name)."""
    try:
        float(text)
    except ValueError:
        return False
    return True


def extract_feature_name(feature_str):
    """Return the feature name embedded in a LIME condition string.

    Handles the condition shapes such as ``"name <= 0.5"``, ``"name > 0.5"``,
    ``"0.1 < name <= 0.5"`` and ``"name=value"``. A string without any
    comparison operator is returned as-is (whitespace-normalised).

    The name is returned verbatim, so underscores and digits that belong to
    the column name (e.g. ``AMT_INCOME_TOTAL``, ``Feature2``) are preserved;
    only surrounding whitespace is trimmed and inner runs of whitespace are
    collapsed to a single space.

    Parameters:
        feature_str: condition text, e.g. an item from ``exp.as_list()``.

    Returns:
        The feature name as a string (empty if ``feature_str`` is empty).
    """
    # Two-character operators come first in the alternation so that "<=" is
    # never split as "<" followed by a stray "=".
    parts = [part.strip() for part in re.split(r'<=|>=|!=|<|>|=', feature_str)]

    # "low < name <= high": the first piece is a numeric bound and the name is
    # the middle piece. In every other shape the name is the left-hand side.
    if len(parts) >= 3 and _looks_numeric(parts[0]):
        feature_name = parts[1]
    else:
        feature_name = parts[0]

    # Replace multiple spaces with a single space
    return re.sub(r'\s+', ' ', feature_name).strip()

# Explainer over the entire feature dataset `X`.
# random_state makes LIME's random perturbation sampling reproducible.
lime_explainer = LimeTabularExplainer(
    training_data=X.to_numpy(),
    feature_names=list(X.columns),
    class_names=['Not Approved', 'Approved'],  # Adjust according to your dataset
    mode='classification',
    random_state=42,
)

# One record (dict) per explained row; turned into a DataFrame afterwards.
explanations = []

# Predict all rows in one batch instead of calling the model once per row.
predictions = best_model.predict(X.to_numpy())

# NOTE(review): LIME fits a local surrogate model for every row, so this loop
# is slow on large datasets - consider explaining a sample of X instead.
for (idx, instance), prediction in zip(X.iterrows(), predictions):
    # Get the actual label from `y`
    actual_label = y.loc[idx]

    # Pass the row as a plain 1-D array: LIME indexes it by integer position,
    # which is deprecated for a label-indexed pandas Series.
    exp = lime_explainer.explain_instance(
        data_row=instance.to_numpy(),
        predict_fn=best_model.predict_proba,
        num_features=len(X.columns)  # Consider all features
    )
    # (condition text, weight) pairs; as_list() reports weights for label 1
    # by default, i.e. the 'Approved' class here.
    explanation_list = exp.as_list()

    # Determine the decision (Approved or Denied)
    decision = 'Approved' if prediction == 1 else 'Denied'

    # Keep only the features that push towards the predicted class: positive
    # weights for an approval, negative weights for a denial.
    features_contributing = []
    for feature, weight in explanation_list:
        if (prediction == 1 and weight > 0) or (prediction == 0 and weight < 0):
            feature_name = extract_feature_name(feature)
            if feature_name and feature_name not in features_contributing:
                features_contributing.append(feature_name)

    # Construct the explanation text
    if features_contributing:
        explanation_text = f"This application was {decision.lower()} due to " + ", ".join(features_contributing) + "."
    else:
        explanation_text = f"This application was {decision.lower()}."

    # Append the explanation and related information
    explanations.append({
        'Index': idx,
        'Prediction': decision,
        'Actual': 'Approved' if actual_label == 1 else 'Denied',
        'Explanation': explanation_text
    })


In [None]:
# Tabulate the collected explanation records, one row per explained instance.
explanations_df = pd.DataFrame(explanations)

# Take the first record and print each of its fields on its own line.
first_explanation = explanations_df.iloc[0]
for field in ("Index", "Prediction", "Actual", "Explanation"):
    print(f"{field}:", first_explanation[field])


In [None]:
# Bare expression: renders the explanations table as the cell output.
explanations_df

In [None]:
# Reload the raw CSV: the in-memory df was modified by the earlier cells
# (columns dropped, categoricals label-encoded, numeric features scaled).

df = pd.read_csv("../dataset/credit-card-approval/credit_card_approval.csv")
# Bare last expression: renders the reloaded frame as the cell output.
df

In [None]:
# Attach each row's generated explanation text as a new 'Reason' column
# (pandas aligns the values on the row index).
df = df.assign(Reason=explanations_df['Explanation'])

# Write the augmented frame to CSV without the index column.
df.to_csv('../dataset/target-augmented.csv', index=False)

In [None]:
# Bare expression: renders the frame with the added 'Reason' column as the cell output.
df