In [1]:
import numpy as np
import pandas as pd
import warnings

# Data source: https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction

# Silence every warning for the rest of the notebook session
warnings.filterwarnings("ignore")

# When True, the training cell tries to load a previously pickled model instead of training
skip_training = False

# Credit history records (read from credit_record.csv)
record = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/credit_record.csv")

# Applicant records (read from application_record.csv)
data = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/application_record.csv")

In [2]:
# Find the first account open month for each user: for every ID keep the row of
# `record` holding the smallest MONTHS_BALANCE. The selected rows keep all of
# `record`'s columns, so STATUS travels along into the merge below.
begin_month = record.loc[record.groupby("ID")["MONTHS_BALANCE"].idxmin()]
begin_month = begin_month.rename(columns={"MONTHS_BALANCE": "begin_month"})

# Merge the datasets (left join: every row of `data` is kept, matched or not)
# NOTE(review): `df` is rebuilt from `data` in the approval-status cell further down,
# so the begin_month / STATUS columns created here do not reach the later steps.
df = pd.merge(data, begin_month, how="left", on="ID")
print("Datasets loaded and merged successfully.")
df.head()

Datasets loaded and merged successfully.


Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,STATUS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,-15.0,X
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,-14.0,X
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474.0,-1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,-29.0,X
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110.0,-3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,-4.0,X
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110.0,-3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,-26.0,X


In [3]:
# Approval rule applied to a single credit-record row
def determine_approval(row):
    """Return 1 (approved) when the row's STATUS is one of "0", "1", "C" or "X", else 0.

    Any other value, including a missing one, results in 0 (denied).
    `row` only needs to support `row["STATUS"]`.
    """
    accepted_statuses = ("0", "1", "C", "X")  # statuses this notebook treats as good credit
    return 1 if row["STATUS"] in accepted_statuses else 0


In [4]:
# Build the per-applicant 'Approved' label from the credit records.

# Missing STATUS values are filled with "X" first, so they count as a good status below
record["STATUS"] = record["STATUS"].fillna("X")  # Handle missing values

# Vectorized equivalent of applying determine_approval row by row (same status list,
# same 0/1 result) without a Python-level call per record.
record["Approved"] = record["STATUS"].isin(["0", "1", "C", "X"]).astype(int)

# Aggregate approval status for each ID: min() yields 0 as soon as any record of that ID is 0
approval_status = record.groupby("ID")["Approved"].min().reset_index()

# Merge approval status back into the main dataset, avoiding "_x" and "_y" columns
df = pd.merge(data, approval_status, how="left", on="ID")
# Applicants without any credit record get NaN from the left join and are labelled 0 (denied)
# NOTE(review): this mixes "no credit history" with "bad credit history" in class 0 — confirm intended.
df["Approved"] = df["Approved"].fillna(0).astype(int)  # Fill missing approvals as denial
print("Approval status merged successfully.")

# Display the head of the resulting DataFrame
df.head()

Approval status merged successfully.


Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,Approved
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,1
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005.0,-4542.0,1.0,1.0,0.0,0.0,,2.0,1
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474.0,-1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110.0,-3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110.0,-3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1


In [5]:
# DAYS_BIRTH: negate, floor-divide by 365 to get whole years, missing values become 0
age_in_years = (-df['DAYS_BIRTH'] // 365).fillna(0)
df['DAYS_BIRTH'] = age_in_years.astype(int)

# The applicant identifier is not used as a feature
df = df.drop(columns=['ID'])

# DAYS_EMPLOYED: negative values are turned into their magnitude; everything else
# (zero, positive or missing) becomes 0
days_employed = df['DAYS_EMPLOYED']
df['DAYS_EMPLOYED'] = days_employed.where(days_employed < 0, 0).abs()

# Numerical clean-up: infinities -> NaN, then NaN -> per-column median
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
finite_numeric = df[numerical_cols].replace([np.inf, -np.inf], np.nan)
df[numerical_cols] = finite_numeric.fillna(finite_numeric.median())

# Show a sample of the processed dataset
print("Preprocessing completed successfully.")
df.head()

Preprocessing completed successfully.


Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,Approved
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1


In [6]:
# Define the feature mapping dictionary: raw dataset column name -> display name
# used by every later cell (and therefore in the saved CSVs and LIME explanations).
# NOTE(review): 'DAYS_BIRTH' was floor-divided by 365 in the preprocessing cell, so the
# column labelled 'Age (Days)' actually holds years — consider renaming once consumers are checked.
# NOTE(review): 'STATUS' is not a column of `df` at this point (see the frame displayed above),
# so that entry has no effect here.
feature_mapping = {
    'CODE_GENDER': 'Gender',
    'FLAG_OWN_CAR': 'Car Ownership',
    'FLAG_OWN_REALTY': 'Property Ownership',
    'CNT_CHILDREN': 'Number of Children',
    'AMT_INCOME_TOTAL': 'Annual Income',
    'NAME_INCOME_TYPE': 'Income Category',
    'NAME_EDUCATION_TYPE': 'Education Level',
    'NAME_FAMILY_STATUS': 'Marital Status',
    'NAME_HOUSING_TYPE': 'Housing Type',
    'DAYS_BIRTH': 'Age (Days)',
    'DAYS_EMPLOYED': 'Employment Duration (Days)',
    'FLAG_MOBIL': 'Mobile Phone',
    'FLAG_WORK_PHONE': 'Work Phone',
    'FLAG_PHONE': 'Phone',
    'FLAG_EMAIL': 'Email',
    'OCCUPATION_TYPE': 'Occupation',
    'CNT_FAM_MEMBERS': 'Family Size',
    'STATUS': 'Credit Status'
}

# Rename the columns in the DataFrame using the mapping (modifies `df` in place)
df.rename(columns=feature_mapping, inplace=True)

# Display the first few rows to confirm the changes
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1


In [7]:
# Rename a 'TARGET' column to 'Approved', if such a column exists.
# NOTE(review): the frame displayed above already has 'Approved' and no 'TARGET' column,
# so this looks like a leftover no-op — confirm and consider removing the cell.
df.rename(columns={'TARGET': 'Approved'}, inplace=True)
# Display the first few rows to confirm the change
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1


In [8]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import colorama
from colorama import Fore

# Columns whose dtype is object/category are categorical by definition
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric columns with fewer than 10 distinct values are treated as categorical too
# (arbitrary threshold); the target column 'Approved' is never included
low_cardinality_cols = [
    name
    for name in df.columns
    if name != 'Approved'
    and name not in categorical_cols
    and df[name].nunique() < 10
]
categorical_cols.extend(low_cardinality_cols)

# Report the final list
print(f"Identified Categorical Columns (excluding 'Approved'): {categorical_cols}\n")

Identified Categorical Columns (excluding 'Approved'): ['Gender', 'Car Ownership', 'Property Ownership', 'Income Category', 'Education Level', 'Marital Status', 'Housing Type', 'Occupation', 'Number of Children', 'Mobile Phone', 'Work Phone', 'Phone', 'Email', 'Family Size']



In [9]:
# Chi-squared independence test of every categorical column against 'Approved'.
# Columns whose p-value exceeds the threshold are collected in `bad_predictors`.

# Initialize a list to store bad predictors
bad_predictors = []

# You can change the threshold for what is considered a "bad predictor"
p_value_threshold = 0.05  # Adjust this if needed

# Evaluate each categorical column using chi-squared test
for col in categorical_cols:
    print(f"{col}:")

    # Contingency table: rows = Approved classes, columns = categories of `col`
    contingency = pd.crosstab(df['Approved'], df[col]).to_numpy()
    _stat, p, _dof, _ = chi2_contingency(contingency, correction=False)

    # Fore.RESET is appended so the colour applies to the verdict line only; without it
    # the escape code stays active and tints the p-value and all following output.
    if p > p_value_threshold:
        print(Fore.RED + f"'{col}' is a 'bad Predictor'" + Fore.RESET)
        bad_predictors.append(col)  # Append the bad predictor to the list
    else:
        print(Fore.GREEN + f"'{col}' is a 'Good Predictor'" + Fore.RESET)
    print(f"p_val = {p}\n")

Gender:
[31m'Gender' is a 'bad Predictor'
p_val = 0.27806351415702424

Car Ownership:
[32m'Car Ownership' is a 'Good Predictor'
p_val = 1.2219580877774481e-06

Property Ownership:
[32m'Property Ownership' is a 'Good Predictor'
p_val = 2.0418876555564286e-41

Income Category:
[32m'Income Category' is a 'Good Predictor'
p_val = 1.2201307680380188e-05

Education Level:
[32m'Education Level' is a 'Good Predictor'
p_val = 0.00027734213668362186

Marital Status:
[32m'Marital Status' is a 'Good Predictor'
p_val = 0.00010079401332765777

Housing Type:
[32m'Housing Type' is a 'Good Predictor'
p_val = 5.216661799214813e-19

Occupation:
[32m'Occupation' is a 'Good Predictor'
p_val = 4.555098330664581e-16

Number of Children:
[32m'Number of Children' is a 'Good Predictor'
p_val = 1.5661798779060523e-15

Mobile Phone:
[31m'Mobile Phone' is a 'bad Predictor'
p_val = 1.0

Work Phone:
[32m'Work Phone' is a 'Good Predictor'
p_val = 3.2568192361922306e-14

Phone:
[32m'Phone' is a 'Good Predi

In [10]:
# Remove every column flagged by the chi-squared screening
df = df.drop(columns=bad_predictors)

# Report what was removed and show the resulting frame
print(f"Dropped bad predictors: {bad_predictors}")
df

Dropped bad predictors: ['Gender', 'Mobile Phone']


Unnamed: 0,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Work Phone,Phone,Email,Occupation,Family Size,Approved
0,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,0.0,0.0,,2.0,1
1,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,0.0,0.0,,2.0,1
2,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,0.0,0.0,0.0,Security staff,2.0,1
3,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,0.0,1.0,1.0,Sales staff,1.0,1
4,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,0.0,1.0,1.0,Sales staff,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8444,Y,Y,1,135000.0,Working,Secondary / secondary special,Married,House / apartment,55,5375.0,0.0,0.0,0.0,Managers,3.0,0
8445,N,Y,0,103500.0,Pensioner,Secondary / secondary special,Widow,House / apartment,65,0.0,0.0,0.0,0.0,,1.0,0
8446,N,Y,0,103500.0,Pensioner,Secondary / secondary special,Widow,House / apartment,65,0.0,0.0,0.0,0.0,,1.0,0
8447,N,Y,0,103500.0,Pensioner,Secondary / secondary special,Widow,House / apartment,65,0.0,0.0,0.0,0.0,,1.0,0


In [11]:
# Keep an independent copy of the frame as it is now; the label-encoding cell below
# overwrites the categorical columns of `df` in place.
original_df = df.copy()
original_df.head()

Unnamed: 0,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Work Phone,Phone,Email,Occupation,Family Size,Approved
0,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,0.0,0.0,,2.0,1
1,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,0.0,0.0,,2.0,1
2,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,0.0,0.0,0.0,Security staff,2.0,1
3,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,0.0,1.0,1.0,Sales staff,1.0,1
4,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,0.0,1.0,1.0,Sales staff,1.0,1


In [12]:
# Normalise column names: drop leading/trailing whitespace
df.columns = df.columns.str.strip()

# Split columns by dtype: object -> categorical, int64/float64 -> numerical
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Show both groups before the target is taken out
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# The target must not appear in either feature list
for column_group in (categorical_cols, numerical_cols):
    if 'Approved' in column_group:
        column_group.remove('Approved')

# Show both groups again after the removal
print("Categorical columns after removal:", categorical_cols)
print("Numerical columns after removal:", numerical_cols)

Categorical columns: ['Car Ownership', 'Property Ownership', 'Income Category', 'Education Level', 'Marital Status', 'Housing Type', 'Occupation']
Numerical columns: ['Number of Children', 'Annual Income', 'Age (Days)', 'Employment Duration (Days)', 'Work Phone', 'Phone', 'Email', 'Family Size', 'Approved']
Categorical columns after removal: ['Car Ownership', 'Property Ownership', 'Income Category', 'Education Level', 'Marital Status', 'Housing Type', 'Occupation']
Numerical columns after removal: ['Number of Children', 'Annual Income', 'Age (Days)', 'Employment Duration (Days)', 'Work Phone', 'Phone', 'Email', 'Family Size']


In [13]:
# Report the dtype of every column, then the categorical column list, each under its own heading
print("Data types of the columns:", df.dtypes, sep="\n")

print("\nCategorical columns identified:", categorical_cols, sep="\n")

Data types of the columns:
Car Ownership                  object
Property Ownership             object
Number of Children              int64
Annual Income                 float64
Income Category                object
Education Level                object
Marital Status                 object
Housing Type                   object
Age (Days)                      int64
Employment Duration (Days)    float64
Work Phone                    float64
Phone                         float64
Email                         float64
Occupation                     object
Family Size                   float64
Approved                        int64
dtype: object

Categorical columns identified:
['Car Ownership', 'Property Ownership', 'Income Category', 'Education Level', 'Marital Status', 'Housing Type', 'Occupation']


In [14]:
# Preview the pre-encoding snapshot taken earlier.
# NOTE(review): the recorded output of this cell mentions `original1_df`, a name that does
# not occur in the code — the output is stale; re-run the cell to refresh it.
original_df.head()

NameError: name 'original1_df' is not defined

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Integer-encode every categorical feature with its own LabelEncoder.
# The fitted encoders are kept in `label_encoders` (column name -> encoder) so the
# mapping can be reversed later (e.g. inverse_transform).
# NOTE(review): 'Occupation' shows missing values in the previews above; how LabelEncoder
# treats them depends on the installed scikit-learn version — confirm.
label_encoders = {}
encodable_cols = [name for name in categorical_cols if name != 'Approved']  # never encode the target

for name in encodable_cols:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    label_encoders[name] = encoder

print("Unique values in categorical columns after encoding:")
for name in encodable_cols:
    print(f"{name}: {df[name].unique()}")

print("\nData types of the columns after encoding:")
print(df.dtypes)


In [None]:
# Inspect the frame after the label-encoding cell has run
df.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split into train/test first, then standardise using statistics of the TRAINING rows only.
# Fitting the scaler on all rows before splitting lets test-set means/variances leak into
# the training features and makes the test metrics optimistic.

# Select numerical features (excluding 'Approved'); after label encoding this covers
# the encoded categorical columns as well
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

# Remove 'Approved' from the list of columns to scale
if 'Approved' in numerical_cols:
    numerical_cols.remove('Approved')
train_df = df.copy()

# Target variable
y = train_df['Approved']

# Split the data into training and test sets (same seed and size as before, so the
# row partition is unchanged)
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop('Approved', axis=1), y, test_size=0.2, random_state=42
)

# Work on explicit copies so the column assignments below never write into `train_df`
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then apply the same transform everywhere
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Full scaled frame / feature matrix (used later for LIME explanations over all rows),
# produced with the train-fitted scaler so it is consistent with what the model saw
train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])
X = train_df.drop('Approved', axis=1)  # Drop 'Approved' from features

# Print the shapes of the resulting splits
print("Training set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)
print(train_df['Approved'].value_counts())  # Ensure the target has values like 0/1 or yes/no
train_df.head()

In [None]:
# Class counts of the target, followed by a preview of `df`
# (`df` itself is not scaled; scaling is applied to the `train_df` copy)
print(df['Approved'].value_counts())  # Ensure the target has values like 0/1 or yes/no
df.head()

In [None]:
# Preview the training feature matrix
X_train.head()

In [None]:
# Preview the held-out test feature matrix
X_test.head()

In [None]:
# Optional: shrink the dataset with a stratified sample
# X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.1, stratify=y, random_state=42)

# Cast both feature splits to float32 and report their shapes
X_train, X_test = (split.astype(np.float32) for split in (X_train, X_test))
for split in (X_train, X_test):
    print(split.shape)

In [None]:
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_auc_score, f1_score

# 5-fold cross-validation splitter, shuffled with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Folder that receives the pickled best model (created if missing)
model_save_dir = "/opt/notebooks/Chatbot-Credit-Card/backend/models"
os.makedirs(model_save_dir, exist_ok=True)

# Candidate classifiers, keyed by display name.
# NOTE(review): tree_method='gpu_hist' presumably requires a CUDA-enabled XGBoost build and
# a GPU at run time — confirm for the target environment.
models = {
    "Decision Tree": DecisionTreeClassifier(
        random_state=42,
        ccp_alpha=0.01,
        max_depth=6,
        min_samples_split=20,
    ),
    "Random Forest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
        max_depth=6,
        n_jobs=-1,
        min_samples_split=10,
        max_features='sqrt',
    ),
    "Logistic Regression": LogisticRegression(
        random_state=42,
        max_iter=1000,
        C=0.5,
        n_jobs=-1,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        tree_method='gpu_hist',
        n_estimators=50,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=10,
    ),
}

# Filled by the training cell: metrics per model name, and the fitted estimators
results = {}
trained_models = {}

In [None]:
# Train, evaluate and persist the best model — or, when `skip_training` is set, reuse the
# single pickled model found in `model_save_dir`.
#
# Flow:
#   1. skip_training and exactly one .pkl present -> load it into `best_model`, done.
#   2. otherwise -> cross-validate and fit every candidate, evaluate on the test split,
#      then report, pick and save the best model ONCE after all candidates are evaluated.
from sklearn.model_selection import cross_validate  # one CV pass yields all four metrics

best_model = None

if skip_training:
    model_files = sorted(f for f in os.listdir(model_save_dir) if f.endswith(".pkl"))
    if len(model_files) == 1:
        model_path = os.path.join(model_save_dir, model_files[0])
        print(f"Loading the only available model: {model_files[0]}")
        # NOTE(security): unpickling runs arbitrary code — only load files written by this notebook.
        with open(model_path, "rb") as f:
            best_model = pickle.load(f)
        print(f"Loaded model: {model_files[0]}")
    else:
        # Previously this case hit an undefined `model_path`; fall back to training instead.
        print(f"Expected exactly one saved model in {model_save_dir}, "
              f"found {len(model_files)}; training instead.")

if best_model is None:
    cv_scoring = ["accuracy", "f1", "roc_auc", "neg_mean_absolute_error"]

    # Iterate over each model with cross-validation
    for model_name, model in models.items():
        # Cross-validate the model (same folds for every metric)
        cv_scores = cross_validate(model, X_train, y_train, cv=kf, scoring=cv_scoring)
        cv_accuracy = cv_scores["test_accuracy"].mean()
        cv_f1 = cv_scores["test_f1"].mean()
        cv_auc = cv_scores["test_roc_auc"].mean()
        cv_mae = -cv_scores["test_neg_mean_absolute_error"].mean()

        # Train the model on the full training data
        model.fit(X_train, y_train)
        trained_models[model_name] = model  # Store the trained model

        # Make predictions on the test data
        y_test_pred = model.predict(X_test)

        # For AUC-ROC, we need the probability scores for the positive class
        y_test_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

        # Evaluate the model on the test set
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred)
        test_auc = roc_auc_score(y_test, y_test_prob) if y_test_prob is not None else None

        # A perfect score is treated as a sign of overfitting/leakage: such models are not ranked
        if test_accuracy == 1.0 or test_auc == 1.0 or test_f1 == 1.0 or test_mae == 0.0:
            print(f"{model_name} is overfitting with a metric of 1.0, skipping this model.")
            continue

        # Store the evaluation results
        results[model_name] = {
            "CV Accuracy": cv_accuracy,
            "CV AUC": cv_auc,
            "CV MAE": cv_mae,
            "CV F1": cv_f1,
            "Test Accuracy": test_accuracy,
            "Test AUC": test_auc,
            "Test MAE": test_mae,
            "Test F1": test_f1
        }

    # Display the results (once, after every candidate has been evaluated)
    print("\nModel Evaluation Results:")
    for model_name, metrics in results.items():
        print(f"\n{model_name} Evaluation:")
        for metric, value in metrics.items():
            # Test AUC is None for estimators without predict_proba
            print(f"{metric}: {value:.4f}" if value is not None else f"{metric}: n/a")

    # Filter out models with any 1.0 metric, if any are left
    filtered_results = {model: metrics for model, metrics in results.items()
                        if all(metric != 1.0 for metric in metrics.values())}

    # If no models remain after filtering, display a message
    if not filtered_results:
        print("\nNo models remain after filtering out overfitting models with metrics of 1.0.")
    else:
        def _selection_score(name):
            """Ranking score: accuracy + AUC + F1 - MAE/100 on the test split (missing AUC counts as 0)."""
            m = filtered_results[name]
            return (
                m["Test Accuracy"]
                + (m["Test AUC"] if m["Test AUC"] is not None else 0.0)
                + m["Test F1"]
                - m["Test MAE"] / 100.0
            )

        # NOTE(review): ranking on test metrics makes the reported test scores of the winner
        # optimistic; ranking on the CV metrics would avoid that — left unchanged here.
        best_model_name = max(filtered_results, key=_selection_score)
        best_model = trained_models[best_model_name]  # Retrieve the trained best model

        print(f"\nBest Model: {best_model_name}")
        print(f"Best Model Metrics: {filtered_results[best_model_name]}")

        # Save the best model
        model_path = os.path.join(model_save_dir, f"{best_model_name.replace(' ', '_')}.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(best_model, f)
        print(f"Best model saved: {best_model_name}")

In [None]:
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import pandas as pd
import warnings

# Suppress specific warnings from LIME (optional)
warnings.filterwarnings("ignore", category=FutureWarning)

# Initialize the LimeTabularExplainer with training data
# (X_train's values are the reference data; its column names are the feature names)
lime_explainer = LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=X_train.columns.tolist(),
    class_names=['Not Approved', 'Approved'],
    mode='classification'
)

#Define a prediction function for LIME
# Relies on `best_model` from the training cell; it must expose predict_proba.
# NOTE: a later cell defines `predict_fn` again and thereby replaces this one.
def predict_fn(x):
    return best_model.predict_proba(x)
i = 0  # Index of the instance to explain
instance = X_test.iloc[i]  # positional lookup: the i-th row of the test split

# Generate explanation for the instance
exp = lime_explainer.explain_instance(
    data_row=instance.values,
    predict_fn=predict_fn,
)

# Display the explanation for the instance
exp.show_in_notebook(show_table=True)

In [None]:
import re
from lime.lime_tabular import LimeTabularExplainer
# Suppress specific warnings from LIME (optional)
warnings.filterwarnings("ignore", category=FutureWarning)

# Function to extract the feature name from the LIME explanation
# and report it together with the instance's value for that feature
def extract_feature_value(feature_str, instance):
    """Return "<column name> <value>" for the feature a LIME condition string refers to.

    Parameters
    ----------
    feature_str : str
        A LIME condition such as "Phone <= 0.00" or "0.50 < Annual Income <= 1.20".
    instance : pandas.Series
        The explained row; its index holds the dataset's column names.

    Returns
    -------
    str
        The matched column name and the instance's value for it, separated by a space.

    Raises
    ------
    KeyError
        If no column of `instance` can be associated with `feature_str`.
    """
    lowered = feature_str.lower()

    # Preferred path: look for a full column name inside the condition string.
    # Longest names are tried first so "Work Phone" is not mistaken for "Phone", and the
    # name is matched verbatim so parentheses (e.g. "Age (Days)") and two-sided
    # intervals ("a < name <= b") are handled.
    for col in sorted(instance.index, key=lambda c: len(str(c)), reverse=True):
        pattern = r'(?<![a-z0-9])' + re.escape(str(col).lower()) + r'(?![a-z0-9])'
        if re.search(pattern, lowered):
            return f"{col} {instance[col]}"

    # Fallback (previous heuristic): take the text before the first comparison operator,
    # strip punctuation and accept a column that contains it as a substring.
    feature_name = re.split(r'<=|>=|!=|<|>|=', feature_str, maxsplit=1)[0].strip()
    cleaned_feature_name = re.sub(r'[^A-Za-z0-9 ]+', '', feature_name).strip()

    actual_feature_name = next(
        (col for col in instance.index
         # an empty name would "match" every column, so it is rejected explicitly
         if cleaned_feature_name and cleaned_feature_name.lower() in str(col).lower()),
        None
    )

    if actual_feature_name is None:
        raise KeyError(f"Feature '{cleaned_feature_name}' not found in dataset columns.")

    # Retrieve the actual value of the feature
    feature_value = instance[actual_feature_name]
    return f"{actual_feature_name} {feature_value}"

In [None]:
# Initialize the LimeTabularExplainer with the feature dataset (exclude 'Approved' column)
# This rebinds `lime_explainer`: the explainer built on X_train in the earlier cell is
# replaced by one whose reference data is every row of X.
lime_explainer = LimeTabularExplainer(
    training_data=X.values,  # Use X instead of df
    feature_names=X.columns.tolist(),
    class_names=['Not Approved', 'Approved'],  # Adjust according to your dataset
    mode='classification'
    # kernel_width=3  # Smaller kernel width for faster computations
)

# Initialize an empty list to store explanations
explanations = []
# Define a prediction function compatible with LIME (outside the loop)
# NOTE: this redefines `predict_fn` from the earlier LIME cell; the only difference is
# that the input is passed through np.array() before predict_proba is called.
def predict_fn(x):
    return best_model.predict_proba(np.array(x))

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Extract only the feature name from a LIME condition string
def extract_feature_name(feature):
    """
    Extracts only the feature name from the LIME explanation output.

    Handles one-sided conditions ("Phone <= 0.00", "Email > 1.5", "Occupation=3") as
    well as two-sided intervals ("0.50 < Annual Income <= 1.20"), for which the name
    sits between two numeric bounds.

    Parameters
    ----------
    feature : str
        Condition string as found in the explanation list.

    Returns
    -------
    str
        The feature name with surrounding whitespace removed. If the string contains
        no comparison operator, the text before the first '-' (or the whole string) is
        returned, as before.
    """
    def _is_number(text):
        """True when `text` parses as a float (i.e. it is a bound, not a name)."""
        try:
            float(text)
        except ValueError:
            return False
        return True

    parts = [part.strip() for part in re.split(r'<=|>=|!=|==|<|>|=', feature)]

    if len(parts) == 1:
        # No comparison operator at all: keep the legacy behaviour of cutting at '-'
        return feature.split('-')[0].strip()

    # The feature name is the first operand that is not a numeric bound
    for part in parts:
        if part and not _is_number(part):
            return part

    # Only numbers around the operators: fall back to the left-most operand
    return parts[0]

# Per-row explanation builder; feature names come from `extract_feature_name`
def generate_explanation(idx_instance):
    """Explain the model's decision for one row of X.

    Parameters
    ----------
    idx_instance : tuple
        (index label, row Series) pair, as produced by `X.iterrows()`.

    Returns
    -------
    dict
        Keys 'Index', 'Prediction', 'Actual' and 'Explanation'. 'Prediction' and 'Actual'
        are 'Approved'/'Denied'; 'Explanation' is a sentence naming the features whose
        LIME weight points in the direction of the prediction.
    """
    idx, instance = idx_instance

    # Model decision for this row and the ground-truth label
    prediction = best_model.predict(instance.values.reshape(1, -1))[0]
    actual_label = y.loc[idx]

    # LIME explanation over every feature
    exp = lime_explainer.explain_instance(
        data_row=instance.values,
        predict_fn=predict_fn,
        num_features=len(X.columns)
    )

    decision = 'Approved' if prediction == 1 else 'Denied'

    # Keep features with positive weight for an approval, negative weight for a denial,
    # each name listed once in order of appearance
    features_contributing = []
    for feature, weight in exp.as_list():
        supports_prediction = (prediction == 1 and weight > 0) or (prediction == 0 and weight < 0)
        if not supports_prediction:
            continue
        feature_name = extract_feature_name(feature)
        if feature_name and feature_name not in features_contributing:
            features_contributing.append(feature_name)

    if not features_contributing:
        explanation_text = f"This application was {decision.lower()}."
    else:
        explanation_text = f"This application was {decision.lower()} due to " + ", ".join(features_contributing) + "."

    actual = 'Approved' if actual_label == 1 else 'Denied'
    return {
        'Index': idx,
        'Prediction': decision,
        'Actual': actual,
        'Explanation': explanation_text
    }

# Run generate_explanation for every row of X on a pool of 4 worker threads
# (tune max_workers to the machine); results are collected in completion order
explanations = []
with ThreadPoolExecutor(max_workers=4) as executor:
    pending = [executor.submit(generate_explanation, row_item) for row_item in X.iterrows()]
    progress = tqdm(as_completed(pending), total=len(pending), desc="Generating Explanations")
    for finished in progress:
        explanations.append(finished.result())

In [None]:
# Sanity check: one explanation is expected per row of X
print(
    f"Number of explanations generated: {len(explanations)}",
    f"Total number of instances: {X.shape[0]}",
    sep="\n",
)

In [None]:
# Convert explanations to DataFrame (one row per explanation dict, in completion order)
explanations_df = pd.DataFrame(explanations)
# Set 'Index' as the DataFrame index to align with df
explanations_df.set_index('Index', inplace=True)

# Merge the datasets
# NOTE(review): this rebinds `original_df` to the label-encoded `df` plus the explanation
# columns, discarding the pre-encoding snapshot taken earlier. If the human-readable
# values were meant to be kept, join onto the snapshot instead — confirm the intent.
original_df = df.join(explanations_df, how='left')
# Display the DataFrame with explanations
# NOTE: not the last expression of the cell, so this head() is not rendered.
original_df.head()

# Output the first explanation (first by position, i.e. the first one that finished)
first_explanation = explanations_df.iloc[0]

# Print the details
print("Index:", first_explanation.name)  # Access the index using .name
print("Prediction:", first_explanation['Prediction'])
print("Actual:", first_explanation['Actual'])
print("Explanation:", first_explanation['Explanation'])

In [None]:
# Persist the explanations. The frame is indexed by the dataset row label but was filled
# in thread-completion order, and the index is not written to the file; sorting by index
# first makes row i of the CSV correspond to row i of the dataset (columns unchanged).
explanations_df.sort_index().to_csv('/opt/notebooks/Chatbot-Credit-Card/backend/dataset/explanations_df.csv', index=False)
explanations_df

In [None]:
# Add the 'Explanation' column (Reason) from explanations_df to the original df
# (the assignment aligns on the index; explanations_df is indexed by the row label 'Index')
# NOTE(review): `original_df` already received an 'Explanation' column from the join in the
# previous cell, so 'Reason' duplicates it — confirm both columns are wanted in the CSV.
original_df['Reason'] = explanations_df['Explanation']

# Save the updated dataframe with the new column as a CSV file (row index is not written)
original_df.to_csv('/opt/notebooks/Chatbot-Credit-Card/backend/dataset/target-augmented.csv', index=False)

In [None]:
# Final consistency check: explanation count versus row count of df
print(
    f"Number of explanations: {len(explanations)}",
    f"Number of rows in df: {df.shape[0]}",
    sep="\n",
)