In [118]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import randint

In [119]:
# Load and preprocess credit card approval data
data_path = "datasets/cc_approvals.data"
column_names = ['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel',
                'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore',
                'DriversLicense', 'Citizen', 'ZipCode', 'Income', 'ApprovalStatus']

credit_data = pd.read_csv(data_path, header=None, names=column_names)

# Replace '?' with NaN for missing values
credit_data.replace('?', np.nan, inplace=True)

# Drop rows containing missing values
credit_data.dropna(inplace=True)

# Convert categorical variables to numerical using label encoding
categorical_columns = ['Gender', 'Married', 'BankCustomer', 'EducationLevel',
                        'Ethnicity', 'PriorDefault', 'Employed', 'DriversLicense', 'Citizen']

for column in categorical_columns:
    credit_data[column] = pd.Categorical(credit_data[column]).codes

# Encode the target variable
label_encoder = LabelEncoder()
credit_data['ApprovalStatus'] = label_encoder.fit_transform(credit_data['ApprovalStatus'])

# Displaying basic information about the data
credit_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 653 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          653 non-null    int8   
 1   Age             653 non-null    object 
 2   Debt            653 non-null    float64
 3   Married         653 non-null    int8   
 4   BankCustomer    653 non-null    int8   
 5   EducationLevel  653 non-null    int8   
 6   Ethnicity       653 non-null    int8   
 7   YearsEmployed   653 non-null    float64
 8   PriorDefault    653 non-null    int8   
 9   Employed        653 non-null    int8   
 10  CreditScore     653 non-null    int64  
 11  DriversLicense  653 non-null    int8   
 12  Citizen         653 non-null    int8   
 13  ZipCode         653 non-null    object 
 14  Income          653 non-null    int64  
 15  ApprovalStatus  653 non-null    int64  
dtypes: float64(2), int64(3), int8(9), object(2)
memory usage: 46.6+ KB


First few rows of data:

In [120]:
# Displaying the first few rows of the dataset
credit_data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,1,30.83,0.0,1,0,12,7,1.25,1,1,1,0,0,202,0,0
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,43,560,0
2,0,24.5,0.5,1,0,10,3,1.5,1,0,0,0,0,280,824,0
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,100,3,0
4,1,20.17,5.625,1,0,12,7,1.71,1,0,0,0,2,120,0,0


In [174]:
# Define feature combinations
all_feature_combinations = [
  ['Age', 'CreditScore', 'Employed', 'Married', 'Ethnicity'],
  ['Age','CreditScore', 'PriorDefault', 'Debt'],
  ['Age', 'CreditScore', 'Married', 'BankCustomer', 'Income'],
  ['Age', 'CreditScore', 'Married', 'BankCustomer', 'Income', 'PriorDefault', 'Debt', 'Income'],
  ['Income', 'Employed', 'CreditScore', 'PriorDefault', 'Ethnicity'],
  ['Age', 'BankCustomer', 'Debt', 'Married', 'Ethnicity', 'Income'],
  ['CreditScore', 'Employed', 'Married', 'PriorDefault', 'Income'],
  ['Age', 'CreditScore', 'Married', 'Ethnicity', 'PriorDefault', 'Debt'],
  ['Age', 'CreditScore', 'PriorDefault', 'Employed', 'Debt', 'Income'],
  ['CreditScore', 'PriorDefault', 'Employed', 'Debt', 'Income'],
  ['Age', 'Income', 'Debt', 'Ethnicity', 'PriorDefault', 'BankCustomer'],
  ['Age', 'CreditScore', 'Employed', 'Ethnicity', 'Married', 'PriorDefault'],
  ['Age', 'Income', 'Debt', 'Employed', 'Ethnicity', 'Married'],
  ['Age', 'CreditScore', 'Employed', 'Married', 'PriorDefault', 'Debt', 'Income'],
  ['Age', 'CreditScore', 'PriorDefault', 'Employed', 'Debt', 'Income', 'Married'],
  ['Age', 'Income', 'Debt', 'Ethnicity', 'PriorDefault', 'BankCustomer', 'CreditScore'],
  ['Age', 'CreditScore', 'Employed', 'Ethnicity', 'Married', 'PriorDefault', 'Debt'],
  ['Age', 'Income', 'Debt', 'Employed', 'Ethnicity', 'Married', 'CreditScore']]

# Initialize an empty dictionary to store results
results_dict = {}

# Define the parameter distribution to search
param_dist = {
    'max_depth': [None, 10,15,20,25, 30,35],
    'min_samples_split': [2, 5, 8, 10, 15, 18],
    'min_samples_leaf': [2, 5, 8, 10, 15, 18]

}


Perform tests to see the best combination of features

In [175]:
# Loop through all feature combinations
for features in all_feature_combinations:
     # Select features
    credit_features = credit_data[features]
    credit_labels = credit_data['ApprovalStatus']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(credit_features, credit_labels, test_size=0.313, random_state=55)

    # Initialize the Decision Tree model
    dt_model = DecisionTreeClassifier(random_state=55)

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(dt_model, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=55)
    random_search.fit(X_train, y_train)

    # Get the best parameters and the corresponding model
    best_params = random_search.best_params_
    best_model = random_search.best_estimator_

    # Make predictions on the test set using the best model
    y_pred = best_model.predict(X_test)

    # Evaluate the best model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Store results in the dictionary
    results_dict[str(features)] = {
        'Feature Combination': features,
        'Best Hyperparameters': best_params,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix
    }

Final Results

In [176]:

# Initialize variables to keep track of the best accuracy and its corresponding parameters
best_accuracy = 0.0
best_params = {}

# Print results for each variation of param_dist and all_feature_combinations
for features, results in results_dict.items():
    print(f"Features: {results['Feature Combination']}")
    print(f"Best Hyperparameters: {results['Best Hyperparameters']}")
    print(f"Accuracy: {results['Accuracy']:.4f}")
    print("Confusion Matrix:")
    print(results["Confusion Matrix"])
    print("="*50)

    # Update best accuracy if the current accuracy is higher
    if results['Accuracy'] > best_accuracy:
        best_accuracy = results['Accuracy']
        best_params = {'Feature Combination': results['Feature Combination'], 'Hyperparameters': results['Best Hyperparameters']}


Features: ['Age', 'CreditScore', 'Employed', 'Married', 'Ethnicity']
Best Hyperparameters: {'min_samples_split': 15, 'min_samples_leaf': 15, 'max_depth': 25}
Accuracy: 0.7171
Confusion Matrix:
[[59 30]
 [28 88]]
Features: ['Age', 'CreditScore', 'PriorDefault', 'Debt']
Best Hyperparameters: {'min_samples_split': 15, 'min_samples_leaf': 15, 'max_depth': 25}
Accuracy: 0.9073
Confusion Matrix:
[[ 83   6]
 [ 13 103]]
Features: ['Age', 'CreditScore', 'Married', 'BankCustomer', 'Income']
Best Hyperparameters: {'min_samples_split': 18, 'min_samples_leaf': 10, 'max_depth': 25}
Accuracy: 0.7220
Confusion Matrix:
[[57 32]
 [25 91]]
Features: ['Age', 'CreditScore', 'Married', 'BankCustomer', 'Income', 'PriorDefault', 'Debt', 'Income']
Best Hyperparameters: {'min_samples_split': 15, 'min_samples_leaf': 15, 'max_depth': 25}
Accuracy: 0.8976
Confusion Matrix:
[[ 82   7]
 [ 14 102]]
Features: ['Income', 'Employed', 'CreditScore', 'PriorDefault', 'Ethnicity']
Best Hyperparameters: {'min_samples_split':

In [177]:
# Print the overall best accuracy and its corresponding parameters
print(f"\nOverall Best Accuracy: {best_accuracy:.4f}")
print(f"Corresponding Feature Combination: {best_params['Feature Combination']}")
print(f"Corresponding Hyperparameters: {best_params['Hyperparameters']}")


Overall Best Accuracy: 0.9073
Corresponding Feature Combination: ['Age', 'CreditScore', 'PriorDefault', 'Debt']
Corresponding Hyperparameters: {'min_samples_split': 15, 'min_samples_leaf': 15, 'max_depth': 25}


# More Tests

In [191]:
test_feature_combinations = [
['Age','CreditScore', 'PriorDefault', 'Debt' ,'BankCustomer' ]]

# Initialize an empty dictionary to store results
test_dict = {}


# Loop through all feature combinations
for features in test_feature_combinations:
    # Select features
    credit_features = credit_data[features]
    credit_labels = credit_data['ApprovalStatus']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(credit_features, credit_labels, test_size=0.313, random_state=55)

    # Initialize the Decision Tree model
    dt_model = DecisionTreeClassifier(random_state=55)

    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(dt_model, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', random_state=55)
    random_search.fit(X_train, y_train)

    # Get the best parameters and the corresponding model
    best_params = random_search.best_params_
    best_model = random_search.best_estimator_

    # Make predictions on the test set using the best model
    y_pred = best_model.predict(X_test)

    # Evaluate the best model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Store results in the dictionary
    test_dict[str(features)] = {
        'Feature Combination': features,
        'Best Hyperparameters': best_params,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix
    }

In [196]:

# Initialize variables to keep track of the best accuracy and its corresponding parameters
best_accuracy = 0.0
best_params = {}

# Print results for each variation of param_dist and all_feature_combinations
for features, results in test_dict.items():
    print(f"Features: {results['Feature Combination']}")
    print(f"Best Hyperparameters: {results['Best Hyperparameters']}")
    print(f"Accuracy: {results['Accuracy']:.4f}")
    print("Confusion Matrix:")
    print(results["Confusion Matrix"])
    print("="*90)

    # Update best accuracy if the current accuracy is higher
    if results['Accuracy'] > best_accuracy:
        best_accuracy = results['Accuracy']
        best_params = {'Feature Combination': results['Feature Combination'], 'Hyperparameters': results['Best Hyperparameters']}


Features: ['Age', 'CreditScore', 'PriorDefault', 'Debt', 'BankCustomer']
Best Hyperparameters: {'min_samples_split': 15, 'min_samples_leaf': 15, 'max_depth': 25}
Accuracy: 0.9073
Confusion Matrix:
[[ 83   6]
 [ 13 103]]


In [184]:
# Print the overall best accuracy and its corresponding parameters
print(f"\nOverall Best Accuracy: {best_accuracy:.4f}")
print(f"Corresponding Feature Combination: {best_params['Feature Combination']}")
print(f"Corresponding Hyperparameters: {best_params['Hyperparameters']}")


Overall Best Accuracy: 0.9073
Corresponding Feature Combination: ['Age', 'CreditScore', 'PriorDefault', 'Debt', 'Employed']
Corresponding Hyperparameters: {'min_samples_split': 15, 'min_samples_leaf': 15, 'max_depth': 25}
