In [None]:
# pip install ucimlrepo

In [34]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Create 2000 records
n = 10000

# Sample features
gender = np.random.choice(['Male', 'Female'], n)
age = np.random.randint(18, 70, n)
income = np.random.normal(loc=50000, scale=15000, size=n).astype(int)
employment_status = np.random.choice(['Employed', 'Self-employed', 'Unemployed', 'Student'], n)
credit_score = np.random.randint(300, 850, n)
existing_debt = np.round(np.random.uniform(0.0, 1.0, n), 2)

# Approval based on simple rule for illustration
approved = ((income > 40000) & (credit_score > 650) & (existing_debt < 0.4)).astype(int)

# Create DataFrame
df = pd.DataFrame({
    'Gender': gender,
    'Age': age,
    'Income': income,
    'Employment_Status': employment_status,
    'Credit_Score': credit_score,
    'Existing_Debt': existing_debt,
    'Approved': approved
})




In [35]:
# Modify the label to be multi-class: 'Approved', 'Not Approved', 'In Progress'
# Logic:
# - Approved: income > 40000, credit_score > 650, existing_debt < 0.4
# - Not Approved: income < 30000 or credit_score < 500
# - In Progress: everyone else

def determine_status(income, credit_score, existing_debt):
    if income > 40000 and credit_score > 650 and existing_debt < 0.4:
        return 'Approved'
    elif income < 30000 or credit_score < 500:
        return 'Not Approved'
    else:
        return 'In Progress'

# Apply function
df['Status'] = df.apply(lambda row: determine_status(row['Income'], row['Credit_Score'], row['Existing_Debt']), axis=1)

# Drop the old binary Approved column
df.drop(columns=['Approved'], inplace=True)

df['Status'].value_counts(), df.head()


(In Progress     4743
 Not Approved    4148
 Approved        1109
 Name: Status, dtype: int64,
    Gender  Age  Income Employment_Status  Credit_Score  Existing_Debt  \
 0    Male   68   60189           Student           674           0.58   
 1  Female   57   39347        Unemployed           681           0.59   
 2    Male   24   18279     Self-employed           788           0.78   
 3    Male   49   43641        Unemployed           608           0.22   
 4    Male   65   50948     Self-employed           434           0.53   
 
          Status  
 0   In Progress  
 1   In Progress  
 2  Not Approved  
 3   In Progress  
 4  Not Approved  )

In [36]:
df.head()

Unnamed: 0,Gender,Age,Income,Employment_Status,Credit_Score,Existing_Debt,Status
0,Male,68,60189,Student,674,0.58,In Progress
1,Female,57,39347,Unemployed,681,0.59,In Progress
2,Male,24,18279,Self-employed,788,0.78,Not Approved
3,Male,49,43641,Unemployed,608,0.22,In Progress
4,Male,65,50948,Self-employed,434,0.53,Not Approved


In [37]:
# Data Cleaning

In [38]:
df.isnull().sum()


Gender               0
Age                  0
Income               0
Employment_Status    0
Credit_Score         0
Existing_Debt        0
Status               0
dtype: int64

In [39]:
# Feature Engineering

In [40]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])  # Male: 1, Female: 0
df = pd.get_dummies(df, columns=['Employment_Status'], drop_first=True)  # One-Hot Encoding


In [41]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['Income', 'Credit_Score', 'Existing_Debt']] = scaler.fit_transform(df[['Income', 'Credit_Score', 'Existing_Debt']])


In [42]:
# Splitting the Dataset

In [54]:
X = df.drop(columns=['Status'])
y = df['Status']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [55]:
# Building the Decision Tree Model

In [56]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='gini',random_state=42)
model.fit(X_train, y_train)


DecisionTreeClassifier(random_state=42)

In [57]:
# Model Evaluation:

In [58]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


0.9996666666666667
              precision    recall  f1-score   support

    Approved       1.00      1.00      1.00       333
 In Progress       1.00      1.00      1.00      1415
Not Approved       1.00      1.00      1.00      1252

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

[[ 332    1    0]
 [   0 1415    0]
 [   0    0 1252]]


In [62]:
from sklearn.metrics import roc_auc_score

# Get the predicted probabilities for each class
y_prob = model.predict_proba(X_test)

# Calculate AUC using One-vs-Rest (OvR) strategy
auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')

# Calculate Gini Coefficient
gini_score = 2 * auc - 1

# Display AUC and Gini Coefficient
print(f"AUC: {auc}")
print(f"Gini Coefficient: {gini_score}")


AUC: 0.9996845425867508
Gini Coefficient: 0.9993690851735015


In [None]:
# Fine-tuning the Model (Optional)

In [30]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)


{'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [31]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print(scores)


[1.     0.9995 0.9995 1.     1.    ]
