# Decision System

Dataset loading - Extraction

In [4]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
  
# Fetch dataset from UCI repo
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# Split into features and target
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets 
  
# Metadata 
print(statlog_german_credit_data.metadata) 
  
# Variable information 
print(statlog_german_credit_data.variables) 

# Rename columns for better understanding
X = X.rename(columns={
    # Categorical features
    'Attribute1': 'checking_account_status',
    'Attribute3': 'credit_history',
    'Attribute4': 'purpose',
    'Attribute6': 'savings_account',
    'Attribute7': 'employment_since',
    'Attribute9': 'personal_status_sex',
    'Attribute10': 'other_debtors',
    'Attribute12': 'property',
    'Attribute14': 'other_installment_plans',
    'Attribute15': 'housing',
    'Attribute17': 'job',
    'Attribute19': 'telephone',
    'Attribute20': 'foreign_worker',
    
    # Numerical features
    'Attribute2': 'duration_months',
    'Attribute5': 'credit_amount',
    'Attribute8': 'installment_rate',
    'Attribute11': 'residence_since',
    'Attribute13': 'age',
    'Attribute16': 'num_existing_credits',
    'Attribute18': 'num_dependents'
})

X.head()


{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

Unnamed: 0,checking_account_status,duration_months,credit_history,purpose,credit_amount,savings_account,employment_since,installment_rate,personal_status_sex,other_debtors,residence_since,property,age,other_installment_plans,housing,num_existing_credits,job,num_dependents,telephone,foreign_worker
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201


Data Preprocessing - Transformation

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Feature identification
categorical_features = X.select_dtypes(include = ['object']).columns.tolist()
print(categorical_features)
numerical_features = X.select_dtypes(include = ['int64']).columns.tolist()
print(numerical_features)

# Data Preprocessing
preprocessor_scaled = ColumnTransformer( # Scaling needed for logistic regression
	transformers = [
		('num', StandardScaler(), numerical_features), # Scale num features
		('cat', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'), categorical_features), # OneHotEncode cat features. Handle unknown to make sure pipeline doesnt work on any unseen cat with new data
	]
)

preprocessor_passthrough = ColumnTransformer( # No need for scaling in RandomForest
	transformers = [
		('num', 'passthrough', numerical_features), # Passthrough direct values
		('cat', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'), categorical_features) # OneHotEncoding still needed
	]
)

# Data Ingestion onto Model
pipelines = {
	'log_reg': Pipeline ([('prep', preprocessor_scaled), ('model', LogisticRegression(max_iter = 1000))]),
	'rf': Pipeline([('prep', preprocessor_passthrough), ('model', RandomForestClassifier(random_state = 42))])
}

['checking_account_status', 'credit_history', 'purpose', 'savings_account', 'employment_since', 'personal_status_sex', 'other_debtors', 'property', 'other_installment_plans', 'housing', 'job', 'telephone', 'foreign_worker']
['duration_months', 'credit_amount', 'installment_rate', 'residence_since', 'age', 'num_existing_credits', 'num_dependents']


Model Evaluation

In [6]:
from sklearn.metrics import roc_auc_score, classification_report

# Reshaping y
y = y.values.ravel() # y is a 2D array, scikit-learn expects a 1D array.
y = y - 1  # Because we convert values from (2 -> 1, 1 -> 0)

# Check class distribution
print('Class distribution (imbalanced dataset):')
print(np.unique(y, return_counts = True))

# Split data with transformed y
X_train, X_test, y_train, y_test = train_test_split(
	X, y,
	test_size = 0.2, # 0.8 train, 0.2 test
	random_state = 42, # Random seed of 42
	stratify = y # 70/30 ratio of imbalanced data is maintained in both train and test sets
)

# Model evaluation with transformed target
for name, pipe in pipelines.items():
	pipe.fit(X_train, y_train)
	proba = pipe.predict_proba(X_test)[:, 1] # Get probability of default. [:, 1] because we are only interested in default, instead of no default [:, 0]
	pred = (proba >= 0.5).astype(int) # Converts array of predicted probabilities from Boolean into 0/1. needed for the classification report
	print(name, f'AUC: {roc_auc_score(y_test, proba):.3f}') # AUC measures how well model separates defaulters vs no-defaulters across all possible thresholds
	print(classification_report(y_test, pred, zero_division = 0)) # We test on y_test as this is the unseen target data


Class distribution (imbalanced dataset):
(array([0, 1]), array([700, 300]))


NameError: name 'train_test_split' is not defined

Applied business rules with risk score

Unnamed: 0,checking_account_status,duration_months,credit_history,purpose,credit_amount,savings_account,employment_since,installment_rate,personal_status_sex,other_debtors,residence_since,property,age,other_installment_plans,housing,num_existing_credits,job,num_dependents,telephone,foreign_worker
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201


Combination of weighted risk scores with model output

Evaluation and Monitoring of Credit Decision System