In [55]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [56]:
cr_data = pd.read_csv("credit_risk_dataset.csv")

In [57]:
risk_level_mapping = {'Low': 0, 'Medium': 1, 'High': 2}

In [58]:
cr_data['risk_level'] = cr_data['risk_level'].map(risk_level_mapping)

In [59]:
categorical_columns = cr_data.select_dtypes(include=['object']).columns.tolist()

In [60]:
encoder = OneHotEncoder(sparse_output=False)

In [61]:
# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(cr_data[categorical_columns]).astype(int)

In [62]:
joblib.dump(encoder, "encoder.pkl")

['encoder.pkl']

In [63]:
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

In [64]:
# We delete the one hot coded and binary columns so that we can remain with data to scale
data_to_scale = cr_data.drop(['marital_status', 'education_level', 'relationship_to_student', 'employment_status', 'existing_loans', 
    'financial_counseling', 'risk_level', 'risk_score'], axis=1)

In [65]:
scaler = StandardScaler()

In [66]:
# We scale the data 
scaled_data = scaler.fit_transform(data_to_scale)

In [67]:
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [68]:
scaled_df = pd.DataFrame(scaled_data, columns=['age', 'no_of_dependants', 'income_in_kes', 'additional_income', 'employment_length', 
    'guarantor_credit_score', 'outstanding_loan_amount', 'monthly_repayment_amount', 'monthly_expenses', 'missed_payments_last_year'])

In [69]:
# Convert boolean columns (True/False) to numerical (1/0)
boolean_columns = ['existing_loans', 'financial_counseling']
boolean_columns = cr_data[boolean_columns].astype(int)

In [70]:
cr_data_combined = pd.concat([scaled_df, one_hot_df, boolean_columns],axis=1)
cr_data_combined['risk_level'] = cr_data['risk_level']

In [71]:
target = cr_data_combined['risk_level']
features = cr_data_combined.drop('risk_level', axis=1)

In [72]:
smote = SMOTE()

In [73]:
balanced_features, balanced_target = smote.fit_resample(features,target)

In [74]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(balanced_features, balanced_target, test_size=0.2, random_state=42)

## Logistic Regression Classifier

In [75]:
# Initialize Logistic Regression model
logit_model = LogisticRegression(max_iter=1000, random_state=42)

In [76]:
logit_model.fit(X_train, y_train)

In [77]:
joblib.dump(logit_model, "lr_risk_model.pkl")

['lr_risk_model.pkl']

In [78]:
# Evaluate
lr_pred = logit_model.predict(X_test)

In [79]:
# Calculate performance metrics
def evaluate_model(y_true, y_pred, model_name):
    print(f"=== {model_name} Model Evaluation ===")
    print("Logistic Regression - Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print("\n")

In [80]:
# Evaluate the model
evaluate_model(y_test, lr_pred, "Credit Risk Prediction")

=== Credit Risk Prediction Model Evaluation ===
Logistic Regression - Accuracy: 0.994908350305499
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1271
           1       0.99      0.99      0.99      1329
           2       1.00      1.00      1.00      1328

    accuracy                           0.99      3928
   macro avg       0.99      0.99      0.99      3928
weighted avg       0.99      0.99      0.99      3928





## Random Forest Classifier

In [81]:
# Train
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

In [82]:
rf_model.fit(X_train, y_train)

In [83]:
joblib.dump(rf_model, "rf_risk_model.pkl")

['rf_risk_model.pkl']

In [84]:
# Evaluate
rf_pred = rf_model.predict(X_test)

In [85]:
# Evaluate the model
evaluate_model(y_test, rf_pred, "Credit Risk Prediction")

=== Credit Risk Prediction Model Evaluation ===
Logistic Regression - Accuracy: 0.8930753564154786
              precision    recall  f1-score   support

           0       0.82      0.94      0.88      1271
           1       0.93      0.74      0.82      1329
           2       0.94      1.00      0.97      1328

    accuracy                           0.89      3928
   macro avg       0.90      0.89      0.89      3928
weighted avg       0.90      0.89      0.89      3928





In [86]:
features_imp_rf = pd.DataFrame({'features': balanced_features.columns, 'rf_imp': rf_model.feature_importances_})

## K-Nearest Neighbors

In [87]:
knn_model = KNeighborsClassifier(n_neighbors=5)

In [88]:
knn_model.fit(X_train, y_train)

In [89]:
knn_pred = knn_model.predict(X_test)

In [90]:
evaluate_model(y_test, knn_pred, "Credit Risk Prediction")

=== Credit Risk Prediction Model Evaluation ===
Logistic Regression - Accuracy: 0.9177698574338086
              precision    recall  f1-score   support

           0       0.86      0.98      0.91      1271
           1       0.97      0.78      0.86      1329
           2       0.94      1.00      0.97      1328

    accuracy                           0.92      3928
   macro avg       0.92      0.92      0.92      3928
weighted avg       0.92      0.92      0.92      3928





## XGBoost Classifier

In [91]:
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)

In [92]:
xgb_model.fit(X_train, y_train)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)

In [93]:
xgb_pred = xgb_model.predict(X_test)

In [94]:
evaluate_model(y_test, xgb_pred, "Credit Risk Prediction")

=== Credit Risk Prediction Model Evaluation ===
Logistic Regression - Accuracy: 0.9717413441955194
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1271
           1       0.97      0.95      0.96      1329
           2       0.99      1.00      1.00      1328

    accuracy                           0.97      3928
   macro avg       0.97      0.97      0.97      3928
weighted avg       0.97      0.97      0.97      3928





## Support Vector Machine

In [95]:
svm_model = SVC(kernel='rbf', C=1, gamma='scale')

In [96]:
svm_model.fit(X_train, y_train)

In [97]:
svm_pred = svm_model.predict(X_test)

In [98]:
evaluate_model(y_test, svm_pred, "Credit Risk Prediction")

=== Credit Risk Prediction Model Evaluation ===
Logistic Regression - Accuracy: 0.9887983706720977
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1271
           1       0.99      0.98      0.98      1329
           2       1.00      1.00      1.00      1328

    accuracy                           0.99      3928
   macro avg       0.99      0.99      0.99      3928
weighted avg       0.99      0.99      0.99      3928





## Decision Tree Classifier

In [99]:
dtc_model = DecisionTreeClassifier(max_depth=5, random_state=42)

In [100]:
dtc_model.fit(X_train, y_train)

In [101]:
dtc_pred = dtc_model.predict(X_test)

In [102]:
evaluate_model(y_test, dtc_pred, "Credit Risk Prediction")

=== Credit Risk Prediction Model Evaluation ===
Logistic Regression - Accuracy: 0.8788187372708758
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      1271
           1       0.89      0.73      0.80      1329
           2       0.91      0.98      0.95      1328

    accuracy                           0.88      3928
   macro avg       0.88      0.88      0.88      3928
weighted avg       0.88      0.88      0.88      3928



