In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from imblearn.over_sampling import SMOTE

In [None]:
cr_data = pd.read_csv("credit_risk_dataset.csv")

In [None]:
risk_level_mapping = {'Low': 0, 'Medium': 1, 'High': 2}

In [None]:
cr_data['risk_level'] = cr_data['risk_level'].map(risk_level_mapping)

In [None]:
categorical_columns = cr_data.select_dtypes(include=['object']).columns.tolist()

In [None]:
encoder = OneHotEncoder(sparse_output=False)

In [None]:
# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(cr_data[categorical_columns]).astype(int)

In [None]:
joblib.dump(encoder, "encoder.pkl")

In [None]:
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

In [None]:
# We delete the one hot coded and binary columns so that we can remain with data to scale
data_to_scale = cr_data.drop(['marital_status', 'education_level', 'relationship_to_student', 'employment_status', 'existing_loans', 
    'financial_counseling', 'risk_level', 'risk_score'], axis=1)

In [None]:
scaler = StandardScaler()

In [None]:
# We scale the data 
scaled_data = scaler.fit_transform(data_to_scale)

In [None]:
joblib.dump(scaler, "scaler.pkl")

In [None]:
scaled_df = pd.DataFrame(scaled_data, columns=['age', 'no_of_dependants', 'income_in_kes', 'additional_income', 'employment_length', 
    'guarantor_credit_score', 'outstanding_loan_amount', 'monthly_repayment_amount', 'monthly_expenses', 'missed_payments_last_year'])

In [None]:
# Convert boolean columns (True/False) to numerical (1/0)
boolean_columns = ['existing_loans', 'financial_counseling']
boolean_columns = cr_data[boolean_columns].astype(int)

In [None]:
cr_data_combined = pd.concat([scaled_df, one_hot_df, boolean_columns],axis=1)
cr_data_combined['risk_level'] = cr_data['risk_level']

In [None]:
target = cr_data_combined['risk_level']
features = cr_data_combined.drop('risk_level', axis=1)

In [None]:
smote = SMOTE()

In [None]:
balanced_features, balanced_target = smote.fit_resample(features,target)

In [None]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(balanced_features, balanced_target, test_size=0.2, random_state=42)

## Logistic Regression Classifier

In [None]:
# Initialize Logistic Regression model
logit_model = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
logit_model.fit(X_train, y_train)

In [None]:
joblib.dump(logit_model, "lr_risk_model.pkl")

In [None]:
# Evaluate
lr_pred = logit_model.predict(X_test)

In [None]:
# Calculate performance metrics
def evaluate_model(y_true, y_pred, model_name):
    """Print the accuracy and the classification report for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label shown in the header and on the accuracy line.

    Returns:
        None. The metrics are only written to standard output.
    """
    print(f"=== {model_name} Model Evaluation ===")
    # Prefix the accuracy with the caller-supplied name: this helper is shared by
    # every classifier in the notebook, so a fixed "Logistic Regression" prefix
    # would mislabel all the other models.
    print(f"{model_name} - Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print("\n")

In [None]:
# Print accuracy and the classification report for the logistic regression predictions
evaluate_model(y_true=y_test, y_pred=lr_pred, model_name="Credit Risk Prediction")

## Random Forest Classifier

In [None]:
# Train
rf_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

In [None]:
rf_model.fit(X_train, y_train)

In [None]:
joblib.dump(rf_model, "rf_risk_model.pkl")

In [None]:
# Evaluate
rf_pred = rf_model.predict(X_test)

In [None]:
# Evaluate the model
evaluate_model(y_test, rf_pred, "Credit Risk Prediction")

In [None]:
features_imp_rf = pd.DataFrame({'features': balanced_features.columns, 'rf_imp': rf_model.feature_importances_})

## K-Nearest Neighbors

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5)

In [None]:
knn_model.fit(X_train, y_train)

In [None]:
knn_pred = knn_model.predict(X_test)

In [None]:
evaluate_model(y_test, knn_pred, "Credit Risk Prediction")

## XGBoost Classifier

In [None]:
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)

In [None]:
xgb_model.fit(X_train, y_train)

In [None]:
xgb_pred = xgb_model.predict(X_test)

In [None]:
evaluate_model(y_test, xgb_pred, "Credit Risk Prediction")

## Support Vector Machine

In [None]:
svm_model = SVC(kernel='rbf', C=1, gamma='scale')

In [None]:
svm_model.fit(X_train, y_train)

In [None]:
svm_pred = svm_model.predict(X_test)

In [None]:
evaluate_model(y_test, svm_pred, "Credit Risk Prediction")

## Decision Tree Classifier

In [None]:
dtc_model = DecisionTreeClassifier(max_depth=5, random_state=42)

In [None]:
dtc_model.fit(X_train, y_train)

In [None]:
dtc_pred = dtc_model.predict(X_test)

In [None]:
evaluate_model(y_test, dtc_pred, "Credit Risk Prediction")