In [6]:
import pandas as pd
data = pd.read_csv('/workspaces/predict-bank-credit-risk/dataset/SouthGermanCredit.asc', sep=' ')

In [7]:
# scaling the dataset

from sklearn.preprocessing import StandardScaler

def scale_dataset(data, columns_to_scale):
    """
    Scale specified numerical columns in the dataset using StandardScaler.

    Parameters:
    - data: DataFrame, the input dataset
    - columns_to_scale: list, names of numerical columns to be scaled

    Returns:
    - DataFrame, the scaled dataset
    """
    scaler = StandardScaler()
    scaled_data = data.copy()

    # Apply StandardScaler to specified columns
    scaled_data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

    return scaled_data

# Assuming 'df' is your DataFrame and 'columns_to_scale' is a list of numerical column names
# Replace these with your actual variable names
df_scaled = scale_dataset(data, columns_to_scale=['laufzeit', 'hoehe', 'beszeit', 'rate', 'wohnzeit', 'alter', 'pers'])

In [8]:
# separating the target variable
X = df_scaled.drop('kredit', axis=1)
y = df_scaled['kredit']

In [12]:
# function to balance the dataset
from imblearn.over_sampling import SMOTE
def balance_dataset(X, y):
    """
    Balance the dataset using SMOTE.

    Parameters:
    - X: DataFrame, the input dataset
    - y: Series, the target variable

    Returns:
    - DataFrame, the balanced dataset
    - Series, the balanced target variable
    """
    smote = SMOTE(random_state=0)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    return X_balanced, y_balanced

# balancing the dataset
X_balanced, y_balanced = balance_dataset(X, y)

In [13]:
# split the dataset into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=0)

In [17]:
# function to train the model

# imporing various classification model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# importing various evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# initializing the models
log_reg = LogisticRegression()
dec_tree = DecisionTreeClassifier()
rand_forest = RandomForestClassifier()
svm = SVC()

# function to train the model, predict on test set and evaluate the model
def train_model(model, X_train, y_train, X_test, y_test):
    """
    Train the specified model, predict on test set and evaluate the model.

    Parameters:
    - model: the model to be trained
    - X_train: DataFrame, the training set
    - y_train: Series, the training target variable
    - X_test: DataFrame, the test set
    - y_test: Series, the test target variable

    Returns:
    - DataFrame, the balanced dataset
    - Series, the balanced target variable
    """
    # Train the model
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
    print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
    print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
    print('F1: {:.2f}'.format(f1_score(y_test, y_pred)))
    print('AUC: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
    print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))


In [19]:
# using for loop to train and evaluate the model
for model in [log_reg, dec_tree, rand_forest, svm]:
    print('\n', type(model).__name__)
    train_model(model, X_train, y_train, X_test, y_test)


 LogisticRegression
Accuracy: 0.73
Precision: 0.74
Recall: 0.70
F1: 0.72
AUC: 0.73
Confusion Matrix:
 [[110  34]
 [ 41  95]]

 DecisionTreeClassifier
Accuracy: 0.75
Precision: 0.74
Recall: 0.74
F1: 0.74
AUC: 0.75
Confusion Matrix:
 [[109  35]
 [ 36 100]]

 RandomForestClassifier
Accuracy: 0.82
Precision: 0.82
Recall: 0.81
F1: 0.81
AUC: 0.82
Confusion Matrix:
 [[120  24]
 [ 26 110]]

 SVC
Accuracy: 0.77
Precision: 0.78
Recall: 0.74
F1: 0.76
AUC: 0.77
Confusion Matrix:
 [[115  29]
 [ 35 101]]


In [20]:
# hyperparameter tuning random forest classifier using GridSearchCV
from sklearn.model_selection import GridSearchCV
def tune_model(model, X_train, y_train):
    """
    Tune the specified model using GridSearchCV.

    Parameters:
    - model: the model to be tuned
    - X_train: DataFrame, the training set
    - y_train: Series, the training target variable

    Returns:
    - GridSearchCV, the tuned model
    """
    # Define hyperparameter grid
    param_grid = {'n_estimators': [100, 200, 300, 400, 500],
                  'max_depth': [1, 3, 5, 7, 9],
                  'min_samples_leaf': [1, 2, 3, 4, 5]}

    # Define grid search
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)

    # Train grid search
    grid_search.fit(X_train, y_train)

    return grid_search

In [21]:
# tuning the random forest classifier
rand_forest_tuned = tune_model(rand_forest, X_train, y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


In [22]:
# printing the best parameters
print(rand_forest_tuned.best_params_)

{'max_depth': 9, 'min_samples_leaf': 1, 'n_estimators': 500}


In [23]:
# use the best parameters to train the model
rand_forest_tuned = rand_forest_tuned.best_estimator_
rand_forest_tuned.fit(X_train, y_train)

In [24]:
# predict on test set
y_pred = rand_forest_tuned.predict(X_test)

In [25]:
# evaluate the model
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
print('F1: {:.2f}'.format(f1_score(y_test, y_pred)))
print('AUC: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Accuracy: 0.81
Precision: 0.81
Recall: 0.79
F1: 0.80
AUC: 0.81
Confusion Matrix:
 [[119  25]
 [ 29 107]]


In [28]:
# save the model into artifacts folder 
import pickle
pickle.dump(rand_forest_tuned, open('/workspaces/predict-bank-credit-risk/artifacts/model.pkl', 'wb'))