In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import pickle 

In [2]:

# Load the dataset (replace 'path_to_your_data.csv' with your actual file path)
df = pd.read_csv('C:/Users/elbet/OneDrive/Desktop/Ten/week-6/github-notebook/Credit-Scoring-Model-/data/data/df_woe.csv')
df = df.drop(columns=['TransactionId','SubscriptionId','AccountId','ProductId','BatchId',
                      'CustomerId','CurrencyCode','CountryCode','TransactionStartTime',
                      'FraudResult','TransactionYear','TransactionDay','PricingStrategy','TransactionHour','TransactionMonth'])
print(df.columns)

Index(['RFMS_Score', 'TransactionCount', 'ProductCategory_woe',
       'ChannelId_woe', 'AverageTransactionAmount', 'TransactionAmountStd',
       'RiskLabel', 'Value', 'Amount', 'ProviderId_woe',
       'TotalTransactionAmount', 'Frequency_woe', 'Monetary_woe',
       'Recency_woe', 'Seniority_woe'],
      dtype='object')


In [3]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()
df['RiskLabel'] = label_encoder.fit_transform(df['RiskLabel'])

# Split data into features (X) and target variable (y)
X = df.drop(columns=['RiskLabel'])  # Features
y = df['RiskLabel']  # Target variable

# Split the data: 80% training, 10% validation, 10% testing
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42, stratify=y_val)

In [4]:
# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

    return roc_auc


In [5]:
# Random Forest hyperparameter grid
rf_param_grid = {
    'n_estimators': [150, 300, 600],
    'max_depth': [15, 25, 35, None],
    'min_samples_split': [3, 6, 12],
    'min_samples_leaf': [2, 3, 5],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}


# Early stopping configuration
best_model = None
best_val_score = 0
patience = 3  # Number of epochs with no improvement to wait before stopping
stopping_counter = 0

# Training loop with early stopping for RandomForestClassifier with hyperparameter tuning
for epoch in range(3):  # Adjust the number of epochs as needed
    print(f"\nEpoch {epoch + 1}/{3}")

    # Hyperparameter tuning using GridSearchCV for Random Forest
    rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                                  param_grid=rf_param_grid, cv=5, n_jobs=-1, verbose=1)
    rf_grid_search.fit(X_train, y_train)

    # Get the best model and evaluate on the validation set
    best_rf_model = rf_grid_search.best_estimator_
    val_preds = best_rf_model.predict(X_val)
    val_score = accuracy_score(y_val, val_preds)  # Use appropriate metric

    print(f'Epoch {epoch}, Best Random Forest Validation Accuracy: {val_score}, Best Params: {rf_grid_search.best_params_}')

    # Check if the current model is the best model
    if val_score > best_val_score:
        best_val_score = val_score
        best_model = best_rf_model
        stopping_counter = 0  # Reset counter if there is improvement
    else:
        stopping_counter += 1

    # Early stopping condition
    if stopping_counter >= patience:
        print("Early stopping triggered for Random Forest!")
        break

# After early stopping, the best Random Forest model is used
print("\nBest Random Forest Validation Score Achieved:", best_val_score)

# Evaluate the best Random Forest model on the test set
print("\nBest Random Forest Model Performance on Test Set:")
evaluate_model(best_model, X_test, y_test)

# Save the best Random Forest model to a .pkl file
with open('C:/Users/elbet/OneDrive/Desktop/Ten/week-6/github-notebook/Credit-Scoring-Model-/rf_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print("Best Random Forest model saved as 'best_rf_model.pkl'")


Epoch 1/3
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Epoch 0, Best Random Forest Validation Accuracy: 1.0, Best Params: {'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 150}

Epoch 2/3
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Epoch 1, Best Random Forest Validation Accuracy: 1.0, Best Params: {'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 150}

Epoch 3/3
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Epoch 2, Best Random Forest Validation Accuracy: 1.0, Best Params: {'bootstrap': True, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 150}

Best Random Forest Validation Score Achieved: 1.0

Best Random Forest Model Performance on Test Set:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC-AUC: 1.0000
Be

In [7]:
# Gradient Boosting hyperparameter grid
gbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Early stopping for Gradient Boosting with hyperparameter tuning
best_gbm_model = None
best_val_score_gbm = 0
stopping_counter_gbm = 0

for epoch in range(3):  # Adjust the number of epochs as needed
    print(f"\nEpoch {epoch + 1}/{3}")

    # Hyperparameter tuning using RandomizedSearchCV for Gradient Boosting
    gbm_random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                                           param_distributions=gbm_param_grid, n_iter=50, cv=5, random_state=42, n_jobs=-1, verbose=1)
    gbm_random_search.fit(X_train, y_train)

    # Get the best model and evaluate on the validation set
    best_gbm_model = gbm_random_search.best_estimator_
    val_preds_gbm = best_gbm_model.predict(X_val)
    val_score_gbm = accuracy_score(y_val, val_preds_gbm)

    print(f'Epoch {epoch}, Best GBM Validation Accuracy: {val_score_gbm}, Best Params: {gbm_random_search.best_params_}')

    if val_score_gbm > best_val_score_gbm:
        best_val_score_gbm = val_score_gbm
        best_gbm_model = best_gbm_model
        stopping_counter_gbm = 0
    else:
        stopping_counter_gbm += 1

    if stopping_counter_gbm >= patience:
        print("Early stopping triggered for GBM!")
        break

# After early stopping for Gradient Boosting, the best GBM model is used
print("\nBest GBM Validation Score Achieved:", best_val_score_gbm)

# Evaluate the best Gradient Boosting model on the test set
print("\nBest GBM Model Performance on Test Set:")
evaluate_model(best_gbm_model, X_test, y_test)

# Save the best GBM model to a .pkl file
with open('C:/Users/elbet/OneDrive/Desktop/Ten/week-6/github-notebook/Credit-Scoring-Model-/gbm_model.pkl', 'wb') as file:
    pickle.dump(best_gbm_model, file)

print("Best Gradient Boosting model saved as 'best_gbm_model.pkl'")


Epoch 1/3
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Epoch 0, Best GBM Validation Accuracy: 1.0, Best Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 3, 'learning_rate': 0.01}

Epoch 2/3
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Epoch 1, Best GBM Validation Accuracy: 1.0, Best Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 3, 'learning_rate': 0.01}

Epoch 3/3
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Epoch 2, Best GBM Validation Accuracy: 1.0, Best Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 3, 'learning_rate': 0.01}

Best GBM Validation Score Achieved: 1.0

Best GBM Model Performance on Test Set:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC-AUC: 1.0000
Best Gradient Boosting model saved as 'best_gbm_model.pkl'


In [None]:
# Voting Classifier that combines the best Random Forest and Gradient Boosting models
voting_clf = VotingClassifier(estimators=[
    ('rf', best_model),
    ('gbm', best_gbm_model)
], voting='soft')

# Train the Voting Classifier
voting_clf.fit(X_train, y_train)

# Evaluate the Voting Classifier
print("\nVoting Classifier Performance:")
evaluate_model(voting_clf, X_test, y_test)

# Save the Voting Classifier to a .pkl file
with open('C:/Users/elbet/OneDrive/Desktop/Ten/week-6/github-notebook/Credit-Scoring-Model-/final_model.pkl', 'wb') as file:
    pickle.dump(voting_clf, file)

print("Voting Classifier model saved as 'ensemble_voting_model.pkl'")
