In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (roc_auc_score, accuracy_score, f1_score, confusion_matrix)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
import seaborn as sns

D:\anaconda\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
D:\anaconda\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
  from pandas.core import (
Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
import random

In [3]:
# Set the global random seed
def set_global_seed(seed_value):
    """Seed Python's ``random`` module and NumPy's global random state.

    Args:
        seed_value: Seed forwarded unchanged to ``random.seed`` and
            ``np.random.seed``.
    """
    # Standard-library generator first, then NumPy's global generator.
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed_value)


set_global_seed(66)

In [4]:
# Read the dataset, then separate the target column from the feature matrix
data = pd.read_csv('xxx.csv')
y = data['xxx']                        # target column, kept as a pandas Series
x = data.drop(columns=['xxx']).values  # every other column, as a NumPy array

In [5]:
# Split into training and testing sets (80% / 20%, fixed seed for a repeatable split).
# stratify=y keeps the class proportions of the full dataset in both partitions, so
# the held-out metrics are not skewed by a test set whose class mix drifted by chance.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Class balancing (oversampling the training set)

In [6]:
from collections import Counter
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import BorderlineSMOTE, KMeansSMOTE, RandomOverSampler, SVMSMOTE, SMOTE
from sklearn.model_selection import train_test_split

In [None]:
# 1. Use BorderlineSMOTE
# Only the training split is passed to the resampler; test_x / test_y are not touched here.
borderline_smote = BorderlineSMOTE(random_state=0)
train_x_borderline, train_y_borderline = borderline_smote.fit_resample(
    train_x,
    train_y,
)
# Report the per-class sample counts after resampling
print('BorderlineSMOTE Resampled dataset shape %s' % (Counter(train_y_borderline),))

In [8]:
# Shapes of the current train/test features and labels
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1035, 37), (259, 37), (1035,), (259,))

In [None]:
# Make the BorderlineSMOTE-resampled data the working training set.
# This rebinds train_x / train_y, so the un-resampled training split is no longer
# reachable under those names in the cells that follow.
train_x, train_y = train_x_borderline, train_y_borderline

In [10]:
# Shapes of the current train/test features and labels
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1262, 37), (259, 37), (1262,), (259,))

# Hyperparameter optimisation

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

In [None]:
# Define the hyperparameter search range
# One grid per model, keyed by the same display name that `models` uses.
param_grids = {
    "Logistic Regression": dict(
        C=[0.001, 0.01, 0.1, 1, 10],
        penalty=['l1', 'l2'],
        solver=['liblinear'],  # liblinear handles both the l1 and the l2 penalty
    ),
    "Decision Tree": dict(
        max_depth=[5, 10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "Random Forest": dict(
        n_estimators=[100, 200, 500],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "Extra Trees": dict(
        n_estimators=[100, 200, 500],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "LightGBM": dict(
        n_estimators=[100, 200, 500],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[10, 20, 30],
        num_leaves=[31, 50, 100],
    ),
    "XGBoost": dict(
        n_estimators=[100, 200, 500],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[6, 10, 15],
        colsample_bytree=[0.7, 0.8, 1.0],
    ),
    "CatBoost": dict(
        iterations=[500, 1000],
        learning_rate=[0.01, 0.05, 0.1],
        depth=[6, 10, 15],
    ),
    "Neural Network": dict(
        hidden_layer_sizes=[(100,), (50, 50), (100, 50)],
        learning_rate_init=[0.001, 0.01, 0.1],
        max_iter=[500, 1000],
    ),
}

# Define model dictionary (without optimisation parameters)
# Every estimator receives an explicit seed. The grid search runs with n_jobs=-1, and
# seeding only the global `random` / NumPy generators in the notebook process is not
# guaranteed to carry over to worker processes, so an unseeded estimator could give
# different results from one run to the next.
MODEL_SEED = 66  # same value that is passed to set_global_seed

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=MODEL_SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "Extra Trees": ExtraTreesClassifier(random_state=MODEL_SEED),
    "LightGBM": lgb.LGBMClassifier(random_state=MODEL_SEED),
    "XGBoost": XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', random_state=MODEL_SEED
    ),
    "CatBoost": CatBoostClassifier(silent=True, random_seed=MODEL_SEED),
    "Neural Network": MLPClassifier(max_iter=1000, random_state=MODEL_SEED)
}

# Containers for the tuned estimators and their grid-search summary, keyed by model name
best_models, results = {}, {}

# Tune each model in turn with an exhaustive grid search
for model_name, model in models.items():
    print(f"Optimizing {model_name}...")

    # Five-fold cross-validation, candidates ranked by ROC AUC, all CPU cores in use
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(train_x, train_y)

    # Keep the refitted best estimator plus the parameters and score that selected it
    best_models[model_name] = search.best_estimator_
    results[model_name] = {
        "Best Params": search.best_params_,
        "Best Score (ROC AUC)": search.best_score_,
    }

In [13]:
# Export results to a CSV file
# Build the table from the `results` dict filled by the grid search: one row per model,
# with the model name as an explicit column because the export drops the index.
results_df = pd.DataFrame(
    [{"Model": name, **summary} for name, summary in results.items()]
)
# NOTE(review): 'xxx.csv' is also the path passed to pd.read_csv when the data is
# loaded - confirm that this export is meant to go to a different file.
results_df.to_csv('xxx.csv', index=False)

In [None]:
# Evaluate optimized models
from sklearn.model_selection import cross_validate  # multi-metric CV in a single pass

# Metrics gathered per model during cross-validation
cv_scoring = ['accuracy', 'f1', 'roc_auc', 'precision', 'recall']

test_results = {}
for model_name, model in best_models.items():
    # Hard class predictions on the held-out test set
    predictions = model.predict(test_x)

    # Get predicted probabilities for ROC AUC calculation
    if hasattr(model, "predict_proba"):
        prob_predictions = model.predict_proba(test_x)[:, 1]  # Probability of the positive class
    else:
        prob_predictions = model.decision_function(test_x)

    # One 5-fold cross-validation pass scores every metric on the same fitted models,
    # instead of refitting the model five times per metric.
    # NOTE(review): if train_x / train_y hold the resampled data at this point, synthetic
    # samples also land in the validation folds, so these CV figures are likely to be
    # optimistic compared with the test-set figures - confirm this is intended.
    cv_scores = cross_validate(model, train_x, train_y, cv=5, scoring=cv_scoring)
    cv_accuracy = cv_scores['test_accuracy'].mean()
    cv_f1 = cv_scores['test_f1'].mean()
    cv_roc_auc = cv_scores['test_roc_auc'].mean()
    cv_precision = cv_scores['test_precision'].mean()
    cv_recall = cv_scores['test_recall'].mean()

    # Out-of-fold predictions, needed only for the cross-validated confusion matrix
    cv_predictions = cross_val_predict(model, train_x, train_y, cv=5)
    cv_confusion_matrix = confusion_matrix(train_y, cv_predictions)

    # Calculate Precision and Recall on the test set
    test_precision = precision_score(test_y, predictions)
    test_recall = recall_score(test_y, predictions)

    test_results[model_name] = {
        "Accuracy": accuracy_score(test_y, predictions),
        "F1 Score": f1_score(test_y, predictions),
        "Precision": test_precision,          # Precision on test set
        "Recall": test_recall,                # Recall on test set
        "ROC AUC": roc_auc_score(test_y, prob_predictions),
        "Confusion Matrix": confusion_matrix(test_y, predictions),
        "CV Accuracy": cv_accuracy,           # Accuracy from cross-validation
        "CV F1 Score": cv_f1,                 # F1 Score from cross-validation
        "CV Precision": cv_precision,         # Precision from cross-validation
        "CV Recall": cv_recall,               # Recall from cross-validation
        "CV ROC AUC": cv_roc_auc,             # ROC AUC from cross-validation
        "CV Confusion Matrix": cv_confusion_matrix  # Confusion matrix from cross-validation
    }


In [None]:
# Plot confusion matrix from cross-validation
def plot_confusion_matrix(conf_matrix, model_name):
    """Draw ``conf_matrix`` as an annotated heatmap and show the figure.

    Args:
        conf_matrix: Matrix of counts; each cell is annotated using the ``'d'``
            (integer) format.
        model_name: Name inserted into the figure title.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name} (Cross-Validation)')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    plt.show()

# Plot the cross-validated confusion matrix of every model stored in test_results.
# The per-model dict is bound to `model_metrics` rather than `results`, so this loop
# does not overwrite the `results` dict that holds the grid-search summary.
for model_name, model_metrics in test_results.items():
    cv_conf_matrix = model_metrics["CV Confusion Matrix"]
    plot_confusion_matrix(cv_conf_matrix, model_name)

In [None]:
# Export results to a CSV file
# Build the table from the `test_results` dict: one row per model, with the model name
# as an explicit column because the export drops the index. Confusion matrices are
# converted to nested lists so each one is written as a single-line cell rather than
# a multi-line NumPy array repr.
test_results_df = pd.DataFrame(
    [
        {
            "Model": name,
            **{
                key: value.tolist() if isinstance(value, np.ndarray) else value
                for key, value in metrics.items()
            },
        }
        for name, metrics in test_results.items()
    ]
)
test_results_df.to_csv('BorderlineSMOTE-结果.csv', index=False)