In [38]:
# This code shows the required python packages
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score, confusion_matrix
from lightgbm import LGBMClassifier
import hyperopt
from hyperopt import hp
import joblib
import warnings
warnings.filterwarnings("ignore")


In [39]:

# Specify the type of features to use: either all metrics or derived indicators
DEMorAll = "AllMetrics"  # Options: DemDerivedIndicators OR AllMetrics

# Load the dataset (the CSV is read with the GBK codec)
data01 = pd.read_csv('1844Points.csv', encoding='gbk')

# Rename columns for better understanding
data01.columns = ['FID', 'X', 'Y', 'Aspect', 'Elevation', 'Distance to lineament', 'Lineament density',
                  'NDVI', 'Plan curvature', 'Profile curvature', 'Slope', 'Slope length', 'STI',
                  'SPI', 'TPI', 'TWI', 'VRM', 'LULC', 'Habitat', 'GDP',
                  'Distance from River', 'Distance from Road', 'target']

# Columns excluded from the feature set. 'FID', 'X' and 'Y' are always removed;
# any mode other than 'AllMetrics' additionally removes five more columns.
columns_to_drop = ['FID', 'X', 'Y']
if DEMorAll != 'AllMetrics':
    columns_to_drop += ['LULC', 'Habitat', 'GDP', 'NDVI', 'Distance from Road']

# Re-assign instead of mutating in place so the cell reads as a plain
# "raw frame -> reduced frame" step.
data01 = data01.drop(columns=columns_to_drop)

# Define output file and model paths.
# The paths are assembled with os.path.join rather than a hard-coded backslash:
# '\l' is not a recognised escape sequence (newer Python versions warn about it)
# and a backslash is only a path separator on Windows.
output_dir = 'Output'
os.makedirs(output_dir, exist_ok=True)  # later cells write into this folder
filename = os.path.join(output_dir, 'light_{}.txt'.format(DEMorAll))
model_path = os.path.join(output_dir, 'light_{}.model'.format(DEMorAll))

max_evals = 10  # Number of evaluations for hyperparameter tuning
seed = 4  # Random seed for reproducibility
n_splits = 4  # Number of folds for K-Fold cross-validation

In [40]:
# Save the column names
header = data01.columns.tolist()


def _split_class_rows(class_rows, test_size=0.2, random_state=None):
    """Split the rows of one target class into a training and a validation frame.

    Parameters
    ----------
    class_rows : pandas.DataFrame
        Rows belonging to a single class; must contain a 'target' column.
    test_size : float
        Fraction of the rows placed in the validation frame.
    random_state : int or None
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_part, valid_part)``; each holds the feature columns followed by
        'target' and carries a fresh 0..n-1 index.
    """
    features = class_rows.drop(columns=["target"])
    labels = class_rows["target"]
    fit_X, held_X, fit_y, held_y = train_test_split(
        features, labels, test_size=test_size, random_state=random_state)
    # Label the stacked arrays from the pieces that were actually stacked, so the
    # result stays correct even if 'target' is not the last column of the input.
    stacked_columns = features.columns.tolist() + ["target"]
    train_part = pd.DataFrame(np.column_stack([fit_X, fit_y]), columns=stacked_columns)
    valid_part = pd.DataFrame(np.column_stack([held_X, held_y]), columns=stacked_columns)
    return train_part, valid_part


# Split the dataset into two dataframes based on the target value
data_0 = data01.loc[data01['target'] == 0]  # Rows with target = 0
data_1 = data01.loc[data01['target'] == 1]  # Rows with target = 1

# Split each class separately into 80% training and 20% validation rows, so both
# subsets keep the class proportions of the full dataset.
save_TrainDate_0, save_ValidDate_0 = _split_class_rows(data_0, test_size=0.2, random_state=seed)
save_TrainDate_1, save_ValidDate_1 = _split_class_rows(data_1, test_size=0.2, random_state=seed)

# Combine the per-class training frames and shuffle to avoid ordering bias.
# ignore_index / reset_index give every row a unique label; the shuffled row
# order itself is positional and therefore unaffected by the relabelling.
train_date = (
    pd.concat([save_TrainDate_0, save_TrainDate_1], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Combine the per-class validation frames and shuffle them the same way
valid_date = (
    pd.concat([save_ValidDate_0, save_ValidDate_1], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Separate features (X) and target (y) from the training dataset
train_y = train_date["target"]
train_X = train_date.drop(columns=["target"])

# Separate features (X) and target (y) from the validation dataset
valid_y = valid_date["target"]
valid_X = valid_date.drop(columns=["target"])


In [41]:

# Define cross-validation function
def cross_validation(model_params, train_X_fold, train_y_fold, valid_X_fold, valid_y_fold):
    """Fit one LightGBM classifier on a single fold and return its loss.

    Parameters
    ----------
    model_params : dict
        Keyword arguments forwarded to ``LGBMClassifier``. The score lookup
        below reads the 'auc' entry, so the parameters must request that metric.
    train_X_fold, train_y_fold
        Features and labels the model is fitted on.
    valid_X_fold, valid_y_fold
        Features and labels of the held-out fold.

    Returns
    -------
    float
        ``1 - AUC`` on the held-out fold (hyperopt minimises its objective).
    """
    # Evaluation logging every 100 iterations. The `verbose` argument of fit()
    # is not accepted by recent LightGBM releases, which use a callback instead;
    # fall back to `verbose` when the callback is not available.
    fit_kwargs = {}
    try:
        from lightgbm import log_evaluation
        fit_kwargs["callbacks"] = [log_evaluation(period=100)]
    except ImportError:
        fit_kwargs["verbose"] = 100

    gbm = LGBMClassifier(**model_params)
    gbm.fit(
        train_X_fold,
        train_y_fold,
        # Two eval sets: the first is reported as "training", the second as
        # "valid_1" - the name used for the lookup below.
        eval_set=[(train_X_fold, train_y_fold), (valid_X_fold, valid_y_fold)],
        **fit_kwargs,
    )
    best_score = gbm.best_score_['valid_1']['auc']
    return 1 - best_score  # Return 1 - AUC as hyperopt minimizes the objective

# Define the hyperparameter optimization objective function
def hyperopt_objective(params):
    """Score one hyperparameter sample with K-fold cross-validation.

    Parameters
    ----------
    params : dict
        One sample drawn from the search space; the tuned keys listed in
        ``tuned_keys`` below must be present.

    Returns
    -------
    float
        Mean of ``1 - AUC`` over the ``n_splits`` folds of the training data
        (lower is better).
    """
    print("*" * 30)
    tuned_keys = (
        'max_depth', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda',
        'num_leaves', 'learning_rate', 'boosting_type', 'bagging_freq',
        'bagging_fraction', 'feature_fraction',
    )
    # Fixed settings first, then the values sampled for this trial.
    # NOTE(review): no iteration count is set here, so the classifier's default
    # number of boosting rounds applies during cross-validation, while the final
    # model is configured separately - confirm this difference is intended.
    cur_param = {
        'objective': 'binary',
        'early_stopping_rounds': 200,
        'metric': 'auc',
        'importance_type': 'gain',
    }
    cur_param.update({key: params[key] for key in tuned_keys})

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_losses = []
    for train_index, valid_index in kf.split(train_X):
        # KFold yields positional indices, hence .iloc
        train_X_fold, train_y_fold = train_X.iloc[train_index], train_y.iloc[train_index]
        valid_X_fold, valid_y_fold = train_X.iloc[valid_index], train_y.iloc[valid_index]
        fold_losses.append(
            cross_validation(cur_param, train_X_fold, train_y_fold, valid_X_fold, valid_y_fold))
    res = sum(fold_losses) / len(fold_losses)
    # This is the mean over the folds of the *current* trial, not the best
    # value seen so far, so the message says so.
    print("Current trial mean 1-AUC score is: {}, AUC score is: {}".format(res, 1 - res))
    return res  # Minimize objective

# Define hyperparameter search space
# Shared building blocks for the dimensions below.
_SAMPLING_FRACTIONS = [0.8, 0.9, 1.0]
_REG_LOG_LOW, _REG_LOG_HIGH = np.log(0.01), np.log(1000)

# Three constant entries followed by the tuned dimensions.
# NOTE(review): LightGBM documents `subsample` as an alias of `bagging_fraction`
# and `colsample_bytree` as an alias of `feature_fraction`; both members of each
# pair are searched here - confirm which one actually takes effect.
params_space = dict(
    objective='binary',
    metric='auc',
    importance_type='gain',
    max_depth=hp.choice('max_depth', range(1, 5)),
    subsample=hp.choice('subsample', _SAMPLING_FRACTIONS),
    colsample_bytree=hp.choice('colsample_bytree', _SAMPLING_FRACTIONS),
    reg_alpha=hp.loguniform('reg_alpha', _REG_LOG_LOW, _REG_LOG_HIGH),
    reg_lambda=hp.loguniform('reg_lambda', _REG_LOG_LOW, _REG_LOG_HIGH),
    boosting_type=hp.choice('boosting_type', ['gbdt', 'dart', 'rf']),
    num_leaves=hp.choice('num_leaves', range(15, 128)),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.5),
    bagging_freq=hp.choice('bagging_freq', range(4, 7)),
    bagging_fraction=hp.uniform('bagging_fraction', 0.5, 0.9),
    feature_fraction=hp.uniform('feature_fraction', 0.5, 0.9),
)

# Initialize hyperopt trials
trials = hyperopt.Trials()

# Make the search repeatable. fmin() is called without an explicit `rstate`;
# in that case it reads its seed from the HYPEROPT_FMIN_SEED environment
# variable, which avoids depending on the random-generator type a particular
# hyperopt version expects for `rstate`.
os.environ["HYPEROPT_FMIN_SEED"] = str(seed)

# Perform hyperparameter optimization
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=max_evals,
    trials=trials)

# For hp.choice dimensions `best` holds the index of the selected option, not
# the option itself; space_eval maps the result back to actual parameter values.
best_params = hyperopt.space_eval(params_space, best)

print("Best parameters")
print(best_params)

# Add the fixed settings used for the final model
best_params['objective'] = 'binary'
best_params['metric'] = 'auc'
best_params['num_iterations'] = 500
best_params['early_stopping_rounds'] = 200
best_params['importance_type'] = 'split'


******************************                                                                                         
[100]	training's auc: 0.873306	valid_1's auc: 0.830317                                                                 
[100]	training's auc: 0.867624	valid_1's auc: 0.847579                                                                 
[100]	training's auc: 0.864162	valid_1's auc: 0.862321                                                                 
[100]	training's auc: 0.859909	valid_1's auc: 0.878079                                                                 
Current best 1-AUC score is: 0.14542590091066435, AUC score is: 0.8545740990893357                                     
******************************                                                                                         
[100]	training's auc: 0.883771	valid_1's auc: 0.837232                                                                 
[100]	training's auc: 0.882502	valid_1's

In [42]:
# Train the final model with the best parameters
# NOTE(review): the validation set is passed as an eval_set while best_params
# carries 'early_stopping_rounds', and the metrics below are computed on that
# same set - the reported scores may therefore be optimistic.
light_model = LGBMClassifier(**best_params)
light_model.fit(train_X, train_y, eval_set=[(train_X, train_y), (valid_X, valid_y)])

# Make sure the target folders exist before anything is written
for out_path in (model_path, filename):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# Save the trained model
joblib.dump(light_model, model_path)
print("Model successfully saved at:", model_path)

# Predict and evaluate the model.
# Probabilities and hard labels are kept in separate variables: ranking and
# probabilistic metrics (AUC, AP, log-loss) need the probabilities, while the
# confusion matrix and the label-based metrics need the thresholded labels.
y_prob1 = light_model.predict_proba(valid_X)[:, 1]
auc1 = roc_auc_score(valid_y, y_prob1)
y_pred1 = (y_prob1 >= 0.5) * 1
a = confusion_matrix(valid_y, y_pred1).tolist()
a0 = str(a[0])
a1 = str(a[1])

# Calculate evaluation metrics
Kfold_auc = 1 - trials.best_trial['result']['loss']  # objective returned 1 - AUC
K_AUC = 'K_AUC: %.4f' % Kfold_auc
Precesion = 'Precision: %.4f' % metrics.precision_score(valid_y, y_pred1)
Recall = 'Recall: %.4f' % metrics.recall_score(valid_y, y_pred1)
F1_score = 'F1-score: %.4f' % metrics.f1_score(valid_y, y_pred1)
Accuracy = 'Accuracy: %.4f' % metrics.accuracy_score(valid_y, y_pred1)
AUC = 'AUC: %.4f' % auc1
AP = 'AP: %.4f' % metrics.average_precision_score(valid_y, y_prob1)
# `eps` is no longer passed: recent scikit-learn releases do not accept it.
Log_loss = 'Log_loss: %.4f' % metrics.log_loss(valid_y, y_prob1)
kappa_score = 'Kappa_score: %.4f' % metrics.cohen_kappa_score(valid_y, y_pred1)

# The report strings get their own names so that the imported
# `confusion_matrix` function and `metrics` module are not overwritten and the
# cell can be run again.
confusion_matrix_text = f'{a0}\n{a1}\n'
metrics_text = f'{K_AUC}\n{AUC}\n{Precesion}\n{Recall}\n{F1_score}\n{Accuracy}\n{AP}\n{Log_loss}\n{kappa_score}\n'

# Process feature importance: sort descending and express as a percentage
my_dict = dict(zip(train_X.columns, light_model.feature_importances_))
sorted_dict = dict(sorted(my_dict.items(), key=lambda item: item[1], reverse=True))
total = sum(sorted_dict.values())
if total:
    dict1 = {key: (value / total) * 100 for key, value in sorted_dict.items()}
else:
    # Every importance is zero; avoid dividing by zero
    dict1 = {key: 0.0 for key in sorted_dict}

# Save evaluation metrics and feature importance to a file
with open(filename, 'w', encoding='utf-8') as f:
    f.write('---------Confusion Matrix---------\n')
    f.write(confusion_matrix_text)
    f.write('--------Evaluation Metrics--------\n')
    f.write(metrics_text)
    f.write('-------Feature Importance---------\n')
    for key, value in dict1.items():
        f.write(f'{key}: {value:.2f}\n')
    f.write('----------Best Parameters---------\n')
    f.write(str(best_params))
    f.write('\n')
    seed_str = f'seed = {seed}'
    f.write('----------Seed Value--------------\n')
    f.write(seed_str)

[1]	training's auc: 0.782396	valid_1's auc: 0.752885
[2]	training's auc: 0.788338	valid_1's auc: 0.74504
[3]	training's auc: 0.835214	valid_1's auc: 0.794492
[4]	training's auc: 0.835642	valid_1's auc: 0.794872
[5]	training's auc: 0.840854	valid_1's auc: 0.796713
[6]	training's auc: 0.839118	valid_1's auc: 0.793543
[7]	training's auc: 0.838559	valid_1's auc: 0.791775
[8]	training's auc: 0.838743	valid_1's auc: 0.792082
[9]	training's auc: 0.83812	valid_1's auc: 0.790855
[10]	training's auc: 0.837509	valid_1's auc: 0.790153
[11]	training's auc: 0.855254	valid_1's auc: 0.810679
[12]	training's auc: 0.854492	valid_1's auc: 0.810037
[13]	training's auc: 0.858127	valid_1's auc: 0.815486
[14]	training's auc: 0.859633	valid_1's auc: 0.812885
[15]	training's auc: 0.862384	valid_1's auc: 0.817648
[16]	training's auc: 0.86433	valid_1's auc: 0.820307
[17]	training's auc: 0.865078	valid_1's auc: 0.821037
[18]	training's auc: 0.865379	valid_1's auc: 0.821169
[19]	training's auc: 0.865145	valid_1's 

[178]	training's auc: 0.887443	valid_1's auc: 0.839109
[179]	training's auc: 0.887489	valid_1's auc: 0.838758
[180]	training's auc: 0.887483	valid_1's auc: 0.838641
[181]	training's auc: 0.887445	valid_1's auc: 0.838466
[182]	training's auc: 0.887888	valid_1's auc: 0.839357
[183]	training's auc: 0.887817	valid_1's auc: 0.839416
[184]	training's auc: 0.887719	valid_1's auc: 0.839153
[185]	training's auc: 0.887629	valid_1's auc: 0.839123
[186]	training's auc: 0.88769	valid_1's auc: 0.839328
[187]	training's auc: 0.887725	valid_1's auc: 0.839562
[188]	training's auc: 0.887779	valid_1's auc: 0.839474
[189]	training's auc: 0.887733	valid_1's auc: 0.839503
[190]	training's auc: 0.887685	valid_1's auc: 0.839386
[191]	training's auc: 0.887943	valid_1's auc: 0.839825
[192]	training's auc: 0.888284	valid_1's auc: 0.840146
[193]	training's auc: 0.888402	valid_1's auc: 0.840526
[194]	training's auc: 0.888358	valid_1's auc: 0.840497
[195]	training's auc: 0.888349	valid_1's auc: 0.840584
[196]	train

[425]	training's auc: 0.902019	valid_1's auc: 0.851512
[426]	training's auc: 0.902069	valid_1's auc: 0.8516
[427]	training's auc: 0.902406	valid_1's auc: 0.851395
[428]	training's auc: 0.902351	valid_1's auc: 0.851395
[429]	training's auc: 0.902487	valid_1's auc: 0.851512
[430]	training's auc: 0.902461	valid_1's auc: 0.851454
[431]	training's auc: 0.90245	valid_1's auc: 0.851454
[432]	training's auc: 0.902572	valid_1's auc: 0.851308
[433]	training's auc: 0.902555	valid_1's auc: 0.851191
[434]	training's auc: 0.902533	valid_1's auc: 0.85122
[435]	training's auc: 0.902483	valid_1's auc: 0.851161
[436]	training's auc: 0.902699	valid_1's auc: 0.851278
[437]	training's auc: 0.902715	valid_1's auc: 0.850957
[438]	training's auc: 0.902701	valid_1's auc: 0.850869
[439]	training's auc: 0.90268	valid_1's auc: 0.85084
[440]	training's auc: 0.902765	valid_1's auc: 0.850957
[441]	training's auc: 0.903023	valid_1's auc: 0.850489
[442]	training's auc: 0.903078	valid_1's auc: 0.850548
[443]	training's