In [3]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score
import matplotlib.pyplot as plt

In [4]:
X_train = pd.read_csv('../output/split-dataset/X_train.csv')
y_train = pd.read_csv('../output/split-dataset/y_train.csv')
X_validation = pd.read_csv('../output/split-dataset/X_validation.csv')
y_validation = pd.read_csv('../output/split-dataset/y_validation.csv')
X_test = pd.read_csv('../output/split-dataset/X_test.csv')
y_test = pd.read_csv('../output/split-dataset/y_test.csv')

In [None]:
categorical_features = [
    'transaction_type', 'merchant_category', 'location', 'device_used',
    'payment_channel', 'time_of_day', 'type_device_interaction',
    'channel_merchant_interaction'
]

for col in categorical_features:
    X_train[col] = X_train[col].astype('category')
    X_validation[col] = X_validation[col].astype('category')
    X_test[col] = X_test[col].astype('category')
    
y_train['is_fraud'] = y_train['is_fraud'].astype(int)
y_validation['is_fraud'] = y_validation['is_fraud'].astype(int)
y_test['is_fraud'] = y_test['is_fraud'].astype(int)

y_train = y_train['is_fraud']
y_validation = y_validation['is_fraud']
y_test = y_test['is_fraud']

In [None]:
# Create LightGBM Dataset objects
train_data = lgb.Dataset(
    X_train, 
    label=y_train, 
    categorical_feature=categorical_features, 
    free_raw_data=False
)
validation_data = lgb.Dataset(
    X_validation, 
    label=y_validation, 
    categorical_feature=categorical_features,
    free_raw_data=False
)

# Define all parameters in a single dictionary
scale_pos_weight = y_train.value_counts().iloc[0] / y_train.value_counts().iloc[1]
params = {
    'objective': 'binary',
    'eval_metric': 'aucpr',
    'scale_pos_weight': scale_pos_weight,
    'random_state': 42,
    'n_jobs': -1,
    'boosting_type': 'gbdt',
    'verbose': -1
}

# Train the model
print("Training LightGBM model with native API...")
model = lgb.train(
    params,
    train_set=train_data,
    num_boost_round=10000,
    valid_sets=[validation_data],
    valid_names=['validation'],
    callbacks=[lgb.early_stopping(100, verbose=True)]
)

print("\nModel training successful!")

Training LightGBM model with native API...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	validation's binary_logloss: 0.19608

Model training successful!


In [35]:
# Predict probabilities on the test set
y_pred_proba = model.predict(X_test)

# Calculate metric scores
auc_roc = roc_auc_score(y_test, y_pred_proba)
auc_pr = average_precision_score(y_test, y_pred_proba)

print(f"Test Set AUC-ROC: {auc_roc:.4f}")
print(f"Test Set AUCPR (Average Precision): {auc_pr:.4f}")

Test Set AUC-ROC: 0.6095
Test Set AUCPR (Average Precision): 0.0458
