In [84]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pandas as pd

### XGBoost with merchant information

In [85]:
# Read the pre-split train/test partitions from parquet (train first, then test).
df_train, df_test = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in ('train_df.parquet', 'test_df.parquet')
)

# Split each partition into the feature frame X and the anomaly target y (label 0 or 1).
X_train, y_train = df_train.drop(columns='is_anomalous'), df_train['is_anomalous']
X_test, y_test = df_test.drop(columns='is_anomalous'), df_test['is_anomalous']

In [86]:
# Build the model inputs for the "with merchant information" experiment.
#
# The feature frames are re-derived from the raw df_train / df_test on every run,
# so this cell is idempotent: re-running it no longer raises a KeyError on the
# already-dropped datetime columns and no longer leaves `encoders` empty.

# Drop the target and the raw datetime columns *before* encoding. Per the original
# note, only the derived invoice_age (time between invoice date and due date) is
# kept as timing information. Dropping first also avoids label-encoding date
# columns that are about to be discarded.
non_feature_cols = ['is_anomalous', 'invoice_date', 'due_date']
X_train = df_train.drop(columns=non_feature_cols)
X_test = df_test.drop(columns=non_feature_cols)

# Identify the categorical/name columns (this includes the merchant fields).
col_types = dict(X_train.dtypes)
label_cols = [col for col, dtype in col_types.items() if dtype == 'object']

# Label-encode each categorical column. Integer codes are used as-is; the original
# rationale is that tree-based models do not treat the codes as a meaningful ordering.
encoders = {}
for col in label_cols:
    train_values = X_train[col].astype(str)
    test_values = X_test[col].astype(str)
    # Fit on the union of train and test values so both frames share one mapping
    # and every value present in the test frame has a code.
    combined = pd.concat([train_values, test_values], axis=0)
    encoder = LabelEncoder().fit(combined)
    X_train[col] = encoder.transform(train_values)
    X_test[col] = encoder.transform(test_values)
    encoders[col] = encoder  # kept for inverse_transform or later reuse

In [87]:
# Train a 200-estimator XGBoost classifier on the merchant-inclusive features.
# fit() returns the estimator itself, so construction and training are chained.
model_with_merchant_info = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    n_estimators=200,
).fit(X_train, y_train)

# Predict labels for the held-out test partition.
y_pred = model_with_merchant_info.predict(X_test)

# Report overall accuracy, then the per-class precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8821852731591449

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.97      0.90      1177
           1       0.96      0.77      0.85       928

    accuracy                           0.88      2105
   macro avg       0.90      0.87      0.88      2105
weighted avg       0.89      0.88      0.88      2105



### XGBoost without merchant information

In [88]:
# Load data
df_train = pd.read_parquet('train_df.parquet', engine='pyarrow')
df_test = pd.read_parquet('test_df.parquet', engine='pyarrow')

# Separate features X and target y (anomaly label 0 or 1)
X_train = df_train.drop('is_anomalous', axis=1)
y_train = df_train['is_anomalous']
X_test = df_test.drop('is_anomalous', axis=1)
y_test = df_test['is_anomalous']

In [89]:
# Build the model inputs for the "without merchant information" experiment.
#
# The feature frames are re-derived from the raw df_train / df_test on every run,
# so this cell is idempotent: re-running it no longer raises a KeyError on
# already-dropped columns and no longer leaves `encoders` empty.

# Columns excluded from the features, listed once so train and test cannot drift:
#   - the target label
#   - merchant, merchant branch, merchant chain, and merchant address
#   - raw datetime columns (per the original note, only the derived invoice_age,
#     the time between invoice date and due date, is kept as timing information)
non_feature_cols = [
    'is_anomalous',
    'merchant', 'merchant_branch', 'merchant_chain', 'merchant_address',
    'invoice_date', 'due_date',
]
X_train = df_train.drop(columns=non_feature_cols)
X_test = df_test.drop(columns=non_feature_cols)

# Identify the remaining categorical/name columns (per the original note these are
# expected to be po_number, payment_method, country, state, and currency).
col_types = dict(X_train.dtypes)
label_cols = [col for col, dtype in col_types.items() if dtype == 'object']

# Label-encode each categorical column. Integer codes are used as-is; the original
# rationale is that tree-based models do not treat the codes as a meaningful ordering.
encoders = {}
for col in label_cols:
    train_values = X_train[col].astype(str)
    test_values = X_test[col].astype(str)
    # Fit on the union of train and test values so both frames share one mapping
    # and every value present in the test frame has a code.
    combined = pd.concat([train_values, test_values], axis=0)
    encoder = LabelEncoder().fit(combined)
    X_train[col] = encoder.transform(train_values)
    X_test[col] = encoder.transform(test_values)
    encoders[col] = encoder  # kept for inverse_transform or later reuse

In [90]:
# Same hyper-parameters as the merchant-inclusive model, so the two runs differ
# only in their feature set.
model = XGBClassifier(n_estimators=200, eval_metric='logloss', random_state=42)

# Train, then predict on the test partition (fit() returns the fitted estimator).
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report overall accuracy, then the per-class precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8859857482185273

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.97      0.90      1177
           1       0.95      0.78      0.86       928

    accuracy                           0.89      2105
   macro avg       0.90      0.87      0.88      2105
weighted avg       0.89      0.89      0.88      2105

