In [4]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

# Tunable values used further down in this cell.
SEED = 42
N_PCA_COMPONENTS = 10


def _add_time_until_due(frame):
    """Return a copy of `frame` with a `time_until_due` column appended and
    the raw `invoice_date` / `due_date` columns removed.

    `time_until_due` is the `.dt.days` component of `due_date - invoice_date`.
    """
    days = (frame['due_date'] - frame['invoice_date']).dt.days
    return frame.assign(time_until_due=days).drop(columns=['invoice_date', 'due_date'])


# ---- Load the train / test splits and separate the target column ----------
df_train = pd.read_parquet('train_df.parquet', engine='pyarrow')
df_test = pd.read_parquet('test_df.parquet', engine='pyarrow')

y_train = df_train['is_anomalous']
y_test = df_test['is_anomalous']
X_train = df_train.drop('is_anomalous', axis=1)
X_test = df_test.drop('is_anomalous', axis=1)

# ---- Label-encode every object-dtype feature column ------------------------
col_types = dict(X_train.dtypes)
label_cols = [name for name in col_types if col_types[name] == 'object']

encoders = {}
for col in label_cols:
    train_as_str = X_train[col].astype(str)
    test_as_str = X_test[col].astype(str)
    # The encoder is fitted on train and test values together so that both
    # splits share one value -> integer mapping.
    combined = pd.concat([train_as_str, test_as_str], axis=0)
    encoder = LabelEncoder().fit(combined)
    X_train[col] = encoder.transform(train_as_str)
    X_test[col] = encoder.transform(test_as_str)
    encoders[col] = encoder  # kept per column for later inspection / reuse

# ---- Replace the two date columns with a single day-count feature ----------
X_train = _add_time_until_due(X_train)
X_test = _add_time_until_due(X_test)

# ---- Standardize: statistics come from the training split only -------------
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---- Dimensionality reduction: PCA fitted on the scaled training split -----
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=SEED)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# ---- RBF-kernel SVM trained on the PCA features ----------------------------
svm_params = {
    'kernel': 'rbf',
    'C': 1.0,
    'gamma': 'scale',
    'class_weight': 'balanced',  # to help with class imbalance
    'random_state': SEED,
}
model = SVC(**svm_params)
model.fit(X_train_pca, y_train)

# ---- Predict on the test split and report metrics --------------------------
y_pred = model.predict(X_test_pca)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy: 0.8256532066508313

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.98      0.86      1177
           1       0.97      0.62      0.76       928

    accuracy                           0.83      2105
   macro avg       0.87      0.80      0.81      2105
weighted avg       0.86      0.83      0.82      2105



In [None]:
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score, classification_report

# df_train = pd.read_parquet('train_df.parquet', engine='pyarrow')
# df_test = pd.read_parquet('test_df.parquet', engine='pyarrow')

# X_train = df_train.drop('is_anomalous', axis=1)
# y_train = df_train['is_anomalous']
# X_test = df_test.drop('is_anomalous', axis=1)
# y_test = df_test['is_anomalous']

# # Label Encode Categorical Columns
# col_types = dict(X_train.dtypes)
# label_cols = [col for col, dtype in col_types.items() if dtype == 'object']

# encoders = {}
# for col in label_cols:
#     encoder = LabelEncoder()
#     # Combine train & test for consistent mapping
#     combined = pd.concat([X_train[col], X_test[col]], axis=0).astype(str)
#     encoder.fit(combined)
#     X_train[col] = encoder.transform(X_train[col].astype(str))
#     X_test[col] = encoder.transform(X_test[col].astype(str))
#     encoders[col] = encoder

# X_train['time_until_due'] = (X_train['due_date'] - X_train['invoice_date']).dt.days
# X_test['time_until_due'] = (X_test['due_date'] - X_test['invoice_date']).dt.days
# # Drop the raw datetime columns (no longer needed)
# X_train.drop(columns=['invoice_date', 'due_date'], inplace=True)
# X_test.drop(columns=['invoice_date', 'due_date'], inplace=True)

# # Grid-search the SVM kernel, C and gamma (3-fold CV, scored on recall)
# param_grid = {
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'C': [0.1, 1, 10, 100, 1000],
#     'gamma': ['scale', 'auto'],
#     'class_weight': ['balanced']  
# }

# svm = SVC(random_state=42)
# grid_search = GridSearchCV(
#     estimator=svm,
#     param_grid=param_grid,
#     scoring='recall',    
#     cv=3,                
#     verbose=1,
#     n_jobs=-1    
# )

# print("Searching for the best SVM configuration...")
# grid_search.fit(X_train, y_train)

# # Retrieve the best model
# best_svm = grid_search.best_estimator_
# print(f"Best Parameters: {grid_search.best_params_}")

# # Final Model Evaluation
# y_pred = best_svm.predict(X_test)

# # Accuracy and per-class classification report on the test set
# print("\nAccuracy on Test Set:", accuracy_score(y_test, y_pred))
# print("\nClassification Report on Test Set:\n", classification_report(y_test, y_pred))