In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score, precision_score

In [2]:
# Load the dataset
file_path = './_labeled_features/features_15S.csv.gz'
data = pd.read_csv(file_path, compression='gzip')


# Select the specified features and the target variable
features = ['std_rush_order', 'avg_rush_order', 'std_trades', 'std_volume', 
            'avg_volume', 'std_price', 'avg_price', 'avg_price_max']
X = data[features]
y = data['gt']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
value_counts = y_train.value_counts()
print(value_counts)
print(
    f"Fraction of Anomalies: {value_counts[1] / (value_counts.iloc[1]+value_counts.iloc[0])}"
)

gt
0    467032
1       251
Name: count, dtype: int64
Fraction of Anomalies: 0.0005371477241842737


# Gaussian Naive Bayes

In [4]:

# Initialize the Gaussian Naive Bayes classifier and train the model on the training data
model = GaussianNB()
model.fit(X_train, y_train)

# Predict the labels for the testing set
y_pred = model.predict(X_test)


# Print the results
print("Classification Report:\n",  classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy: ",  accuracy_score(y_test, y_pred))
print("Precision: ",  precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1_Macro: " , f1_score(y_test, y_pred, average='macro'))
print("F1_Micro: " , f1_score(y_test, y_pred, average='micro'))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    116755
           1       0.06      0.95      0.11        66

    accuracy                           0.99    116821
   macro avg       0.53      0.97      0.55    116821
weighted avg       1.00      0.99      0.99    116821

Confusion Matrix:
 [[115700   1055]
 [     3     63]]
Accuracy:  0.9909434091473279
Precision:  0.05635062611806798
Recall:  0.9545454545454546
F1_Macro:  0.5509337795516912
F1_Micro:  0.990943409147328


# NB with NSD Data

In [7]:
# Load the dataset
file_path = './_labeled_features/features_15S.csv.gz'
lm_data = pd.read_csv(file_path, compression='gzip')
nsd_data =  pd.read_csv('./_labeled_features/natural_sd/features_15S.csv')

data = pd.concat([lm_data, nsd_data])

# Select the specified features and the target variable
features = ['std_rush_order', 'avg_rush_order', 'std_trades', 'std_volume', 
            'avg_volume', 'std_price', 'avg_price', 'avg_price_max']
X = data[features]
y = data['gt']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gaussian Naive Bayes classifier and train the model on the training data
model = GaussianNB()
model.fit(X_train, y_train)

# Predict the labels for the testing set
y_pred = model.predict(X_test)


# Print the results
print("Classification Report:\n",  classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy: ",  accuracy_score(y_test, y_pred))
print("Precision: ",  precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1_Macro: " , f1_score(y_test, y_pred, average='macro'))
print("F1_Micro: " , f1_score(y_test, y_pred, average='micro'))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    117354
           1       0.09      0.84      0.16        75

    accuracy                           0.99    117429
   macro avg       0.54      0.92      0.58    117429
weighted avg       1.00      0.99      1.00    117429

Confusion Matrix:
 [[116710    644]
 [    12     63]]
Accuracy:  0.9944136456922906
Precision:  0.0891089108910891
Recall:  0.84
F1_Macro:  0.57916140555307
F1_Micro:  0.9944136456922906


# Try Improving Imbalance with SMOTE

In [5]:
# Initialize SMOTE and apply it to generate synthetic samples in the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train the Gaussian Naive Bayes model on the SMOTE-applied training set
model_smote = GaussianNB()
model_smote.fit(X_train_smote, y_train_smote)

# Predict the labels for the testing set
y_pred_smote = model_smote.predict(X_test)


# Print the results
print("Classification Report:\n",  classification_report(y_test, y_pred_smote))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))
print("Accuracy: ",  accuracy_score(y_test, y_pred_smote))
print("Precision: ",  precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred_smote))
print("F1_Macro: " , f1_score(y_test, y_pred_smote, average='macro'))
print("F1_Micro: " , f1_score(y_test, y_pred_smote, average='micro'))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    116755
           1       0.05      0.95      0.09        66

    accuracy                           0.99    116821
   macro avg       0.52      0.97      0.54    116821
weighted avg       1.00      0.99      0.99    116821

Confusion Matrix:
 [[115493   1262]
 [     3     63]]
Accuracy:  0.9891714674587617
Precision:  0.05635062611806798
Recall:  0.9545454545454546
F1_Macro:  0.5425678107166717
F1_Micro:  0.9891714674587617


# Try computing sample weights

In [6]:
from sklearn.utils.class_weight import compute_sample_weight


# Compute sample weights based on class distribution
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Initialize the Gaussian Naive Bayes classifier
model_weighted = GaussianNB()

# Train the model on the training data with sample weights
model_weighted.fit(X_train, y_train, sample_weight=sample_weights)

# Predict the labels for the testing set
y_pred_weighted = model_weighted.predict(X_test)


# Print the results
print("Classification Report:\n",  classification_report(y_test, y_pred_weighted))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_weighted))
print("Accuracy: ",  accuracy_score(y_test, y_pred_weighted))
print("Precision: ",  precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred_weighted))
print("F1_Macro: " , f1_score(y_test, y_pred_weighted, average='macro'))
print("F1_Micro: " , f1_score(y_test, y_pred_weighted, average='micro'))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    116755
           1       0.05      0.95      0.09        66

    accuracy                           0.99    116821
   macro avg       0.52      0.97      0.54    116821
weighted avg       1.00      0.99      0.99    116821

Confusion Matrix:
 [[115517   1238]
 [     3     63]]
Accuracy:  0.9893769099733781
Precision:  0.05635062611806798
Recall:  0.9545454545454546
F1_Macro:  0.5434149179771947
F1_Micro:  0.9893769099733781
