### Loading the dataset

In [2]:
import pandas as pd

# Relative path to the prepared orders table; later cells expect it to carry an 'anomaly' label column.
orders_csv_path = 'temp/olist_orders_dataset_df.csv'
olist_orders_dataset_df = pd.read_csv(orders_csv_path)


In [3]:
# Feature matrix: every column of the loaded table except the 'anomaly' label.
X = olist_orders_dataset_df.drop(columns=['anomaly'])

### Normalizing

In [4]:
from sklearn.preprocessing import QuantileTransformer

# Reshape every feature towards a normal distribution using its empirical quantiles.
# random_state pins any row subsampling the transformer performs while estimating
# those quantiles, so the transformed features (and every metric computed from them)
# are identical on each Restart & Run All.
# NOTE(review): the transformer is fitted on the full table, including rows that are
# later placed in the test set — consider fitting on the training rows only.
qt = QuantileTransformer(output_distribution='normal', random_state=42)
X = qt.fit_transform(X)


### Feature Scaling

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale each feature column; the fitted scaler is kept in `scaler`.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)


In [6]:
# Wrap the scaled array in a DataFrame (columns become positional integers) and
# re-attach the label column from the loaded table, aligned on the row index.
features_df = pd.DataFrame(X)
anomaly_labels = olist_orders_dataset_df['anomaly']
df = pd.concat([features_df, anomaly_labels], axis=1)


### Splitting the dataset

In [7]:
# Fixed seed: without it every run draws a different test set, so the training
# data and all reported metrics change between executions of the notebook.
SPLIT_SEED = 42

# Separate the records based on the label
normal_records = df[df['anomaly'] == False]
anomaly_records = df[df['anomaly'] == True]

# Draw as many normal records as there are anomalies so the test set is balanced
normal_records_sampled = normal_records.sample(n=len(anomaly_records), random_state=SPLIT_SEED)

# Test set: all anomalies plus the sampled normals, shuffled, with a fresh 0..n-1 index
df_test = (
    pd.concat([anomaly_records, normal_records_sampled])
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# Training pool: the normal records that were NOT drawn into the test set
df_remaining_normal = normal_records.drop(normal_records_sampled.index).reset_index(drop=True)

# Sanity checks on the split: balanced test set, and no sampled row left in the training pool
assert len(df_test) == 2 * len(anomaly_records)
assert len(df_remaining_normal) == len(normal_records) - len(normal_records_sampled)


In [8]:
# Label recoding used below: -1 marks an anomaly, +1 a normal record.
svm_label_of = {True: -1, False: 1}


def _feature_matrix(frame):
    """Return every column of `frame` except 'anomaly' as a NumPy array."""
    return frame.drop(columns=['anomaly']).to_numpy()


def _svm_labels(frame):
    """Return the 'anomaly' column of `frame`, recoded to -1 / +1, as a NumPy array."""
    return frame['anomaly'].map(svm_label_of).to_numpy()


X_anomaly = _feature_matrix(anomaly_records)
X_test, y_test = _feature_matrix(df_test), _svm_labels(df_test)
X_normal, y_normal = _feature_matrix(df_remaining_normal), _svm_labels(df_remaining_normal)


### Import Necessary Libraries:

In [9]:
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix


### Create and Train One-Class SVM Model

In [10]:
# Hyperparameters of the One-Class SVM; nu approximates the fraction of
# training rows the model is allowed to treat as outliers.
svm_settings = {'gamma': 'auto', 'nu': 0.05}
clf = OneClassSVM(**svm_settings)

# Train on the normal-only records that were held out of the test set
clf.fit(X_normal)


### Predict Anomalies

In [11]:
# Score the training rows and the balanced test set with the fitted model.
# A prediction of -1 flags the row as an anomaly; +1 means it looks normal.
y_pred_train, y_pred_test = (clf.predict(rows) for rows in (X_normal, X_test))


### Evaluate the Model

In [12]:
# Rows are the true labels and columns the predicted labels, ordered -1 (anomaly) then 1 (normal).
cm_test = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
cm_test

array([[ 271, 5233],
       [ 272, 5232]])

In [13]:
# Per-class precision / recall / F1 on the balanced test set (-1 = anomaly, 1 = normal).
report_text = classification_report(y_test, y_pred_test)
print(report_text)

              precision    recall  f1-score   support

          -1       0.50      0.05      0.09      5504
           1       0.50      0.95      0.66      5504

    accuracy                           0.50     11008
   macro avg       0.50      0.50      0.37     11008
weighted avg       0.50      0.50      0.37     11008


In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Running scoreboard: one row per evaluated algorithm.
performance_columns = ['algorithm', 'confusion matrix', 'accuracy', 'precision', 'recall']
table_of_performance = pd.DataFrame(columns=performance_columns)

def add_performance(algorithm, y_test, y_predict):
    """Append one evaluation row to the module-level `table_of_performance`.

    Parameters
    ----------
    algorithm : value stored in the 'algorithm' column (a display name).
    y_test : ground-truth labels.
    y_predict : predicted labels for the same rows as `y_test`.

    The confusion matrix, accuracy, precision and recall are computed with the
    scikit-learn metric functions using their default arguments, so precision
    and recall refer to scikit-learn's default positive label.

    Returns
    -------
    None. The table is extended in place with a new row at index len(table).
    """
    global table_of_performance

    next_index = len(table_of_performance)
    table_of_performance.loc[next_index] = {
        'algorithm': algorithm,
        'confusion matrix': confusion_matrix(y_test, y_predict),
        'accuracy': accuracy_score(y_test, y_predict),
        'precision': precision_score(y_test, y_predict),
        'recall': recall_score(y_test, y_predict),
    }


# Recode the -1/+1 labels to booleans (True = anomaly) before scoring,
# so the precision and recall in the table describe the anomaly class.
is_anomaly_true = (y_test != 1)
is_anomaly_pred = (y_pred_test != 1)
add_performance('One-class-SVM', is_anomaly_true, is_anomaly_pred)
table_of_performance

Unnamed: 0,algorithm,confusion matrix,accuracy,precision,recall
0,One-class-SVM,"[[5232, 272], [5233, 271]]",0.499909,0.499079,0.049237
