### Loading the dataset

In [15]:
import pandas as pd

# Read the prepared orders table; it includes the `anomaly` label column that
# later cells split off from the features.
olist_orders_dataset_df = pd.read_csv(
    filepath_or_buffer='temp/olist_orders_dataset_df.csv',
)


In [16]:
# Feature matrix: every column except the `anomaly` label.
X = olist_orders_dataset_df.drop(columns=['anomaly'])

### Normalizing

In [17]:
from sklearn.preprocessing import QuantileTransformer

# Map each feature onto a normal-shaped distribution through its empirical quantiles.
# QuantileTransformer estimates those quantiles from a random subsample when the
# input has more rows than its `subsample` limit, so without a seed the transformed
# values (and everything computed from them) can differ between runs. Fixing
# random_state makes the cell reproducible; 42 matches the seed used for the
# train/test split.
# NOTE(review): the transformer is fitted on all rows, before the train/test split
# further down, so held-out rows influence the fitted quantiles — confirm that this
# is acceptable for the evaluation.
qt = QuantileTransformer(output_distribution='normal', random_state=42)
X = qt.fit_transform(X)


### Feature Scaling

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's observed minimum/maximum, then rescale with the scaler's
# default settings. `scaler` stays available as the fitted estimator.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)


In [19]:
# Wrap the scaled array in a frame (columns become positional integers) and put
# the original `anomaly` label back beside it; rows are matched on the index.
df = pd.concat(
    objs=[pd.DataFrame(data=X), olist_orders_dataset_df['anomaly']],
    axis='columns',
)


### Splitting the dataset

In [20]:
from sklearn.model_selection import train_test_split


In [21]:
# Features are everything but the label; the label is re-encoded so that
# anomalies are -1 and regular rows are +1.
X = df.drop(columns=['anomaly'])
y = df['anomaly'].map({True: -1, False: 1})

# Hold out 20% of the rows (seeded) and convert each of the four pieces to a
# plain NumPy array in one pass.
X_train, X_test, y_train, y_test = (
    piece.to_numpy()
    for piece in train_test_split(X, y, test_size=0.2, random_state=42)
)


### Import Necessary Libraries

In [22]:
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix


### Create and Train One-Class SVM Model

In [23]:
# Configure the One-Class SVM. The RBF kernel is the estimator's default and is
# spelled out here only for readability; nu approximates the expected outlier
# fraction.
clf = OneClassSVM(
    kernel='rbf',
    gamma='auto',
    nu=0.05,
)

# Train on the training features only (labels are not used).
clf.fit(X_train)


### Predict Anomalies

In [24]:
# Score both splits with the fitted model (the test prediction only applies
# when held-out data exists). A prediction of -1 marks a row as an anomaly.
y_pred_train, y_pred_test = (
    clf.predict(features) for features in (X_train, X_test)
)


### Evaluate the Model

In [25]:
# Rows are the true labels, columns the predicted ones, both in sorted label
# order (-1 first, then 1).
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[   63,  1036],
       [  953, 18404]])

In [26]:
# Per-class precision / recall / F1 on the held-out split.
test_report = classification_report(y_true=y_test, y_pred=y_pred_test)
print(test_report)

              precision    recall  f1-score   support

          -1       0.06      0.06      0.06      1099
           1       0.95      0.95      0.95     19357

    accuracy                           0.90     20456
   macro avg       0.50      0.50      0.50     20456
weighted avg       0.90      0.90      0.90     20456
