# Novelty detection

The training data is not polluted by outliers, and we are interested in detecting whether a new observation is an outlier. In this context, an outlier is also called a novelty.

[2.7. Novelty and Outlier Detection](https://scikit-learn.org/stable/modules/outlier_detection.html)  
[One-class SVM with non-linear kernel (RBF)](https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html#sphx-glr-auto-examples-svm-plot-oneclass-py)

In [1]:
import numpy as np
import random as rd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt
from scipy import stats

## Loading Dataset

In [4]:
# Map the raw CSV column names to short snake_case names used in this notebook.
input_header = {"PROC_TRACEINFO" : "id",
                "OP070_V_1_angle_value": "angle_1",
                "OP090_SnapRingPeakForce_value" : "snap_ring_peak_force",
                "OP070_V_2_angle_value" : "angle_2",
                "OP120_Rodage_I_mesure_value" : "rodage_i",
                "OP090_SnapRingFinalStroke_value" : "snap_ring_final_stroke",
                "OP110_Vissage_M8_torque_value" : "vissage_m8_torque",
                "OP100_Capuchon_insertion_mesure" : "capuchon_insertion",
                "OP120_Rodage_U_mesure_value" : "rodage_u",
                "OP070_V_1_torque_value" : "torque_1",
                "OP090_StartLinePeakForce_value" : "start_line_peak_force",
                "OP110_Vissage_M8_angle_value" : "vissage_m8_angle",
                "OP090_SnapRingMidPointForce_val" : "snap_ring_midpoint_force",
                "OP070_V_2_torque_value" : "torque_2"}
output_header = {"PROC_TRACEINFO" : "id",
                 "Binar OP130_Resultat_Global_v" : "result"}

# Load features and labels; "id" and "capuchon_insertion" are excluded from
# the feature matrix.
train_input = pd.read_csv("data/train_inputs.csv", header=0).rename(columns=input_header)
train_input = train_input.loc[:, ~train_input.columns.isin(["id", "capuchon_insertion"])]
train_output = pd.read_csv("data/train_output.csv", header=0).rename(columns=output_header)

# Row positions of defective (result == 1) and valid (result == 0) samples.
# Positions rather than index labels are computed because they are consumed
# by the positional indexer .iloc below.
# NOTE(review): assumes train_input and train_output list the same samples in
# the same row order -- confirm against the data files.
result_values = train_output["result"].to_numpy()
defect_index = np.flatnonzero(result_values == 1).tolist()
valid_index = np.flatnonzero(result_values == 0).tolist()

train_input_valid = train_input.iloc[valid_index, :]
train_input_defect = train_input.iloc[defect_index, :]

# Random 80% / 20% split of the valid samples (positions within
# train_input_valid). Sampling is drawn from all n_valid positions so every
# row, including the last one, can land in the training set.
# NOTE: the split is unseeded; call np.random.seed(...) beforehand if a
# reproducible split is needed.
n_valid = len(valid_index)
n_train = int(0.8 * n_valid)
training_index = np.random.choice(n_valid, size=n_train, replace=False)

# The test set is the complement of the training positions, built with a
# boolean mask instead of a per-row membership scan of the training array.
train_mask = np.zeros(n_valid, dtype=bool)
train_mask[training_index] = True
testing_index = np.flatnonzero(~train_mask).tolist()

X_train = train_input_valid.iloc[training_index, :]
X_test = train_input_valid.iloc[testing_index, :]


In [5]:
# One-class SVM with an RBF kernel, fitted on the valid training samples only.
# nu corresponds to the probability of finding a new, but regular, observation
# outside the frontier.
# NOTE(review): the features are passed in unscaled; RBF kernels are sensitive
# to feature scale -- consider whether standardisation is wanted here.
clf = svm.OneClassSVM(kernel="rbf", gamma=0.1, nu=0.1).fit(X_train)

# predict() labels each sample +1 (inlier) or -1 (outlier).
y_pred_train, y_pred_test, y_pred_defect = (
    clf.predict(samples) for samples in (X_train, X_test, train_input_defect)
)

# Valid samples flagged as outliers are errors; so are defective samples
# accepted as inliers.
n_error_train = np.count_nonzero(y_pred_train == -1)
n_error_test = np.count_nonzero(y_pred_test == -1)
n_error_outliers = np.count_nonzero(y_pred_defect == 1)

In [6]:
n_error_train

11634

In [7]:
n_error_test

6717

In [8]:
n_error_outliers

10