# Novelty detection

The training data is not polluted by outliers and we are interested in detecting whether a new observation is an outlier. In this context an outlier is also called a novelty.

In [1]:
import numpy as np
import random as rd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt
from scipy import stats

## Loading Dataset

In [7]:
input_header = {"PROC_TRACEINFO" : "id",
                "OP070_V_1_angle_value": "angle_1",
                "OP090_SnapRingPeakForce_value" : "snap_ring_peak_force",
                "OP070_V_2_angle_value" : "angle_2",
                "OP120_Rodage_I_mesure_value" : "rodage_i",
                "OP090_SnapRingFinalStroke_value" : "snap_ring_final_stroke",
                "OP110_Vissage_M8_torque_value" : "vissage_m8_torque",
                "OP100_Capuchon_insertion_mesure" : "capuchon_insertion",
                "OP120_Rodage_U_mesure_value" : "rodage_u",
                "OP070_V_1_torque_value" : "torque_1",
                "OP090_StartLinePeakForce_value" : "start_line_peak_force",
                "OP110_Vissage_M8_angle_value" : "vissage_m8_angle",
                "OP090_SnapRingMidPointForce_val" : "snap_ring_midpoint_force",
                "OP070_V_2_torque_value" : "torque_2"}
output_header = {"PROC_TRACEINFO" : "id",
                 "Binar OP130_Resultat_Global_v" : "result"}

train_input = pd.read_csv("../data/train_inputs.csv", header=0).rename(columns=input_header)
train_input = train_input[train_input.columns[~train_input.columns.isin(["id", "capuchon_insertion"])]]
train_output = pd.read_csv("../data/train_output.csv", header=0).rename(columns=output_header)

# Select index
defect_index = train_output.index[train_output["result"] == 1].tolist()
valid_index = train_output.index[train_output["result"] == 0].tolist()

train_input_valid = train_input.iloc[valid_index,:]
train_input_defect = train_input.iloc[defect_index,:]

training_index = np.random.choice(len(valid_index)-1, size=int(0.8*len(valid_index)), replace=False) # 80% training / 20% testing
testing_index = []
for i in range(len(valid_index)):
    if i not in training_index:
        testing_index.append(i)
X_train = train_input_valid.iloc[training_index,:]
X_test = train_input_valid.iloc[testing_index,:]


Unnamed: 0,angle_1,snap_ring_peak_force,angle_2,rodage_i,snap_ring_final_stroke,vissage_m8_torque,rodage_u,torque_1,start_line_peak_force,vissage_m8_angle,snap_ring_midpoint_force,torque_2
19356,181.7,168.01,164.1,109.09,11.83,12.18,11.97,6.62,28.80,19.8,108.69,6.60
24912,147.5,152.31,153.2,109.88,12.13,12.31,11.97,6.41,18.76,13.0,84.35,6.41
11560,182.7,142.10,173.4,113.80,11.85,12.26,11.97,6.61,25.60,13.9,103.66,6.66
10830,147.0,147.45,155.6,110.33,11.75,12.29,11.97,6.64,26.38,12.5,106.35,6.63
12891,159.5,150.29,150.8,109.03,12.09,12.19,11.98,6.62,22.79,20.3,94.14,6.60
...,...,...,...,...,...,...,...,...,...,...,...,...
7949,136.0,154.67,181.1,113.27,12.13,12.15,11.97,6.41,19.36,18.6,83.03,6.41
17423,145.4,174.38,162.7,115.88,11.87,12.15,11.97,6.63,26.23,21.7,102.25,6.61
996,136.7,155.29,156.8,113.51,12.09,12.28,11.97,6.41,18.36,16.6,83.24,6.41
22614,155.1,144.71,161.3,115.03,11.75,12.25,11.97,6.61,27.54,18.3,107.26,6.62


In [9]:
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1) # nu: corresponds to the probability of finding a new, but regular, observation outside the frontier

clf.fit(X_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_defect = clf.predict(train_input_defect)

n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_defect[y_pred_defect == 1].size

In [10]:
n_error_train

11511

In [11]:
n_error_test

6710

In [12]:
n_error_outliers

10