<a href="https://colab.research.google.com/github/AmanPriyanshu/Discussing_Learning/blob/master/py_SOS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ACKNOWLEDGEMENT:

All references and the understanding of the code have been taken from: https://pure.uvt.nl/ws/portalfiles/portal/1517370/Janssens_outlier_11-06-2013.pdf

## UPLOADING DATASET:

In [1]:
# Colab-specific import: this cell only runs inside a Google Colab runtime.
from google.colab import files
# NOTE(review): presumably this is where 'winequalityred.csv' (read by the
# loading cell further down) gets uploaded into the working directory — confirm.
uploaded = files.upload()

## IMPORTING:

In [2]:
import numpy as np
import pandas as pd

## LOADING DATASET:

In [3]:
# Read the CSV, remember its column names, then continue with the raw array.
data = pd.read_csv('winequalityred.csv')
features = data.columns
data = data.values

# Every column except the last goes into x; the last column becomes y.
x = data[:, :-1]
y = data[:, -1]

print('x', x.shape)
print('y', y.shape)

x (1599, 11)
y (1599,)


# PY-STOCHASTIC-OUTLIER-SELECTION

In [4]:
class Py_SOS:
  """Stochastic Outlier Selection (SOS) on a NumPy array of samples.

  `sos` chains four stages: pairwise dissimilarities -> affinities tuned per
  sample to a target perplexity -> row-normalised binding probabilities ->
  per-sample outlier probability.
  """

  def __init__(self, eps=1e-5, max_iter=1e3):
    """Store the settings of the per-sample search in `affinity_matrix`.

    eps: the search stops once |entropy - log(perplexity)| <= eps.
    max_iter: upper bound on the number of search steps per sample.
    """
    self.eps = eps
    self.max_iter = max_iter

  @staticmethod
  def _check_perplexity(perplexity):
    """Raise ValueError unless `perplexity` is a finite, positive number."""
    # log(perplexity) is the search target; for non-positive or NaN values it is
    # -inf/NaN, and the search would just burn max_iter steps and return noise.
    if not (np.isfinite(perplexity) and perplexity > 0):
      raise ValueError('perplexity must be a finite positive number, got %r' % (perplexity,))

  def dissimilarity_matrix(self, data):
    """Return the (n, n) matrix of squared Euclidean distances between samples.

    data: array whose first axis indexes the n samples; any remaining axes are
      treated as that sample's features.
    """
    points = np.asarray(data, dtype=float)
    n = points.shape[0]
    result = np.empty((n, n))
    if n == 0:
      return result
    if points.ndim != 2:
      # Flatten per-sample features so 1-D (and N-D) inputs keep working.
      points = points.reshape(n, -1)
    # One vectorised pass per row instead of n*n Python-level distance calls.
    for i in range(n):
      result[i] = np.sum(np.square(points - points[i]), axis=1)
    return result

  def get_perplexity(self, D_row, variance):
    """Return (entropy, affinities) for one sample's distances.

    The names are kept for backward compatibility but are loose:
    `variance` multiplies the distances inside exp(-D_row * variance), so a
    larger value gives a narrower kernel; and the first return value is the
    natural-log Shannon entropy of the normalised affinities, which the caller
    compares against log(perplexity).

    When every affinity underflows to 0 the entropy is NaN; `affinity_matrix`
    relies on that NaN as its "kernel too narrow" signal.
    """
    A_row = np.exp(-D_row * variance)
    sumA = np.sum(A_row)
    # The 0/0 and log(0) cases are expected and handled by the caller, so do
    # not flood the output with RuntimeWarnings for them.
    with np.errstate(divide='ignore', invalid='ignore'):
      perplexity = np.log(sumA) + variance * np.sum(D_row * A_row) / sumA
    return perplexity, A_row

  def affinity_matrix(self, dMatrix, perplexity):
    """Tune one kernel multiplier per sample and return the affinities.

    For each row i the multiplier (see `get_perplexity`) is searched by
    doubling/halving and then bisection until the entropy of row i's affinities
    is within `self.eps` of log(perplexity), or `self.max_iter` steps were taken.
    Note the entropy over n - 1 neighbours cannot exceed log(n - 1), so a
    perplexity above n - 1 can never be matched and the search runs to the cap.

    dMatrix: square (n, n) dissimilarity matrix.
    perplexity: finite positive target.
    Returns (multipliers, affinities): shape (n,) and shape (n, n); the diagonal
      of the affinities is left at 0.
    Raises ValueError if `perplexity` is not a finite positive number.
    """
    self._check_perplexity(perplexity)
    eps = self.eps
    n = dMatrix.shape[0]
    variance_matrix = np.ones(n)
    affinity_matrix = np.zeros(dMatrix.shape)
    logU = np.log(perplexity)
    indices = np.arange(n)
    for i in range(n):
      others = indices != i  # every column except the sample itself
      d_i = dMatrix[i, others]
      beta = 1.0
      lower, upper = -np.inf, np.inf
      (entropy, thisA) = self.get_perplexity(d_i, beta)
      diff = entropy - logU
      tries = 0
      while (np.isnan(diff) or np.abs(diff) > eps) and tries < self.max_iter:
        if np.isnan(diff):
          # All affinities underflowed: widen the kernel aggressively.
          beta = beta / 10.0
        elif diff > 0:
          # Entropy too high -> kernel too wide -> raise the multiplier.
          lower = beta
          beta = beta * 2.0 if np.isinf(upper) else (beta + upper) / 2.0
        else:
          # Entropy too low -> kernel too narrow -> lower the multiplier.
          upper = beta
          beta = beta / 2.0 if np.isinf(lower) else (beta + lower) / 2.0
        (entropy, thisA) = self.get_perplexity(d_i, beta)
        diff = entropy - logU
        tries += 1
      variance_matrix[i] = beta
      affinity_matrix[i, others] = thisA
    return variance_matrix, affinity_matrix

  def binding_matrix(self, aMatrix):
    """Normalise each row of `aMatrix` to sum to 1.

    A row that sums to 0 yields NaN entries (0/0), as no normalisation exists.
    """
    binding_matrix = aMatrix / aMatrix.sum(axis=1)[:, np.newaxis]
    return binding_matrix

  def outlier_probability(self, bMatrix):
    """Return, per column j, the product over rows i of (1 - bMatrix[i, j])."""
    outlier_matrix = np.prod(1 - bMatrix, axis=0)
    return outlier_matrix

  def sos(self, cleaned_data, perplexity):
    """Run the full pipeline and return one outlier probability per sample.

    cleaned_data: numeric array whose first axis indexes the samples.
    perplexity: finite positive target passed to `affinity_matrix`.
    Raises ValueError if `perplexity` is not a finite positive number.
    """
    # Fail before the (comparatively expensive) distance computation.
    self._check_perplexity(perplexity)
    dMatrix = self.dissimilarity_matrix(cleaned_data)
    _, aff_matrix = self.affinity_matrix(dMatrix, perplexity)
    bin_matrix = self.binding_matrix(aff_matrix)
    outlier_matrix = self.outlier_probability(bin_matrix)
    return outlier_matrix

## IMPLEMENTING SOS ON THE DATASET:

In [5]:
# Score every sample with default search settings; the perplexity is set to
# half the number of samples.
sos = Py_SOS()
outlier_score = sos.sos(cleaned_data=x, perplexity=x.shape[0] / 2)

### Most quality scores in the wine dataset are 5, 6, or 7. We label samples with those scores as 0 (inliers) and all other samples as 1 (outliers).

In [6]:
# 0 where the value in y lies in [5, 7], 1 everywhere else.
labels_true = np.where((y >= 5) & (y <= 7), 0, 1)

#### We will be using 0.6 as the threshold for outlier selection

In [7]:
# 1 where the outlier score is strictly above the 0.6 threshold, 0 otherwise.
labels_pred = np.where(outlier_score > 0.6, 1, 0)

Py-SOS was able to detect 4% anomalies; however, trying different parameter values (for example, the perplexity and the threshold) may improve the result.

In [8]:
# Accuracy = 1 - mean absolute difference between predicted and true 0/1 labels.
print('Accuracy', 1 - np.abs(labels_pred - labels_true).mean())

Accuracy 0.9024390243902439


## IMPLEMENTATION:

Copy and paste the above class into your own code (or save it in a module and import it). It is easy to implement and is a really useful one-class classification algorithm.