In [None]:
# default_exp stats

# stats

> Módulo que contém os algoritmos para detecção de anomalias em dados univariados.

* Robust ZScore
* Tukey

In [None]:
#hide
from nbdev.showdoc import *

## Imports

In [None]:
#export
import numpy as np

## MAD - Robust Zscore

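For anomaly detection, the robust z-score is more suitable than the standard z-score.

Because the mean and the standard deviation can be strongly influenced by outliers, the robust z-score replaces the mean with the median and the standard deviation with the scaled median absolute deviation (MAD): $z = \dfrac{x - \mathrm{median}(x)}{1.4826 \cdot \mathrm{MAD}}$.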

In [None]:
#export
class MAD():
    '''
    Robust z-score outlier detector based on the Median Absolute Deviation.

    The score of a value is ``(x - median) / (1.4826 * MAD)`` where
    ``MAD = median(|x - median|)``. Both ``median`` and the scaled MAD are
    estimated by `fit` and reused by `predict` / `decision_function`.

    Parameters
    ----------
    only_low_values : bool, default False
        When True, `predict` reports only values whose score is below ``-m``;
        otherwise values on both tails (``|score| > m``) are reported.

    Attributes
    ----------
    median : float or None
        NaN-ignoring median of the fitted data (None before `fit`).
    mad : float or None
        Scaled MAD, ``1.4826 * nanmedian(|x - median|)`` (None before `fit`).
    '''
    def __init__(self, only_low_values=False):
        self.only_low_values = only_low_values
        self.median = None
        self.mad = None

    @staticmethod
    def _as_array(x):
        ''' Convert plain lists/tuples to an ndarray; return any other input unchanged. '''
        # Lists and tuples cannot be indexed with a boolean mask, which `predict` relies on.
        if isinstance(x, (list, tuple)):
            return np.asarray(x)
        return x

    def __mad(self, x):
        ''' Return the robust z-score of each value of **x** using the fitted parameters. '''
        # NOTE: a fitted MAD of 0 (e.g. data that is mostly one repeated value)
        # makes this division yield inf/nan; that behaviour is left unchanged.
        return (self._as_array(x) - self.median)/self.mad

    def fit(self, x):
        ''' Estimate the robust z-score parameters (median and scaled MAD) from **x**, ignoring NaNs. '''
        x = self._as_array(x)
        self.median = np.nanmedian(x)
        # 1.4826 rescales the MAD so it is comparable to the standard deviation
        # of normally distributed data.
        self.mad = 1.4826*np.nanmedian(np.abs(x - self.median))

    def predict(self, x, m=3.0):
        '''
        Return the values of **x** that are outliers.

        **x** may be an array or a plain list/tuple; **m** is the positive score
        threshold. With `only_low_values` only values scoring below ``-m`` are
        returned, otherwise every value with ``|score| > m``.
        Raises AssertionError when **m** is not positive or **x** is empty.
        '''
        assert m > 0, 'm must be positive'
        assert len(x) > 0, 'x must not be empty'

        x = self._as_array(x)
        scores = self.__mad(x)

        if self.only_low_values:
            # One-sided detection: only unusually low values are reported.
            return x[scores < -m]
        # Two-sided detection: values far from the median in either direction.
        return x[np.abs(scores) > m]

    def decision_function(self, x):
        ''' Return the robust z-score of each value in **x**. '''
        return self.__mad(x)

    def fit_predict(self, x, m=3.0):
        ''' Fit the parameters on **x** and return the values of **x** that are outliers. '''
        self.fit(x)
        return self.predict(x, m)

In [None]:
show_doc(MAD.fit)

<h4 id="MAD.fit" class="doc_header"><code>MAD.fit</code><a href="__main__.py#L17" class="source_link" style="float:right">[source]</a></h4>

> <code>MAD.fit</code>(**`x`**)

Calcula os parametros do Zscore Robusto(Median/MAD) para os valores de **x** 

In [None]:
show_doc(MAD.predict)

<h4 id="MAD.predict" class="doc_header"><code>MAD.predict</code><a href="__main__.py#L22" class="source_link" style="float:right">[source]</a></h4>

> <code>MAD.predict</code>(**`x`**, **`m`**=*`3.0`*)

retorna se os valores de **x** são outliers 

In [None]:
show_doc(MAD.fit_predict)

<h4 id="MAD.fit_predict" class="doc_header"><code>MAD.fit_predict</code><a href="__main__.py#L41" class="source_link" style="float:right">[source]</a></h4>

> <code>MAD.fit_predict</code>(**`x`**, **`m`**=*`3.0`*)

Calcula os parametros e retorno os valores
de **x** que são outliers

In [None]:
show_doc(MAD.decision_function)

<h4 id="MAD.decision_function" class="doc_header"><code>MAD.decision_function</code><a href="__main__.py#L35" class="source_link" style="float:right">[source]</a></h4>

> <code>MAD.decision_function</code>(**`x`**)

retorna se os valores de mad para cada valor em **x**

### Uso

In [None]:
# test array: 1000 standard-normal samples; the seed is fixed so that
# re-running the notebook reproduces the same values and the same outliers
np.random.seed(42)
x = np.random.randn(1000)
x[:5]

array([ 2.03162821, -1.09820625, -1.10483056,  1.2660762 , -0.19888903])

In [None]:
# Default two-sided detection: returns the values of x whose absolute
# robust z-score exceeds the default threshold m=3.0.
mad = MAD()
mad.fit_predict(x)

array([ 3.50162285, -3.12469124,  3.22225134,  3.33958115])

In [None]:
# One-sided detection: only values whose robust z-score is below -m
# (default m=3.0) are returned.
mad = MAD(only_low_values=True)
mad.fit_predict(x)

array([-3.12469124])

## Tukey

In [None]:
#export
class Tukey():
    '''
    Outlier detector based on Tukey's fences.

    `fit` estimates the quartiles of the data and the fences
    ``min = Q1 - 1.5*IQR`` and ``max = Q3 + 1.5*IQR``; values strictly outside
    the fences are treated as outliers.

    Parameters
    ----------
    only_low_values : bool, default False
        When True, `predict` reports only values below the lower fence.

    Attributes (all None before `fit`)
    ----------------------------------
    q1, q2, q3 : float
        Lower hinge, median and upper hinge of the fitted data.
    iqr : float
        ``q3 - q1``.
    min, max : float
        Lower and upper fence.
    '''

    def __init__(self, only_low_values=False):
        self.only_low_values = only_low_values
        self.iqr = None
        self.q1 = None
        self.q2 = None
        self.q3 = None
        self.min = None
        self.max = None

    @staticmethod
    def _as_array(x):
        ''' Convert plain lists/tuples to an ndarray; return any other input unchanged. '''
        # Lists and tuples cannot be indexed with a boolean mask.
        if isinstance(x, (list, tuple)):
            return np.asarray(x)
        return x

    def fit(self, x):
        ''' Estimate the Tukey parameters (Q1, Q2, Q3, IQR and fences) from **x**, ignoring NaNs. '''
        values = np.asarray(x, dtype=float)
        # np.sort places NaNs at the end, which would shift the split point
        # between the two halves, so they are removed before sorting.
        values = np.sort(values[~np.isnan(values)])
        n = len(values)

        # Tukey hinges: for an odd number of values the median belongs to both
        # halves, keeping Q1 and Q3 symmetric around it. For an even number of
        # values the halves are simply values[:n//2] and values[n//2:].
        self.q1 = np.nanmedian(values[:(n + 1)//2])
        self.q2 = np.nanmedian(values)
        self.q3 = np.nanmedian(values[n//2:])

        self.iqr = self.q3 - self.q1
        self.min = self.q1 - 1.5*self.iqr
        self.max = self.q3 + 1.5*self.iqr

    def predict(self, x):
        '''
        Return the values of **x** that are outliers.

        A value is an outlier when it is strictly below the lower fence or,
        unless `only_low_values` is set, strictly above the upper fence.
        '''
        x = self._as_array(x)
        below = x < self.min
        if self.only_low_values:
            return x[below]
        # Strict comparison on both sides, matching `decision_function`; with
        # `>=` constant data (IQR == 0) would be flagged entirely as outliers.
        return x[below | (x > self.max)]

    def decision_function(self, x):
        '''
        Return an outlier score for each value of **x**.

        The score is ``log(1 + d)`` where ``d`` is the distance from the value
        to the fence it exceeds, and 0 for values inside the fences.
        '''
        x = self._as_array(x)
        score = np.zeros(len(x))
        below = x < self.min
        above = x > self.max
        score[below] = np.abs(x[below] - self.min)
        score[above] = np.abs(x[above] - self.max)

        return np.log(score + 1)

    def fit_predict(self, x):
        ''' Fit the parameters on **x** and return the values of **x** that are outliers. '''
        self.fit(x)
        return self.predict(x)

In [None]:
show_doc(Tukey.fit)

<h4 id="Tukey.fit" class="doc_header"><code>Tukey.fit</code><a href="__main__.py#L15" class="source_link" style="float:right">[source]</a></h4>

> <code>Tukey.fit</code>(**`x`**)

Calcula os parametros do Tukey(Q1,Q2,Q3) para os valores de **x** 

In [None]:
show_doc(Tukey.predict)

<h4 id="Tukey.predict" class="doc_header"><code>Tukey.predict</code><a href="__main__.py#L29" class="source_link" style="float:right">[source]</a></h4>

> <code>Tukey.predict</code>(**`x`**)

retorna se os valores de **x** são outliers 

In [None]:
show_doc(Tukey.decision_function)

<h4 id="Tukey.decision_function" class="doc_header"><code>Tukey.decision_function</code><a href="__main__.py#L36" class="source_link" style="float:right">[source]</a></h4>

> <code>Tukey.decision_function</code>(**`x`**)

retorna o score para os valores de **x** 

### Tukey Uso

In [None]:
# Evenly spaced values 0..9: nothing lies outside the Tukey fences,
# so no outliers are returned.
x = np.arange(10)

tu = Tukey()
tu.fit_predict(x)

array([], dtype=int64)

In [None]:
# Overwrite the first and last elements in place with extreme values.
# x is the array created in the previous cell, so that cell must run first.
x[0] = -100
x[9] = 100

# Both injected extremes fall outside the fences and are returned.
tu = Tukey()
tu.fit_predict(x)

array([-100,  100])

In [None]:
# only_low_values=True restricts detection to values below the lower fence,
# so only -100 is returned for the modified x.
tu = Tukey(only_low_values=True)
tu.fit_predict(x)

array([-100])