In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

In [26]:
# Fix the global NumPy seed so the synthetic data set is reproducible
np.random.seed(42)

# Column labels of the final table; their count is the number of variables
feature_names = ['X1', 'X2', 'X3', 'X4']
n_features = len(feature_names)

# Inliers: standard-normal draws (mean 0, standard deviation 1) for every variable
n_samples = 300
X_normal = np.random.normal(0, 1, (n_samples, n_features))

# Outliers: a handful of rows drawn uniformly between -5 and 5 on every variable
n_outliers = 10
X_outliers = np.random.uniform(-5, 5, (n_outliers, n_features))

# Stack the outlier rows underneath the inlier rows
X = np.concatenate((X_normal, X_outliers), axis=0)

# Wrap the combined array in a DataFrame with named columns
df = pd.DataFrame(X, columns=feature_names)

In [27]:
# Preview the first rows of the synthetic data set
df.head()

Unnamed: 0,X1,X2,X3,X4
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,-0.463418,-0.46573
3,0.241962,-1.91328,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304


In [28]:
# Number of nearest neighbours used to score each observation
K = 10

# Index the data set with a Euclidean-distance neighbour search
nbrs = NearestNeighbors(n_neighbors=K, metric='euclidean').fit(X)

# Query the fitted points by calling kneighbors() WITHOUT an argument:
# scikit-learn then does not count a point as its own neighbour.
# Passing X explicitly returns each point itself as the first neighbour
# (distance 0), so only K-1 real neighbours would contribute and every
# mean distance would be deflated.
distances, _ = nbrs.kneighbors()

# Outlier score: mean distance from each point to its K nearest neighbours
knn_distance = distances.mean(axis=1)

In [29]:
# Outlier criterion: the cut-off is the 95th percentile of the KNN scores,
# i.e. roughly the 5% of points lying farthest from their nearest neighbours
threshold = np.percentile(knn_distance, 95)

# Boolean mask of the points whose score is strictly above the cut-off
is_outlier = knn_distance > threshold

# Positions (row indices) of the flagged points
outlier_indices = np.flatnonzero(is_outlier)

In [30]:
# Display the row indices flagged as outliers
outlier_indices

array([ 52,  55,  65, 167, 265, 275, 290, 300, 301, 302, 303, 304, 305,
       306, 307, 309], dtype=int64)

In [31]:
# Display the feature values of the flagged rows
X[outlier_indices]

array([[ 0.51504769,  3.85273149,  0.57089051,  1.13556564],
       [ 2.31465857, -1.86726519,  0.68626019, -1.61271587],
       [-0.92693047, -0.05952536, -3.24126734, -1.02438764],
       [-2.65096981,  1.09150685,  1.24608519, -2.07339023],
       [ 0.77169871, -2.84854262,  1.1487657 , -1.73971378],
       [ 0.99801011, -2.89625538,  2.0883747 , -0.13958963],
       [-2.49940571,  2.29094257, -1.38957247, -1.64539875],
       [-4.56587467,  1.33151376,  4.51403342,  1.0161182 ],
       [ 3.19188859,  3.84206463, -2.71920228, -2.8795516 ],
       [ 1.10980989, -0.8897153 ,  3.39861303,  4.00023123],
       [-1.46578621, -2.63129441,  2.80525515, -2.25193964],
       [ 3.22614319, -0.76261746,  1.67549899, -4.04464686],
       [ 1.23859325, -0.48232321,  0.86608463, -3.31985792],
       [ 2.36873745,  3.62797078, -2.832602  , -4.04285444],
       [-4.76361414,  1.419715  ,  1.07094036,  0.46697413],
       [ 4.8778552 , -3.63560247,  1.95144554, -0.95681232]])