# Local Outlier Factor

The Local Outlier Factor (LOF) algorithm is an unsupervised anomaly detection method which computes the local density deviation of a given data point with respect to its neighbors. It considers as outliers the samples that have a substantially lower density than their neighbors. 

In [1]:
# load the dataset
from pandas import read_csv
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = read_csv(url, header=None).values

In [2]:
x=df[:,:13]
y=df[:,13:]

In [52]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor


# fit the model for outlier detection (default)
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
# use fit_predict to compute the predicted labels of the training samples
# (when LOF is used for outlier detection, the estimator has no predict,
# decision_function and score_samples methods).

y_pred = clf.fit_predict(x)
mask = y_pred != -1

x_final,y_final=x[mask,:],y[mask]
print("len of data before appying LocalOutlierFactor :",len(x))
print("len of data after appying LocalOutlierFactor :",len(x_final))

len of data before appying LocalOutlierFactor : 506
len of data after appying LocalOutlierFactor : 455


# OneClassSVM

In [54]:
from sklearn.svm import OneClassSVM
clf_svm = OneClassSVM(gamma='auto').fit(x)
y_pred_svm=clf.fit_predict(x)
mask_svm=y_pred_svm !=-1
x_clean,y_clean=x[mask_svm,:],y[mask_svm]
print("len of data before appying LocalOutlierFactor :",len(x))
print("len of data after appying LocalOutlierFactor :",len(x_clean))

len of data before appying LocalOutlierFactor : 506
len of data after appying LocalOutlierFactor : 455


# High Dimensional  Outlier Detection

In [16]:
# PCA
from sklearn.decomposition import KernelPCA

n_components = 2

# Create PCA with components number
pca = KernelPCA(n_components = n_components, kernel="rbf", gamma=0.0433,
 fit_inverse_transform=True)
# fit transform with PCA on dataset
pca_dataset_7 = pca.fit_transform(df)
# inverse transform back to regular dataset 
inverse_transform_dataset_7 = pca.inverse_transform(pca_dataset_7)

print("dataset_7 shape",df.shape)
print("pca_dataset_7 shape",pca_dataset_7.shape)
print("inverse_transform_dataset_7 shape",inverse_transform_dataset_7.shape)


dataset_7 shape (506, 14)
pca_dataset_7 shape (506, 2)
inverse_transform_dataset_7 shape (506, 14)


In [31]:
# Check the diffrent between X and the inverse_transform_X
MSE_score = ((df-inverse_transform_dataset_7)**2).sum(axis=1)

In [37]:
#taking indices of 9 rows having larger mse
import numpy as np
ind = np.argpartition(MSE_score, -9)[-9:]
for i in ind:
    print(df[i])
    

[  8.20058   0.       18.1       0.        0.713     5.936    80.3
   2.7792   24.      666.       20.2       3.5      16.94     13.5    ]
[1.3810e-02 8.0000e+01 4.6000e-01 0.0000e+00 4.2200e-01 7.8750e+00
 3.2000e+01 5.6484e+00 4.0000e+00 2.5500e+02 1.4400e+01 3.9423e+02
 2.9700e+00 5.0000e+01]
[4.5900e-02 5.2500e+01 5.3200e+00 0.0000e+00 4.0500e-01 6.3150e+00
 4.5600e+01 7.3172e+00 6.0000e+00 2.9300e+02 1.6600e+01 3.9690e+02
 7.6000e+00 2.2300e+01]
[5.6020e-02 0.0000e+00 2.4600e+00 0.0000e+00 4.8800e-01 7.8310e+00
 5.3600e+01 3.1992e+00 3.0000e+00 1.9300e+02 1.7800e+01 3.9263e+02
 4.4500e+00 5.0000e+01]
[9.6040e-02 4.0000e+01 6.4100e+00 0.0000e+00 4.4700e-01 6.8540e+00
 4.2800e+01 4.2673e+00 4.0000e+00 2.5400e+02 1.7600e+01 3.9690e+02
 2.9800e+00 3.2000e+01]
[1.9657e-01 2.2000e+01 5.8600e+00 0.0000e+00 4.3100e-01 6.2260e+00
 7.9200e+01 8.0555e+00 7.0000e+00 3.3000e+02 1.9100e+01 3.7614e+02
 1.0150e+01 2.0500e+01]
[1.4231e-01 0.0000e+00 1.0010e+01 0.0000e+00 5.4700e-01 6.2540e+00
 8.4