### Dependencies

In [1]:
import sqlite3 # Database library.
import os # Folder management library.
import pickle # Serializing module.
import numpy as np # Scientific computing library.

### Retrieve Examples from Database

In [2]:
# Load up to MAX_PER_CLASS examples from every class table of star_data.db.
#
# Produces three parallel lists (same length, same order):
#   X     - deserialized feature objects, one per star
#   Y     - single-element lists [label], label = index of the table in `classes`
#   names - the star_name column of the same row
#
# Initializing database and cursor.
star_data_db = sqlite3.connect('star_data.db')
star_data_cursor = star_data_db.cursor()

# Retrieving star_data from database.
classes = ['cep_1o', 'cep_f', 'dsct','eb_ec', 'eb_ed', 'eb_esd', 'lpv_mira_agb_c', 'lpv_mira_agb_o', 'lpv_osarg_agb',
           'lpv_osarg_rgb', 'lpv_srv_agb_c', 'lpv_srv_agb_o', 'rrab', 'rrc', 'rrd', 'rre', 't2cep']

# Upper bound on the number of rows taken from each class table.
MAX_PER_CLASS = 600

X, Y, names = [], [], []
try:
    for label, classv in enumerate(classes):
        # Features and name are read from the SAME row of a single query so the
        # three lists cannot drift out of alignment (two separate unordered
        # SELECTs are not guaranteed to return rows in the same order).
        # Table names cannot be bound as SQL parameters; classv only ever comes
        # from the hard-coded list above.
        star_data_cursor.execute('SELECT star_features, star_name FROM ' + classv)
        # fetchmany stops after MAX_PER_CLASS rows instead of materializing the whole table.
        for features_blob, star_name in star_data_cursor.fetchmany(MAX_PER_CLASS):
            # Deserializing features.
            # NOTE: pickle.loads can execute arbitrary code - only use a database you trust.
            X.append(pickle.loads(features_blob))
            Y.append([label])
            names.append(star_name)
finally:
    # Close cursor and database, even if a query or unpickling fails.
    star_data_cursor.close()
    star_data_db.close()

print('done')

done


In [3]:
%matplotlib inline
import numpy as np
import matplotlib
matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

X = np.array(X[9493:])

clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(X)
outlier_ind = np.where(y_pred == -1)[0]

print(y_pred)
print('the number of t2ceps is ', len(y_pred), '\n')

print(outlier_ind)
print(" The num of outliers is ", len(outlier_ind), '\n')

print('These are the detected outliers.')
names1 = names[9493:]
for ind in outlier_ind:
    print(names1[ind])


because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



[ 1  1  1  1 -1  1  1 -1  1  1 -1  1  1 -1  1  1  1  1 -1  1  1  1 -1  1  1
  1  1  1  1 -1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1
  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1  1 -1  1  1  1  1  1  1
  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1 -1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1 -1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1  1  1 -1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1 -1 -1  1  1  1  1  1 -1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1
  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1 -1  1  1 -1  1  1  1 -1
  1  1  1  1

### Unsupervised Outlier Detection using Local Outlier Factor (LOF)
The LOF score of an observation is the ratio of the average local density of its k-nearest neighbors to its own local density. A normal instance is expected to have a local density similar to that of its neighbors, while abnormal data are expected to have a much smaller local density. LOF can detect both local and global outliers.
<img src="img/lof.png" width="400" alt="Local Outlier Factor illustration">


In [None]:
%matplotlib inline
import numpy as np
import matplotlib
matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

np.random.seed(42)


# Generate train data
X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X + 2, X - 2, X_outliers]

print(X)
print(len(X))

# fit the model
clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(X)
y_pred_outliers = y_pred[200:]

print(y_pred)
print(len(y_pred))

# plot the level sets of the decision function
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Local Outlier Factor (LOF)")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

a = plt.scatter(X[:200, 0], X[:200, 1], c='white',
                edgecolor='k', s=20)
b = plt.scatter(X[200:, 0], X[200:, 1], c='red',
                edgecolor='k', s=20)
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a, b],
           ["normal observations",
            "abnormal observations"],
           loc="upper left")
plt.show()