### Dependencies

In [32]:
import sqlite3 # Database library.
import os # Folder management library.
import pickle # Serializing module.
import numpy as np # Scientific computing library.
import matplotlib.pyplot as plt # Plotting library.
from mpl_toolkits.mplot3d import Axes3D # 3D plotting tool.
from sklearn.neighbors import LocalOutlierFactor # Outlier Dection library.
from heapq import nsmallest # For finding closest period inlier to outlier

### Retrieve Examples from Database

In [33]:
# Load up to MAX_EXAMPLES_PER_CLASS examples of every variable-star class from
# the SQLite database. After this cell:
#   X[i]     -> array of deserialized feature vectors for classes[i]
#   Y[i]     -> column array holding the integer label i, one row per example
#   names[i] -> list of star names, aligned row-for-row with X[i]

# Upper limit on the number of examples read per class.
MAX_EXAMPLES_PER_CLASS = 600

# Each class name is also the name of a table in star_data.db.
classes = ['cep_1o', 'cep_f', 'dsct','eb_ec', 'eb_ed', 'eb_esd', 'lpv_mira_agb_c', 'lpv_mira_agb_o', 'lpv_osarg_agb',
           'lpv_osarg_rgb', 'lpv_srv_agb_c', 'lpv_srv_agb_o', 'rrab', 'rrc', 'rrd', 'rre', 't2cep']

X, Y, names = [], [], []

# Initializing database and cursor.
star_data_db = sqlite3.connect('star_data.db')
try:
    star_data_cursor = star_data_db.cursor()
    try:
        for label, classv in enumerate(classes):
            # Table names cannot be bound as SQL parameters; classv only ever
            # comes from the hard-coded list above, so concatenation is safe here.
            # Name and features are read in ONE query so that names[i][k] is
            # guaranteed to describe X[i][k] (two separate unordered SELECTs
            # are not guaranteed to return rows in the same order).
            star_data_cursor.execute('SELECT star_name, star_features FROM ' + classv)
            rows = star_data_cursor.fetchmany(MAX_EXAMPLES_PER_CLASS)

            # Deserializing features.
            # NOTE(review): pickle.loads can execute arbitrary code; only use
            # this with a database file from a trusted source.
            X.append(np.array([pickle.loads(features) for _, features in rows]))
            Y.append(np.array([[label] for _ in rows]))
            names.append([star_name for star_name, _ in rows])
    finally:
        # Close cursor (the original referenced .close without calling it).
        star_data_cursor.close()
finally:
    # Close database even if a query or deserialization failed.
    star_data_db.close()

print('done')

done


### Unsupervised Outlier Detection using Local Outlier Factor (LOF)
The LOF score of an observation is equal to the ratio of the average local density of its k-nearest neighbors, and its own local density: a normal instance is expected to have a local density similar to that of its neighbors, while abnormal data are expected to have much smaller local density. LOF can detect both local and global outliers.
<img src="img/lof.png" width="400" height="auto">


### 10 Best Outlier Candidates

In [34]:
# Constructing data: working copies of the t2cep class (index 16 in `classes`).
# X_t2cep / names_t2cep shrink by one entry per iteration below; X[16] and
# names[16] themselves are left untouched.
names_t2cep = names[16]
X_t2cep = X[16]

# How many outliers to extract.
N_BEST_OUTLIERS = 10

best_outliers = []

# Iteratively applying the LOF algorithm: each pass finds the single most
# anomalous star, records its name and removes it, so the next pass scores the
# remaining stars without its influence.
for _ in range(N_BEST_OUTLIERS):

    # Fitting the Local Outlier Factor model to the current sample.
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X_t2cep)

    # negative_outlier_factor_ is the opposite of the LOF score, so the
    # smallest value marks the most abnormal sample. Taking the argmin always
    # yields exactly one index, unlike thresholding predictions with
    # contamination=1/len(X), which flags nothing when the two lowest scores
    # tie (IndexError) and may flag more than is subsequently removed.
    worst_ind = int(np.argmin(lof.negative_outlier_factor_))

    # Record the outlier's name as a plain str (names_t2cep becomes a numpy
    # array after the first np.delete).
    best_outliers.append(str(names_t2cep[worst_ind]))

    # Remove outlier from data set.
    X_t2cep = np.delete(X_t2cep, worst_ind, 0)
    names_t2cep = np.delete(names_t2cep, worst_ind, 0)

print('The 10 best outliers are: ', best_outliers)



The 10 best outliers are:  ['OGLE-LMC-T2CEP-113.dat', 'OGLE-BLG-T2CEP-177.dat', 'OGLE-LMC-T2CEP-115.dat', 'OGLE-BLG-T2CEP-351.dat', 'OGLE-BLG-T2CEP-345.dat', 'OGLE-BLG-T2CEP-354.dat', 'OGLE-LMC-T2CEP-200.dat', 'OGLE-BLG-T2CEP-350.dat', 'OGLE-BLG-T2CEP-352.dat', 'OGLE-LMC-T2CEP-061.dat']


### Similar Period Inliers to Outliers

In [35]:
from pprint import pprint

# Names of the columns of every feature vector, in column order.
feature_names = ['Amplitude', 'AndersonDarling', 'Autocor_length', 'Beyond1Std', 'CAR_mean', 'CAR_sigma', 'CAR_tau', 
                 'Con', 'Eta_e', 'FluxPercentileRatioMid20', 'FluxPercentileRatioMid35', 'FluxPercentileRatioMid50',
                 'FluxPercentileRatioMid65','FluxPercentileRatioMid80', 'Freq1_harmonics_amplitude_0',
                 'Freq1_harmonics_amplitude_1', 'Freq1_harmonics_amplitude_2', 'Freq1_harmonics_amplitude_3',
                 'Freq1_harmonics_rel_phase_0', 'Freq1_harmonics_rel_phase_1', 'Freq1_harmonics_rel_phase_2',
                 'Freq1_harmonics_rel_phase_3', 'Freq2_harmonics_amplitude_0', 'Freq2_harmonics_amplitude_1', 
                 'Freq2_harmonics_amplitude_2', 'Freq2_harmonics_amplitude_3', 'Freq2_harmonics_rel_phase_0',
                 'Freq2_harmonics_rel_phase_1', 'Freq2_harmonics_rel_phase_2', 'Freq2_harmonics_rel_phase_3',
                 'Freq3_harmonics_amplitude_0', 'Freq3_harmonics_amplitude_1', 'Freq3_harmonics_amplitude_2', 
                 'Freq3_harmonics_amplitude_3', 'Freq3_harmonics_rel_phase_0', 'Freq3_harmonics_rel_phase_1',
                 'Freq3_harmonics_rel_phase_2', 'Freq3_harmonics_rel_phase_3', 'Gskew', 'LinearTrend', 'MaxSlope',
                 'Mean', 'Meanvariance', 'MedianAbsDev', 'MedianBRP', 'PairSlopeTrend', 'PercentAmplitude', 
                 'PercentDifferenceFluxPercentile', 'Period_fit', 'PeriodLS', 'Psi_CS', 'Psi_eta', 'Q31', 'Rcs',
                 'Skew', 'SlottedA_length', 'SmallKurtosis', 'Std', 'StetsonK', 'StetsonK_AC', 'StructureFunction_index_21',
                 'StructureFunction_index_31', 'StructureFunction_index_32', 'Colour']

# Period_fit value of every t2cep star, in the same order as names[16].
t2cep_periods = X[16][:,feature_names.index('Period_fit')].tolist()

# similar_inliers[k] is a list of (star_name, period) tuples for the stars
# whose period is closest to that of best_outliers[k].
similar_inliers = []

for outlier_name in best_outliers:

    ind_outlier = names[16].index(outlier_name)
    outlier_period = t2cep_periods[ind_outlier]

    # Find the 11 stars with the most similar period. The outlier itself is
    # part of the candidate list (distance 0), so this is the outlier plus its
    # 10 closest stars. Other outliers are not filtered out of the candidates.
    # Ranking star *indices* (rather than period values) keeps the name lookup
    # correct when two stars share exactly the same period; looking a period
    # value up again with list.index() would always return the first such star.
    closest_inds = nsmallest(11, range(len(t2cep_periods)),
                             key=lambda i: abs(t2cep_periods[i] - outlier_period))

    similar_inliers.append([(names[16][i], t2cep_periods[i]) for i in closest_inds])

print(max(t2cep_periods))
pprint(similar_inliers)

2836.0077299999994
[[('OGLE-LMC-T2CEP-113.dat', 3.0853123206644857),
  ('OGLE-LMC-T2CEP-073.dat', 3.088140338248049),
  ('OGLE-BLG-T2CEP-052.dat', 3.0884557844690974),
  ('OGLE-BLG-T2CEP-316.dat', 3.126295818584072),
  ('OGLE-BLG-T2CEP-141.dat', 3.164786926994907),
  ('OGLE-BLG-T2CEP-099.dat', 2.9913462424347226),
  ('OGLE-BLG-T2CEP-152.dat', 3.1972930245854765),
  ('OGLE-SMC-T2CEP-09.dat', 2.9712227025171627),
  ('OGLE-BLG-T2CEP-115.dat', 3.200648819680078),
  ('OGLE-BLG-T2CEP-070.dat', 2.9358741708542717),
  ('OGLE-LMC-T2CEP-049.dat', 3.23516969893993)],
 [('OGLE-BLG-T2CEP-177.dat', 2836.0077299999994),
  ('OGLE-BLG-T2CEP-345.dat', 1132.0631959999998),
  ('OGLE-BLG-T2CEP-354.dat', 969.626286),
  ('OGLE-LMC-T2CEP-200.dat', 842.58056625),
  ('OGLE-BLG-T2CEP-350.dat', 724.2620452173913),
  ('OGLE-BLG-T2CEP-352.dat', 539.2578881632654),
  ('OGLE-LMC-T2CEP-045.dat', 127.16486839285714),
  ('OGLE-LMC-T2CEP-147.dat', 93.12213764705884),
  ('OGLE-BLG-T2CEP-340.dat', 85.65917557894737),
  ('O

### Comparing Outlier Feature to Mean and Std of the Data

In [36]:
# Per-feature mean and standard deviation over the whole t2cep sample.
t2cep_features = X[16]
mean_features = t2cep_features.mean(axis=0)
std_features = t2cep_features.std(axis=0)

# One-sigma band around the mean, compared in absolute value.
# NOTE(review): taking |mean + std| and |mean - std| (and |feature| below)
# is not the same as testing the signed value against the signed band when a
# feature can be negative -- confirm this is the intended comparison.
upper_bound = np.abs(mean_features + std_features)
lower_bound = np.abs(mean_features - std_features)

for n in range(10):
    outlier_name = best_outliers[n]
    print(outlier_name)

    # Absolute feature vector of this outlier.
    ind_outlier = names[16].index(outlier_name)
    outlier_features = np.abs(X[16][ind_outlier])

    # Indices of the features falling outside the band on either side.
    ind_upper = np.flatnonzero(outlier_features > upper_bound)
    ind_lower = np.flatnonzero(outlier_features < lower_bound)

    # Report only features more than twice the upper bound.
    print('These features of the outlier are bigger than mean+std')
    upper_ratios = outlier_features[ind_upper] / upper_bound[ind_upper]
    for ind, ratio in zip(ind_upper, upper_ratios):
        if ratio > 2:
            print(feature_names[ind], ratio)

    print(' ')

    # Report only features below a tenth of the lower bound.
    print('These features of the outlier are smaller than mean-std')
    lower_ratios = outlier_features[ind_lower] / lower_bound[ind_lower]
    for ind, ratio in zip(ind_lower, lower_ratios):
        if ratio < 0.1:
            print(feature_names[ind], ratio)

    print(' ')
    


OGLE-LMC-T2CEP-113.dat
These features of the outlier are bigger than mean+std
Eta_e 15.5658855723
MaxSlope 6.42506511358
Psi_eta 2.79413244958
 
These features of the outlier are smaller than mean-std
CAR_tau 0.00939386637319
Freq1_harmonics_amplitude_3 0.0345021801956
Freq1_harmonics_rel_phase_3 0.0317674862013
Freq2_harmonics_rel_phase_1 0.0678646868643
Gskew 0.0990501597169
LinearTrend 0.000224568175184
Period_fit 0.026496890858
PeriodLS 4.31168929213e-47
SlottedA_length 0.000714746680317
 
OGLE-BLG-T2CEP-177.dat
These features of the outlier are bigger than mean+std
Amplitude 3.27341722477
Autocor_length 10.6866224696
CAR_tau 4.12212238256
Freq1_harmonics_amplitude_0 3.53885456468
Freq1_harmonics_amplitude_2 2.41324656932
Freq2_harmonics_amplitude_0 3.30740434832
Freq2_harmonics_amplitude_3 3.91021391342
Freq3_harmonics_amplitude_0 3.12821523478
Gskew 6.15996986017
LinearTrend 12.3051501857
Meanvariance 3.43231733058
PercentAmplitude 4.56809005412
PercentDifferenceFluxPercentile 3.