# Anlayzing Distance Based Outlier Detection Algorithms
This script runs and analyzes two outlier detection methods; dee-kay-en and one-time sampling.


In [1]:
%matplotlib inline

In [2]:
import kth_NN as dkn# dee-kay-en
import sampling # one-time sampling
from collections import defaultdict
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy

The following script for loading data set and computing tfidf has been borrowed from [here](http://www.markhneedham.com/blog/2015/02/15/pythonscikit-learn-calculating-tfidf-on-how-i-met-your-mother-transcripts/)

In [3]:
episodes = defaultdict(list)
with open("himym", "r") as sentences_file:
    reader = csv.reader(sentences_file, delimiter=',')
    reader.next()
    for row in reader:
        episodes[row[1]].append(row[4])
 
for episode_id, text in episodes.iteritems():
    episodes[episode_id] = "".join(text)
 
corpus = []
for id, episode in sorted(episodes.iteritems(), key=lambda t: int(t[0])):
    corpus.append(episode)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')  
tfidf_matrix =  tf.fit_transform(corpus)
feature_names = tf.get_feature_names() 

tfidf_array = tfidf_matrix.toarray()

In [13]:
rows = tfidf_array.shape[0]
cols = tfidf_array.shape[1]
cols = cols+1 # we use the first column in each row to identify each instance of the data
data = numpy.zeros(shape=(rows,cols))
data[:,0] = range(0,rows) # set ID's for data instances
data[:,1:] = tfidf_array

In [14]:
data2 = data[0:20,:]
k=5
s=10
dummy_types = ['dd']
dummy_headers = ['hed']
n=5

dkn_model = dkn.KthNN(data2,dummy_types,dummy_headers,k,n)
dkn_model.run()




Run dee-kay-en
distance 0.0
distance 1.24158980244
distance 1.26716604892
distance 1.30533784381
distance 1.29172630494
distance 1.28695322534
distance 1.29691225513
distance 1.25353956689
distance 1.3060511663
distance 1.23222644619
distance 1.31348370406
distance 1.28635829956
distance 1.28505508779
distance 1.28301626831
distance 1.31596302398
distance 1.32356263689
distance 1.30836196399
distance 1.28125584138
distance 1.30630112159
distance 1.30343776044
here: 0 1.26716604892
distance 1.24158980244
distance 0.0
distance 1.26381514306
distance 1.30043441382
distance 1.29496758841
distance 1.27531939901
distance 1.29891067928
distance 1.2548000131
distance 1.30614442892
distance 1.226890244
distance 1.3066029572
distance 1.28120495324
distance 1.27796982318
distance 1.28921367234
distance 1.32412973218
distance 1.32211657217
distance 1.31571927521
distance 1.27196162662
distance 1.30188058836
distance 1.30442028104
here: 1 1.26381514306
distance 1.26716604892
distance 1.26381514306


In [8]:
j=0
for d in data2:
    if j==0:        
        i=0
    
        for d2 in data2:
           if i==1:
                su_ = sum((d2[1:] - d[1:]) ** 2)
                total = numpy.sqrt((sum(d2[1:] - d[1:]) ** 2))
                break
           else:
                i=i+1
    else:
        break
print su_        
print total

0.0
0.0


In [12]:
data

array([[   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   1.,    0.,    0., ...,    0.,    0.,    0.],
       [   2.,    0.,    0., ...,    0.,    0.,    0.],
       ..., 
       [ 205.,    0.,    0., ...,    0.,    0.,    0.],
       [ 206.,    0.,    0., ...,    0.,    0.,    0.],
       [ 207.,    0.,    0., ...,    0.,    0.,    0.]])

In [7]:
dkn_time = dkn_model.get_running_time()

In [11]:
s=12
sampling_model = sampling.Sampling_Model(data2,dummy_types,dummy_headers,k,n,s)
sampling_model.run()

Run one time sampling
Finding outliers using one time sampling took 15.1515660286 seconds!


In [14]:
sampling_time = sampling_model.get_running_time()
print dkn_time
print sampling_time

24.2571949959
15.1515629292


In [16]:
dkn_model.get_n_outliers()

[(16, array([ 16.        ,   0.02297698,   0.        , ...,   0.        ,
           0.        ,   0.        ]), 1.3026579993545648),
 (10, array([ 10.,   0.,   0., ...,   0.,   0.,   0.]), 1.3134837040585876),
 (19, array([ 19.,   0.,   0., ...,   0.,   0.,   0.]), 1.3044202810377379),
 (3, array([ 3.,  0.,  0., ...,  0.,  0.,  0.]), 1.3093490717049621),
 (14, array([ 14.,   0.,   0., ...,   0.,   0.,   0.]), 1.3006619736589597)]

In [17]:
sampling_model.get_n_outliers()

[(0, array([ 0.,  0.,  0., ...,  0.,  0.,  0.]), 0.0),
 (1, array([ 1.,  0.,  0., ...,  0.,  0.,  0.]), 0.0),
 (2, array([ 2.,  0.,  0., ...,  0.,  0.,  0.]), 0.0),
 (3, array([ 3.,  0.,  0., ...,  0.,  0.,  0.]), 0.0),
 (4, array([ 4.,  0.,  0., ...,  0.,  0.,  0.]), 0.0)]