-
Notifications
You must be signed in to change notification settings - Fork 0
/
Driver.py
81 lines (72 loc) · 3.25 KB
/
Driver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import matplotlib
matplotlib.use("Agg")
import pandas as pd
import numpy as np
import os
import sys
from UShapelet import get_ushapelet
from timekeeper import TimeKeeper
from sklearn.metrics import adjusted_rand_score
# Labels printed for the first six entries of the per-iteration timing
# sequence returned by the shapelet search, in positional order.
_TIMING_STAGES = (
    "Subsequence",
    "SAX",
    "Collision Check",
    "Filter",
    "Gap Score",
    "Search",
)


def _parse_cli(argv):
    """Split the command line into ``(data_file, cuda_flag)``.

    Args:
        argv: Full argument vector: script name, data file name, CUDA selector.

    Returns:
        The data file name and a bool that is True only when the selector is
        the string ``"1"``.

    Raises:
        ValueError: If ``argv`` does not hold exactly three entries.
    """
    _, data_file, cuda_select = argv
    return data_file, cuda_select == "1"


def _load_dataset(source, whitespace_delimited=False):
    """Read a header-less table and split it into ``(labels, data)``.

    Args:
        source: Path or file-like object accepted by pandas.
        whitespace_delimited: Parse columns separated by runs of whitespace
            instead of commas (needed for the 'Trace.txt' example).

    Returns:
        ``labels`` is the first column, ``data`` is every remaining column.
    """
    if whitespace_delimited:
        # sep=r"\s+" is the supported replacement for the deprecated
        # delim_whitespace=True flag.
        raw_data = pd.read_table(source, sep=r"\s+", header=None)
    else:
        raw_data = pd.read_csv(source, header=None)
    values = raw_data.values
    return values[:, 0], values[:, 1:]


def _print_timings(timings):
    """Print one labelled line per timing stage, followed by a blank line."""
    print("Runtimes:")
    for position, stage in enumerate(_TIMING_STAGES):
        print("%s: %f" % (stage, timings[position]))
    print()


def _extract_clusters(data, find_shapelet):
    """Peel clusters off ``data`` one shapelet at a time.

    Args:
        data: 2-D array with one time series per row.
        find_shapelet: Callable given the rows still unassigned; must return
            ``(idx, score, members, timings)``. ``idx`` is None when no
            shapelet was found, and ``members`` indexes into the rows that
            were passed in.

    Returns:
        A float array with one entry per row of ``data``: the 1-based cluster
        number, or 0 for rows that were never assigned to a cluster.
    """
    clust_labels = np.zeros(data.shape[0])
    clust_num = 1
    # To extract only "good" clusters, track the best gap score seen so far
    # and stop once a shapelet scores below half of it; whatever is left is
    # treated as outliers belonging to no cluster.
    current_max_score = 0
    data_idcs = np.arange(data.shape[0])
    while data_idcs.shape[0] > 0:
        idx, score, members, timings = find_shapelet(data[data_idcs])
        if idx is None:
            print("Finished clustering. (No valid shapelets)")
            break
        if score > current_max_score:
            current_max_score = score
        elif score < 0.5 * current_max_score:
            print("Finished clustering. (Threshold epsilon)")
            break
        # members is relative to the current subset; map it back to rows of
        # the full data set before labelling.
        clust_labels[data_idcs[members]] = clust_num
        clust_num += 1
        # Drop the newly clustered rows and search the remainder again.
        data_idcs = np.delete(data_idcs, members, axis=0)
        _print_timings(timings)
    return clust_labels


def _count_clusters(clust_labels):
    """Count distinct cluster numbers, ignoring the 0 'unassigned' label."""
    return np.unique(clust_labels[clust_labels != 0]).shape[0]


if __name__ == "__main__":
    ROOT_DIR = os.getcwd()
    try:
        data_file, cuda_flag = _parse_cli(sys.argv)
    except ValueError:
        print("Usage python3 Driver.py <filename> <use_cuda>")
        # Signal the usage error to the calling shell with a non-zero status.
        sys.exit(1)

    # Run with CUDA if selected, otherwise don't.
    print("CUDA enable flag = %d" % cuda_flag)
    if data_file not in ["Trace.txt", "FourClasses.txt"]:
        print("Running using custom file. File will be read as a CSV.")
    file_path = os.path.join(ROOT_DIR, data_file)

    # The 'Trace.txt' example is space delimited rather than a CSV file.
    # The labels are only compared against the clustering below, so they are
    # used exactly as read (no integer cast).
    labels, data = _load_dataset(
        file_path, whitespace_delimited=(data_file == "Trace.txt")
    )

    # NOTE(review): 30 and 10 are passed positionally to get_ushapelet as in
    # the original script; their meaning is defined in UShapelet - confirm.
    clust_labels = _extract_clusters(
        data,
        lambda subset: get_ushapelet(subset, 30, 10, tk=None, use_cuda=cuda_flag),
    )

    print("%d clusters extracted." % _count_clusters(clust_labels))
    print("Rand Index (Actual vs. Expected) %f" % adjusted_rand_score(labels, clust_labels))