In [None]:
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Reshape
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Input
from collections import Counter
from numpy import savez_compressed
from datetime import date
from numpy import load
import numpy as np
import matplotlib.pyplot as plt

# Mount Google Drive into the Colab filesystem; every path below lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')


# Google Colab file paths (absolute paths inside the mounted Drive).
# NPZ_WINDOWS is where the compressed window archives are read from and written to;
# MODELS is where the Keras checkpoint is saved to and loaded from.
# CSV_SEQUENCES is not referenced anywhere in the visible part of this notebook.
CSV_SEQUENCES = "/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/new_csv_sequences"
NPZ_WINDOWS = "/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/npz_windows"
MODELS = "/content/gdrive/MyDrive/Colab Notebooks/iot_device_classification/models"



# For Google Colab: check for a GPU.
# `!nvidia-smi` is an IPython shell capture (not plain Python); the captured
# output lines are joined into one string and searched for the word 'failed'.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
# Nothing is printed when the check passes; uncomment to dump the nvidia-smi report.
# else:
#   print(gpu_info)



# For Google Colab: report the runtime's RAM and whether it counts as "high-RAM".
# The figure comes from psutil's `total` field, expressed in units of 1e9 bytes;
# 20 of those units is the threshold used for the high-RAM message.
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
print('You are using a high-RAM runtime!' if not ram_gb < 20 else 'Not using a high-RAM runtime')

Mounted at /content/gdrive
Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
def load_data_make_split(npz_file, train_percentage):
    """
    Load training data (windows + one-hot labels) from compressed file. Split data into train and test set

    The split is positional: the first `train_percentage` share of the rows (in
    file order, no shuffling) becomes the train set, the remainder the test set.

    Arguments:
        - npz_file: The path to the *.npz file; it must contain the arrays 'x' and 'y'
        - train_percentage: the percentage of data used for training (and not testing), e.g. 0.8
    Returns:
        A 4-tuple of train and test data with labels: (x_train, y_train, x_test, y_test)
    """
    # The archive object keeps the file open until closed; both arrays are read
    # inside the `with` block so the handle is released as soon as they are in memory.
    with load(npz_file) as dict_data:
        x = dict_data['x']
        y = dict_data['y']

    train_length = int(len(x) * train_percentage)
    x_train = x[:train_length]
    y_train = y[:train_length]
    x_test = x[train_length:]
    y_test = y[train_length:]
    return (x_train, y_train, x_test, y_test)

# test load_data_make_split()
# Loads the full archive and keeps the first 80% of the rows for training and the
# last 20% for testing. The four arrays become notebook-level variables used by later cells.
x_train, y_train, x_test, y_test = load_data_make_split("{}/update_new_feature_all_days_all_devices.npz".format(NPZ_WINDOWS), 0.8)
# NOTE(review): "windws" is a typo for "windows" in the printed text; left as is
# so the recorded cell output still matches.
print("shape of train windws: {}".format(x_train.shape))
print("shape of train labels: {}".format(y_train.shape))
print("shape of test windows: {}".format(x_test.shape))
print("shape of test labels: {}".format(y_test.shape))

shape of train windws: (1071459, 200, 8)
shape of train labels: (1071459, 1, 28)
shape of test windows: (267865, 200, 8)
shape of test labels: (267865, 1, 28)


In [None]:
# Count how many windows each class has (train and test together) and keep the
# classes that have more than MIN_WINDOWS_PER_CLASS windows.
MIN_WINDOWS_PER_CLASS = 10000

# Labels are stored as (n, 1, num_classes) one-hot rows; argmax over the last
# axis gives the class index of every window in one vectorised pass (this
# replaces a per-sample np.where loop over more than a million rows).
# Assumes every row has exactly one hot entry -- TODO confirm for new archives.
all_labels = np.concatenate([
    np.argmax(y_train[:, 0, :], axis=1),
    np.argmax(y_test[:, 0, :], axis=1),
]).tolist()

unique, count = np.unique(all_labels, return_counts = True)
useable_data = []      # class indices that pass the threshold
data_pair = dict()     # class index -> number of windows, for those classes
for label, n_windows in zip(unique, count):
  if n_windows > MIN_WINDOWS_PER_CLASS:
    useable_data.append(label)
    data_pair[label] = n_windows

# (class index, count) pairs ordered from the rarest to the most frequent class.
sorted_data_by_counts = sorted(data_pair.items(), key=lambda item: item[1])
print(sorted_data_by_counts)

[(24, 12041), (15, 12270), (6, 14034), (14, 15738), (2, 34720), (0, 36454), (27, 55804), (7, 61729), (1, 67218), (4, 68597), (10, 102906), (19, 111726), (22, 197876), (23, 228604), (5, 275855)]


In [None]:
# Device name for every original class index (0..27). It is looked up with the
# original (pre-remapping) class indices when the index-to-name table is built.
# Names are not unique: 'Android Phone' appears for both 21 and 24.
all_device_names_dict = {0:'Smart Things', 1: 'Amazon Echo', 2:'Netatmo Welcome',3:'TP-Link Day Night Cloud camera', 4:'Samsung SmartCam', 5: 'Dropcam', 6: 'Withings Smart Baby Monitor', 7:'Belkin Wemo switch', 8:'TP-Link Smart plug',
                         9: 'iHome', 10:'Belkin wemo motion sensor', 11:'NEST Protect smoke alarm', 12:'Netatmo weather station',13:'Withings Smart scale',14:'Withings Aura smart sleep sensor',15:'Light Bulbs LiFX Smart Bulb',
                         16: 'Triby Speaker', 17:'PIX-STAR Photo-frame', 18 : 'HP Printer', 19: 'Samsung Galaxy Tab', 20: 'Nest Dropcam', 21:'Android Phone', 22:'Laptop', 23:'MacBook', 24:'Android Phone',
                         25: 'IPhone', 26:'MacBook/Iphone', 27:'Insteon Camera'}

In [None]:
import random

# Original class indices that take part in the experiment.
total_av_idx = [0, 1, 2, 5, 6, 7, 10, 14, 15, 22, 23, 24]

# The held-out ("unseen") classes are drawn from a dedicated, seeded generator so
# that a fresh Restart & Run All reproduces the same split and therefore the same
# downstream numbers. Change UNSEEN_SPLIT_SEED to draw a different split.
UNSEEN_SPLIT_SEED = 42
NUM_UNSEEN_CLASSES = 2
unseen_idx = random.Random(UNSEEN_SPLIT_SEED).sample(total_av_idx, NUM_UNSEEN_CLASSES)
# Uncomment to pin the unseen classes by hand instead of drawing them:
# unseen_idx = [23,2]

# Every remaining class, in the order of total_av_idx, is a "seen" class.
seen_idx = [i for i in total_av_idx if i not in unseen_idx]
print(seen_idx)
print(unseen_idx)

[0, 1, 5, 6, 10, 14, 15, 22, 23, 24]
[7, 2]


In [None]:
# Number of seen / unseen classes, taken from the two index lists.
num_seen, num_unseen = len(seen_idx), len(unseen_idx)


# One-hot encoding of a single class index.
def idxtoOneHot(idx, length):
  """Return a (1, length) array of zeros with a 1 at column `idx`."""
  one_hot_row = np.zeros((1, length))
  one_hot_row[0, idx] = 1
  return one_hot_row

# Dictionary union helper.
def Merge(dict1, dict2):
  """Return a new dict holding the entries of dict1 followed by those of dict2.

  On duplicate keys the value from dict2 wins; neither input is modified.
  """
  merged = dict(dict1)
  merged.update(dict2)
  return merged


# Build the lookup tables that renumber original class indices.
def generate_label_dict(unseen_idx, seed_idx, device_names=None):
  """Renumber the seen and unseen classes into one contiguous label range.

  Seen classes get the new indices 0..len(seed_idx)-1 in list order; unseen
  classes follow directly after them, also in list order.

  Arguments:
      - unseen_idx: original class indices of the unseen classes (any length)
      - seed_idx: original class indices of the seen classes. The function works
        on this argument; it no longer reads the notebook-level `seen_idx` /
        `num_seen`, and no longer assumes exactly two unseen classes.
      - device_names: optional mapping original class index -> device name;
        defaults to the notebook-level `all_device_names_dict`.
  Returns:
      (unseen_dict, seen_dict, total_dict, all_index_to_name) where
      - unseen_dict / seen_dict map original index -> new index,
      - total_dict is their union (unseen entries first),
      - all_index_to_name maps new index -> device name (seen entries first).
  """
  if device_names is None:
    device_names = all_device_names_dict

  seen_classes = list(seed_idx)
  unseen_classes = list(unseen_idx)
  first_unseen = len(seen_classes)

  seen_dict = {}
  seen_index_to_name = {}
  for new_index, original in enumerate(seen_classes):
    seen_dict[original] = new_index
    seen_index_to_name[new_index] = device_names[original]

  unseen_dict = {}
  unseen_index_to_name = {}
  for position, original in enumerate(unseen_classes):
    unseen_dict[original] = first_unseen + position
    unseen_index_to_name[first_unseen + position] = device_names[original]

  total_dict = {**unseen_dict, **seen_dict}
  all_index_to_name = {**seen_index_to_name, **unseen_index_to_name}
  return unseen_dict, seen_dict, total_dict,all_index_to_name


def _select_and_relabel(windows, one_hot_labels, index_map):
  """Keep the windows whose class is a key of `index_map` and re-encode their labels.

  Arguments:
      - windows: array whose first axis indexes the samples
      - one_hot_labels: array of shape (n, 1, num_original_classes)
      - index_map: dict original class index -> new class index
  Returns:
      (kept_windows, new_one_hot); new_one_hot is a float array of shape
      (n_kept, 1, len(index_map)) with a single 1 per row at the new class index.
  """
  windows = np.asarray(windows)
  one_hot_labels = np.asarray(one_hot_labels)
  num_new_classes = len(index_map)
  if len(one_hot_labels) == 0:
    return windows[:0], np.zeros((0, 1, num_new_classes))

  num_original_classes = one_hot_labels.shape[-1]
  # Column of the hot entry of every label row.
  original_idx = np.argmax(one_hot_labels[:, 0, :], axis=1)

  # Lookup table original index -> new index; -1 marks classes that are dropped.
  lookup = np.full(num_original_classes, -1, dtype=np.int64)
  for original, new in index_map.items():
    if 0 <= original < num_original_classes:
      lookup[original] = new

  new_idx = lookup[original_idx]
  keep = new_idx >= 0
  kept_new_idx = new_idx[keep]

  new_one_hot = np.zeros((kept_new_idx.size, 1, num_new_classes))
  new_one_hot[np.arange(kept_new_idx.size), 0, kept_new_idx] = 1
  return windows[keep], new_one_hot


def seen_training_data_for_LSTM(init_x_train, init_x_test, init_y_train, init_y_test, seen_dict):
  """Restrict the train/test data to the seen classes and renumber their labels.

  The set of seen classes and the width of the new one-hot labels are both taken
  from `seen_dict` (original index -> new index), not from notebook-level lists.

  Returns:
      (x_train_feature, y_train_feature, x_test_feature, y_test_feature)
  """
  x_train_feature, y_train_feature = _select_and_relabel(init_x_train, init_y_train, seen_dict)
  x_test_feature, y_test_feature = _select_and_relabel(init_x_test, init_y_test, seen_dict)
  return x_train_feature, y_train_feature, x_test_feature, y_test_feature

def feature_extraction_data_seen_and_unseen(init_x_train, init_x_test, init_y_train, init_y_test, attr_dict):
  """Restrict the train/test data to all classes in `attr_dict` and renumber their labels.

  Same selection as seen_training_data_for_LSTM, but intended for the mapping
  that covers seen and unseen classes together; the one-hot width is
  len(attr_dict).

  Returns:
      (x_train_attr, y_train_attr, x_test_attr, y_test_attr)
  """
  x_train_attr, y_train_attr = _select_and_relabel(init_x_train, init_y_train, attr_dict)
  x_test_attr, y_test_attr = _select_and_relabel(init_x_test, init_y_test, attr_dict)
  return x_train_attr,  y_train_attr, x_test_attr, y_test_attr


In [None]:
# Build the original-index -> new-index tables for the unseen, seen and combined
# class sets, plus the new-index -> device-name table.
unseen_dict, seen_dict,total_dict,all_index_to_name = generate_label_dict(unseen_idx, seen_idx)

In [None]:
# *_feature: seen classes only (used to train the network below).
# *_attr: seen and unseen classes together, labelled through total_dict.
x_train_feature, y_train_feature, x_test_feature, y_test_feature = seen_training_data_for_LSTM(x_train, x_test, y_train, y_test, seen_dict)
x_train_attr,  y_train_attr, x_test_attr, y_test_attr = feature_extraction_data_seen_and_unseen(x_train, x_test, y_train, y_test, total_dict)

In [None]:
# Show the four lookup tables built by generate_label_dict.
print('unseen dictionary', unseen_dict)
print('seen dictionary', seen_dict)
print('total dictionary', total_dict)
print('index to name dictionary', all_index_to_name)

# Shapes of the seen + unseen ("attr") arrays.
print("shape of train windws: {}".format(x_train_attr.shape))
print("shape of train labels: {}".format(y_train_attr.shape))
print("shape of test windows: {}".format(x_test_attr.shape))
print("shape of test labels: {}".format(y_test_attr.shape))

# Shapes of the seen-only ("feature") arrays.
# NOTE(review): the printed text is identical to the group above, so the two
# groups can only be told apart by their order in the output.
print("shape of train windws: {}".format(x_train_feature.shape))
print("shape of train labels: {}".format(y_train_feature.shape))
print("shape of test windows: {}".format(x_test_feature.shape))
print("shape of test labels: {}".format(y_test_feature.shape))

In [None]:
# define model
# Hyper-parameters of the seen-class classifier.
# NOTE(review): CLASSES is the width of the softmax layer and is hard-coded to 10;
# presumably it must equal the number of seen classes -- confirm if the
# seen/unseen split changes. `unseen_number` and `today` are not used in this cell.
CLASSES=10
PATIENCE= 10
EPOCHS= 30
BATCH_SIZE = 64
DROPOUT = 0.2
LEARNING_RATE=0.0001
unseen_number = 12 - CLASSES
today = date.today()


# Bidirectional LSTM -> 20-unit layer ('My_Feature') -> dropout -> 3-unit layer
# ('My_Atrribute') -> softmax over CLASSES. The two named layers are looked up by
# name later in the notebook, so the spelling 'My_Atrribute' must stay in sync.
# The final Reshape turns the (batch, CLASSES) output into (batch, 1, CLASSES),
# matching the (n, 1, classes) layout of the one-hot labels.
model_feature = tf.keras.models.Sequential([
    Input(shape=x_train_feature[0].shape, dtype = tf.float32),
    # Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l2(1e-4))),
    Bidirectional(LSTM(64, return_sequences=False, kernel_regularizer=l2())),
    # Dense(units=32, activation = 'relu', kernel_regularizer=l2()),
    Dense(units=20, activation = 'relu', kernel_regularizer=l2(), name = 'My_Feature'),
    Dropout(DROPOUT),
    Dense(units=3, activation = 'relu', kernel_regularizer=l2(), name = 'My_Atrribute'),
    Dense(units=CLASSES, activation = 'softmax', kernel_regularizer=l2()),
    Reshape([1, -1]),
])

# Stop when val_loss has not improved for PATIENCE epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE, mode='min')

# Keep only the model with the lowest val_loss under MODELS.
checkpoint = ModelCheckpoint("{}/deft_LSTM_Kmeans_RF_10-2_test".format(MODELS), monitor='val_loss', verbose=0,
                                    save_best_only=True, mode='min')

model_feature.compile(loss='categorical_crossentropy',
                   optimizer=Adam(learning_rate = LEARNING_RATE),
                   metrics=['accuracy'])



# do training without unseen classes
# NOTE(review): the test split is passed as validation data, so early stopping
# and checkpoint selection are driven by the same data that is reported as "test".
history = model_feature.fit(x_train_feature, y_train_feature, epochs=EPOCHS,
                    validation_data=(x_test_feature,y_test_feature),
                    callbacks=[early_stopping, checkpoint],
                    batch_size = BATCH_SIZE)

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for curve in ('loss', 'val_loss'):
  ax.plot(history.history[curve])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Training and validation accuracy per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for curve in ('accuracy', 'val_accuracy'):
  ax.plot(history.history[curve])
ax.set_title('model acc')
ax.set_ylabel('acc')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# NOTE(review): these constants repeat the ones of the training cell with
# different values (PATIENCE, EPOCHS, LEARNING_RATE); nothing in the visible
# part of the notebook reads them after this point -- confirm before relying on them.
CLASSES=10
PATIENCE= 5
EPOCHS= 20
BATCH_SIZE = 64
DROPOUT = 0.2
LEARNING_RATE=0.001
unseen_number = 12 - CLASSES

# Reload the checkpoint written during training (same path as the ModelCheckpoint callback).
model_feature = tf.keras.models.load_model("{}/deft_LSTM_Kmeans_RF_10-2_test".format(MODELS))

In [None]:
# take the layer out
# Two sub-models that share the trained network's input and stop at the named
# intermediate layers: the 20-unit 'My_Feature' and the 3-unit 'My_Atrribute'.
latent_space_model = tf.keras.Model(inputs = model_feature.input, outputs = model_feature.get_layer('My_Feature').output)
attr_model = tf.keras.Model(inputs = model_feature.input, outputs = model_feature.get_layer('My_Atrribute').output)

# Seen + unseen windows and labels, train followed by test.
new_x = np.concatenate((x_train_attr,x_test_attr))
new_y = np.concatenate((y_train_attr,y_test_attr))
print(new_x.shape)
print(new_y.shape)


EMBED_BATCH_SIZE = 1024
PROGRESS_EVERY_N_BATCHES = 100   # print the sample offset every 100 batches

# Run both sub-models batch by batch and collect the outputs as numpy arrays.
# - Slicing past the end of an array is safe, so the last (short) batch needs no
#   special case.
# - Batches are appended and concatenated once at the end; rebuilding the whole
#   list for every batch was quadratic in the number of samples.
# - Progress is counted in batches: the sample offset advances in steps of 1024
#   and is therefore never a multiple of 100000 again after 0.
embedding_batches = []
attr_batches = []
for batch_no, start in enumerate(range(0, new_x.shape[0], EMBED_BATCH_SIZE)):
  if batch_no % PROGRESS_EVERY_N_BATCHES == 0:
    print(start)
  batch = new_x[start:start + EMBED_BATCH_SIZE, :, :]
  embedding_batches.append(latent_space_model(batch).numpy())
  attr_batches.append(attr_model(batch).numpy())

# One row per window, in the same order as new_x / new_y. These are numpy arrays
# (previously Python lists of per-sample tensors); indexing with [i] still
# yields the vector of sample i.
embedding_set = np.concatenate(embedding_batches)
attr_set = np.concatenate(attr_batches)

(1059445, 200, 8)
(1059445, 1, 12)
0


In [None]:
def new_attr(mean, maxi, mini):
  """Min-max scale every entry of `mean` with the bounds (mini, maxi).

  Each entry v becomes (v - mini) / (maxi - mini), rounded to 2 decimals.
  Returns a list with one scaled value per entry of `mean`.
  """
  span = maxi - mini
  return [np.round((value - mini) / span, 2) for value in mean]

def generate_attr_dict(new_y, attr_values=None):
  """Build one attribute vector per class from per-sample attribute outputs.

  Arguments:
      - new_y: one-hot labels of shape (n, 1, num_classes)
      - attr_values: per-sample attribute vectors aligned with new_y; defaults to
        the notebook-level `attr_set` when omitted.
  Returns:
      (new_attr_lib, new_y_label, new_attr_set, attr_lib) where
      - attr_lib maps class index -> mean attribute vector of that class,
      - new_attr_lib maps class index -> that mean, min-max scaled with the
        global min/max over all class means and rounded to 2 decimals,
      - new_y_label is the list of integer class labels, one per sample,
      - new_attr_set is the list of scaled class vectors, one per sample.

  The number of classes is read from the last axis of new_y instead of being
  fixed to 12, and the per-class debug print has been dropped.
  """
  if attr_values is None:
    attr_values = attr_set
  attr_values = np.asarray(attr_values)
  new_y = np.asarray(new_y)

  num_classes = new_y.shape[-1]
  # Integer class of every sample (column of the hot entry).
  labels = np.argmax(new_y[:, 0, :], axis=1)

  # Mean attribute vector per class.
  attr_lib = dict()
  for key in range(num_classes):
    attr_lib[key] = np.mean(attr_values[labels == key], axis = 0)

  # Global bounds over all class means; every class is scaled with the same pair
  # so the scaled vectors stay comparable across classes.
  maxi = np.max([np.max(class_mean) for class_mean in attr_lib.values()])
  mini = np.min([np.min(class_mean) for class_mean in attr_lib.values()])

  new_attr_lib = dict()
  for key in attr_lib:
    new_attr_lib[key] = new_attr(attr_lib[key],maxi, mini)

  new_y_label = list(labels)
  new_attr_set = [new_attr_lib[k] for k in new_y_label]

  return new_attr_lib,new_y_label,new_attr_set, attr_lib


# Per-class attribute vectors for all samples in new_y, raw (attr_lib) and
# min-max scaled (new_attr_lib), plus the per-sample labels and scaled vectors.
new_attr_lib, new_y_label, new_attr_set, attr_lib = generate_attr_dict(new_y)


print(new_attr_lib)
print(attr_lib)

In [None]:
# New (remapped) class indices: generate_label_dict numbers the seen classes from 0
# and places the unseen classes directly after them.
# NOTE(review): hard-coded for 10 seen + 2 unseen classes and 3 attributes --
# confirm these still match the split and the attribute layer width.
seen_index = [0,1,2,3,4,5,6,7,8,9]
unseen_index = [10,11]
print(attr_lib)
seen_num = len(seen_index)
unseen_num = len(unseen_index)
attr_num = 3


def generate_npz_for_seen_and_unseen(seen_idx, unseen_idx, new_y_label, new_attr_set,embedding_set, NPZ_WINDOWS):
  """Split the samples into seen / unseen classes and save both groups as .npz archives.

  Arguments:
      - seen_idx / unseen_idx: class labels that count as seen / unseen. The split
        is driven by these arguments, not by notebook-level lists.
      - new_y_label: integer class label per sample
      - new_attr_set: attribute vector per sample (aligned with new_y_label)
      - embedding_set: feature vector per sample (aligned with new_y_label)
      - NPZ_WINDOWS: directory in which the two archives are written
  Returns:
      (seen_class_features, seen_class_label, seen_class_attr,
       unseen_class_features, unseen_class_label, unseen_class_attr,
       npz_file_seen, npz_file_unseen)
      The two returned paths are the names handed to savez_compressed, i.e.
      without extension; numpy appends '.npz' when writing.
  Samples whose label is in neither list are dropped.
  """
  seen_labels = set(seen_idx)
  unseen_labels = set(unseen_idx)

  seen_class_label = []
  seen_class_attr = []
  seen_class_features = []

  unseen_class_label = []
  unseen_class_attr  = []
  unseen_class_features  = []

  for i in range(len(new_y_label)):
    label = new_y_label[i]
    if label in seen_labels:
      seen_class_label.append(label)
      seen_class_attr.append(new_attr_set[i])
      seen_class_features.append(embedding_set[i])
    elif label in unseen_labels:
      unseen_class_label.append(label)
      unseen_class_attr.append(new_attr_set[i])
      unseen_class_features.append(embedding_set[i])

  print(Counter(seen_class_label))
  print(Counter(unseen_class_label))

  seen_num = len(seen_idx)
  unseen_num = len(unseen_idx)
  # Attribute width is read from the first seen sample; 0 when there is none.
  attr_num = len(seen_class_attr[0]) if seen_class_attr else 0
  print(seen_num, unseen_num, attr_num)


  npz_file_seen = "{}/Deft_biLSTM_rf_seen_data".format(NPZ_WINDOWS)
  savez_compressed(npz_file_seen, x = seen_class_features, y = seen_class_label, attribute = seen_class_attr)

  npz_file_unseen = "{}/Deft_biLSTM_rf_unseen_data".format(NPZ_WINDOWS)
  savez_compressed(npz_file_unseen, x = unseen_class_features, y = unseen_class_label, attribute = unseen_class_attr)


  return seen_class_features,seen_class_label, seen_class_attr, unseen_class_features, unseen_class_label, unseen_class_attr, npz_file_seen, npz_file_unseen


# Split features / attributes / labels into seen and unseen groups and write both archives under NPZ_WINDOWS.
seen_class_features,seen_class_label, seen_class_attr, unseen_class_features, unseen_class_label, unseen_class_attr, npz_file_seen, npz_file_unseen = generate_npz_for_seen_and_unseen(seen_index, unseen_index, new_y_label, new_attr_set,embedding_set,NPZ_WINDOWS)



{0: array([13.881974, 14.364426, 10.775362], dtype=float32), 1: array([19.818201 ,  1.6539674,  0.6376107], dtype=float32), 2: array([17.585768  ,  0.13264282, 14.816468  ], dtype=float32), 3: array([4.625749  , 0.09583263, 0.6082011 ], dtype=float32), 4: array([ 0.62221414,  6.69359   , 10.333091  ], dtype=float32), 5: array([ 0.9619117 ,  0.62450963, 11.146729  ], dtype=float32), 6: array([4.803446, 4.32539 , 4.328219], dtype=float32), 7: array([0.15496694, 3.4552805 , 0.09663309], dtype=float32), 8: array([ 4.9870687 , 18.414371  ,  0.24464454], dtype=float32), 9: array([12.740985  , 13.265031  ,  0.15779985], dtype=float32), 10: array([9.70206  , 8.3818865, 1.2241544], dtype=float32), 11: array([7.8620157, 8.399073 , 1.7808505], dtype=float32)}
Counter({2: 275855, 8: 197876, 5: 102906, 1: 67218, 4: 61729, 0: 36454, 6: 15738, 3: 14034, 7: 12270, 9: 12041})
Counter({10: 228604, 11: 34720})
10 2 3


In [None]:
# KNN ideas
# Balanced per-class samples of the 3-d attribute outputs for the clustering cells.
N_TOTAL_CLASSES = 12        # seen + unseen classes (new label range 0..11)
N_SEEN_CLASSES = 10         # new labels 0..9
SAMPLES_PER_CLASS = 10000   # per-class sample count for the clustering set
SEEN_SAMPLES_PER_CLASS = 2000

# Integer class label of every window in new_y (column of the hot entry).
new_y_train = np.argmax(new_y[:, 0, :], axis=1).tolist()


# Collect, per class, the first SAMPLES_PER_CLASS attribute vectors in data order.
# The bound is strict: `<=` let one extra vector (10001) into every bucket.
data_dict = {class_id: [] for class_id in range(N_TOTAL_CLASSES)}
for i in range(len(new_y_train)):
  bucket = data_dict[new_y_train[i]]
  if len(bucket) < SAMPLES_PER_CLASS:
    bucket.append(attr_set[i])

# Clustering set: exactly SAMPLES_PER_CLASS vectors for each of the 12 classes.
kmean_z_train = []
kmean_y_train = []
for class_id in range(N_TOTAL_CLASSES):
  class_samples = data_dict[class_id]
  if len(class_samples) < SAMPLES_PER_CLASS:
    raise ValueError('class {} has only {} samples, {} are required'.format(
        class_id, len(class_samples), SAMPLES_PER_CLASS))
  kmean_z_train.extend(class_samples)
  kmean_y_train.extend([class_id] * SAMPLES_PER_CLASS)


# Seen-only set: the first SEEN_SAMPLES_PER_CLASS vectors of each seen class.
seen_x = []
seen_y = []
for class_id in range(N_SEEN_CLASSES):
  seen_x.extend(data_dict[class_id][:SEEN_SAMPLES_PER_CLASS])
  seen_y.extend([class_id] * SEEN_SAMPLES_PER_CLASS)

In [None]:
# Convert the seen-only sample lists to arrays and show their shapes.
seen_x = np.array(seen_x)
seen_y = np.array(seen_y)
print(seen_x.shape)
print(seen_y.shape)

# Which class labels are present in the seen-only set.
print(np.unique(seen_y))

In [None]:
# Convert the balanced clustering set (all 12 classes) to arrays and show their shapes.
kmean_z_train = np.array(kmean_z_train)
kmean_y_train = np.array(kmean_y_train)
print(kmean_z_train.shape)
print(kmean_y_train.shape)

(120000, 3)
(120000,)


In [None]:
def data_split(x, y, ratio1, ratio2):
  """Shuffle x and y together and cut them into train / validation / test parts.

  Arguments:
      - x, y: samples and labels with the same length along the first axis
      - ratio1: share of the data used for training, e.g. 0.4
      - ratio2: share of the data used for validation, e.g. 0.4
        (the remaining 1 - ratio1 - ratio2 becomes the test part)
  Returns:
      (x_train, y_train, x_valid, y_valid, x_test, y_test)
  Raises:
      ValueError: if x and y differ in length.

  A single permutation drawn from numpy's global RNG is applied to both arrays,
  so sample/label pairs stay aligned. The inputs themselves are left untouched
  (the previous version shuffled the caller's arrays in place).
  """
  x = np.asarray(x)
  y = np.asarray(y)
  if len(x) != len(y):
    raise ValueError('x and y must have the same length, got {} and {}'.format(len(x), len(y)))

  order = np.random.permutation(len(x))
  x_shuffled = x[order]
  y_shuffled = y[order]

  train_length = int(len(x_shuffled)*ratio1)
  valid_length = int(len(x_shuffled)*(ratio1+ratio2))
  x_train = x_shuffled[:train_length]
  y_train = y_shuffled[:train_length]
  x_valid = x_shuffled[train_length:valid_length]
  y_valid = y_shuffled[train_length:valid_length]
  x_test = x_shuffled[valid_length:]
  y_test = y_shuffled[valid_length:]

  return (x_train, y_train, x_valid,y_valid, x_test, y_test)

In [None]:
# 40% train / 40% validation / remaining 20% test, for the seen-only set ...
seen_x_train, seen_y_train,seen_x_valid, seen_y_valid,  seen_x_test, seen_y_test = data_split(seen_x,seen_y,0.4,0.4)

# ... and for the balanced 12-class clustering set.
deft_x_train, deft_y_train, deft_x_valid, deft_y_valid, deft_x_test, deft_y_test = data_split(kmean_z_train,kmean_y_train,0.4,0.4)
print(deft_x_test.shape)
print(deft_x_valid.shape)

In [None]:
# Starting centre for each of the 12 classes: the mean of the first 2000
# attribute vectors stored for that class in data_dict.
T = {class_id: np.mean(data_dict[class_id][0:2000], axis = 0) for class_id in range(0, 12)}
# Centres stacked in class order, one row per class.
intial_center = np.array([T[class_id] for class_id in range(0, 12)])

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline

In [None]:
from sklearn.cluster import KMeans

N_CLUSTERS = 12
UNSEEN_LABELS = (10, 11)   # new class indices of the unseen classes

## initialize with a initial center
# With explicit starting centres only a single run is meaningful, so n_init is
# set to 1 explicitly instead of relying on the library to override its default.
Kmean = KMeans(n_clusters=N_CLUSTERS, init = intial_center, n_init=1)
Kmean.fit(deft_x_valid)
K_label_list = Kmean.labels_

# True labels of the samples assigned to each cluster. Every cluster id gets an
# entry up front so a cluster that received no sample cannot raise a KeyError.
cluster_dict = {cluster_id: [] for cluster_id in range(N_CLUSTERS)}
for i in range(len(K_label_list)):
  cluster_dict[K_label_list[i]].append(deft_y_valid[i])


# Per cluster: the majority true label and its share of the cluster (reported as
# "accuracy"). The overall figure is the sum of the majority counts over all
# samples; the "zsl" figure is the same ratio restricted to the clusters whose
# majority label is an unseen class.
total_correct = 0
total_data = 0
total_unseen_correct = 0
total_unseen_data = 0
for i in range(N_CLUSTERS):
  members = cluster_dict[i]
  if len(members) == 0:
    print('cluster: {} is empty, skipped'.format(i))
    continue

  unique, count = np.unique(members, return_counts = True)
  data_pair = dict(zip(unique, count))   # true label -> count inside this cluster
  max_count = np.max(count)
  rate = max_count / len(members)
  total_correct += max_count
  total_data += len(members)

  # On ties the last (largest) label with the maximal count wins.
  max_idx = [label for label in unique if data_pair[label] == max_count][-1]
  if max_idx in UNSEEN_LABELS:
    total_unseen_correct += max_count
    total_unseen_data += len(members)

  print('cluster: {}, label: {}, accuracy: {} '.format(i, max_idx, rate))
  print(data_pair)
  print(rate)

final_acc = total_correct/total_data
# No cluster may end up dominated by an unseen class; report NaN instead of
# failing with a division by zero in that case.
final_zsl_acc = total_unseen_correct/total_unseen_data if total_unseen_data else float('nan')
print('gzsl acc : ', final_acc)
print('zsl acc : ', final_zsl_acc)