In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch import Tensor
from torchsummary import summary
import random
from numpy import load
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import pickle
import os

# Debugging aid: sets CUDA_LAUNCH_BLOCKING before any CUDA work is done in this notebook.
# NOTE(review): this is expected to make kernel launches synchronous and slow training down;
# confirm it is still wanted outside of debugging sessions.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Executes the companion notebook in this kernel. Presumably it defines NetFormer, NetFormerLM
# and NetformerTrainer, which are used below but not defined in this notebook -- verify.
%run './Attention_based_model.ipynb'


Tue Aug 20 19:44:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off | 00000000:01:00.0  On |                  N/A |
|  0%   46C    P2              57W / 320W |   8247MiB / 16376MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# `device` is reused further down when the models are moved with `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The GPU name is only looked up (and printed) when CUDA is available.
print("Using device: ", device, f"({torch.cuda.get_device_name(device)})" if torch.cuda.is_available() else "")

Using device:  cuda (NVIDIA GeForce RTX 4080)


In [3]:
def load_data_make_split(npz_file, train_percentage):
    """
    Load training data (windows + one-hot labels) from compressed file. Split data into train and test set.

    The split is positional (no shuffling): the first ``train_percentage`` of the
    samples form the train set and the remaining samples form the test set.

    Arguments:
        - npz_file: The path to the *.npz file; it must contain the arrays 'x' and 'y'
        - train_percentage: the fraction of data used for training (and not testing), e.g. 0.8
    Returns:
        A 4-tuple of train and test data with labels: (x_train, y_train, x_test, y_test)
    Raises:
        ValueError: if train_percentage is outside [0, 1] or 'x' and 'y' differ in length
    """
    if not 0 <= train_percentage <= 1:
        raise ValueError(
            "train_percentage must be within [0, 1], got {}".format(train_percentage))

    # The archive object keeps the underlying file open until it is closed, so read
    # both arrays inside a context manager and release the handle right away.
    with load(npz_file) as dict_data:
        x = dict_data['x']
        y = dict_data['y']

    if len(x) != len(y):
        raise ValueError(
            "'x' and 'y' must have the same length, got {} and {}".format(len(x), len(y)))

    train_length = int(len(x)*train_percentage)
    return (x[:train_length], y[:train_length], x[train_length:], y[train_length:])

# Load the full dataset and keep the first 80% of the samples for training.
# The split is positional: load_data_make_split() slices the arrays without shuffling.
x_train, y_train, x_test, y_test = load_data_make_split("update_new_feature_all_days_all_devices.npz", 0.8)
# Sanity-check the shapes of the four arrays returned by the split.
print("shape of train windws: {}".format(x_train.shape))
print("shape of train labels: {}".format(y_train.shape))
print("shape of test windows: {}".format(x_test.shape))
print("shape of test labels: {}".format(y_test.shape))

shape of train windws: (1071459, 200, 8)
shape of train labels: (1071459, 1, 28)
shape of test windows: (267865, 200, 8)
shape of test labels: (267865, 1, 28)


In [4]:
## check number of devices in a dataset
# Turn every one-hot label row (train and test together) into its class index in one
# vectorised pass instead of a per-row Python loop over the whole dataset.
hot_entries = np.concatenate([y_train[:, 0, :], y_test[:, 0, :]]) == True
# A row without any hot entry has no class index; keep failing loudly in that case.
if not hot_entries.any(axis=1).all():
  raise IndexError("found a label row without any hot entry")
# argmax on a boolean row returns the position of the first True value.
all_labels = hot_entries.argmax(axis=1)

unique, count = np.unique(all_labels, return_counts = True)

# select the devices data points > 10000 as the supervised pretraining data
data_pair = {label: n for label, n in zip(unique, count) if n > 10000}
useable_data = list(data_pair)
# Sort (class index, sample count) pairs by ascending sample count for display.
sorted_data_by_counts = sorted(data_pair.items(), key=lambda x:x[1])
print(sorted_data_by_counts)


# Lookup table from the original class index (position of the hot entry in the raw
# 28-wide one-hot labels) to a human-readable device name.
# NOTE(review): indices 21 and 24 both map to 'Android Phone'; presumably these are two
# distinct physical devices -- confirm against the dataset description.
all_device_names_dict = {0:'Smart Things', 1: 'Amazon Echo', 2:'Netatmo Welcome',3:'TP-Link Day Night Cloud camera', 4:'Samsung SmartCam', 5: 'Dropcam', 6: 'Withings Smart Baby Monitor', 7:'Belkin Wemo switch', 8:'TP-Link Smart plug',
                         9: 'iHome', 10:'Belkin wemo motion sensor', 11:'NEST Protect smoke alarm', 12:'Netatmo weather station',13:'Withings Smart scale',14:'Withings Aura smart sleep sensor',15:'Light Bulbs LiFX Smart Bulb',
                         16: 'Triby Speaker', 17:'PIX-STAR Photo-frame', 18 : 'HP Printer', 19: 'Samsung Galaxy Tab', 20: 'Nest Dropcam', 21:'Android Phone', 22:'Laptop', 23:'MacBook', 24:'Android Phone',
                         25: 'IPhone', 26:'MacBook/Iphone', 27:'Insteon Camera'}

[(24, 12041), (15, 12270), (6, 14034), (14, 15738), (2, 34720), (0, 36454), (27, 55804), (7, 61729), (1, 67218), (4, 68597), (10, 102906), (19, 111726), (22, 197876), (23, 228604), (5, 275855)]


In [5]:
# Original class indices (keys of all_device_names_dict) treated as "seen" devices.
seen_idx = [24,15,6,14,2]
# Original class indices treated as "unseen" devices.
unseen_idx = [0]

# Class counts; read as module-level globals by generate_label_dict() below.
num_unseen = len(unseen_idx)
num_seen = len(seen_idx)

# Build a one-hot row vector for a class index
def idxtoOneHot(idx, length):
  """Return a float array of shape (1, length) that is 1 at column `idx` and 0 elsewhere."""
  one_hot_row = np.zeros((1, length))
  one_hot_row[0, idx] = 1
  return one_hot_row

# Combine two dictionaries into a new one
def Merge(dict1, dict2):
  """Return a new dict holding the items of `dict1` updated with those of `dict2`.

  On duplicate keys the value from `dict2` wins; neither input is modified.
  """
  merged = dict(dict1)
  merged.update(dict2)
  return merged


# def dictionary for unseen, seen, and name
def generate_label_dict(unseen_idx, seed_idx, device_names=None):

  """
    Build the mappings between original class indices, new contiguous labels and names.

    Seen classes are numbered 0 .. len(seed_idx)-1 in the order given, unseen classes
    continue from len(seed_idx). For example with seed_idx=[24, 15] and unseen_idx=[0]
    the original index 24 becomes label 0, 15 becomes 1 and 0 becomes 2.

    The mappings are derived from the arguments only; the function no longer reads the
    module-level `seen_idx`, `num_seen` and `num_unseen` variables.

    Arguments:
        - unseen_idx: list of original class indices treated as unseen
        - seed_idx: list of original class indices treated as seen
        - device_names: optional dict {original index: device name}; defaults to the
          module-level `all_device_names_dict`
    Returns:
        A 4-tuple (unseen_dict, seen_dict, total_dict, all_index_to_name) where the
        first three map original index -> new label and the last maps
        new label -> device name.
    """
  if device_names is None:
    device_names = all_device_names_dict

  num_seen_classes = len(seed_idx)

  # original index -> new label
  seen_dict = {original: new for new, original in enumerate(seed_idx)}
  unseen_dict = {original: num_seen_classes + offset
                 for offset, original in enumerate(unseen_idx)}

  # new label -> device name
  seen_index_to_name = {new: device_names[original]
                        for new, original in enumerate(seed_idx)}
  unseen_index_to_name = {num_seen_classes + offset: device_names[original]
                          for offset, original in enumerate(unseen_idx)}

  # Unseen entries come first in the merged label dict, seen entries first in the name dict.
  total_dict = {**unseen_dict, **seen_dict}
  all_index_to_name = {**seen_index_to_name, **unseen_index_to_name}
  return unseen_dict, seen_dict, total_dict,all_index_to_name

## generate the training data
def _select_and_relabel(windows, onehot_labels, label_map):
  """
    Keep the samples whose class is a key of `label_map` and re-encode their labels.

    Arguments:
        - windows: array of samples, indexed along axis 0
        - onehot_labels: array of shape (n, 1, n_original_classes) with one-hot rows
        - label_map: dict {original class index: new class index}
    Returns:
        (kept_windows, kept_labels) where kept_labels has shape
        (n_kept, 1, len(label_map)) and is one-hot on the new class index.
    Raises:
        IndexError: if a label row contains no hot entry.
  """
  windows = np.asarray(windows)
  onehot_labels = np.asarray(onehot_labels)
  num_classes = len(label_map)
  if len(onehot_labels) == 0:
    return windows[:0], np.zeros((0, 1, num_classes))

  hot = onehot_labels[:, 0, :] == True
  if not hot.any(axis=1).all():
    raise IndexError("every label row needs at least one hot entry")
  # argmax on a boolean row gives the position of the first hot entry.
  original_idx = hot.argmax(axis=1)

  keep = np.isin(original_idx, list(label_map))
  new_idx = np.array([label_map[k] for k in original_idx[keep].tolist()], dtype=np.int64)

  relabeled = np.zeros((len(new_idx), 1, num_classes))
  relabeled[np.arange(len(new_idx)), 0, new_idx] = 1
  return windows[keep], relabeled


def seen_training_data(init_x_train, init_x_test, init_y_train, init_y_test, seen_dict):
  """
    Restrict the train and test split to the seen classes and re-encode the labels.

    The classes to keep are the keys of `seen_dict` (the module-level `seen_idx`
    list is no longer consulted); labels become one-hot rows of width len(seen_dict).

    Arguments:
        - init_x_train, init_x_test: sample arrays of the two splits
        - init_y_train, init_y_test: one-hot label arrays of shape (n, 1, n_original_classes)
        - seen_dict: dict {original class index: new class index}
    Returns:
        (x_train, y_train, x_test, y_test) for the seen classes only. When no sample
        matches, the arrays are empty along axis 0 but keep their trailing dimensions.
  """
  x_train_feature, y_train_feature = _select_and_relabel(init_x_train, init_y_train, seen_dict)
  x_test_feature, y_test_feature = _select_and_relabel(init_x_test, init_y_test, seen_dict)
  return x_train_feature, y_train_feature, x_test_feature, y_test_feature



## extract features for the final test with both seen and unseen data
def feature_extraction_data_seen_and_unseen(init_x_train, init_x_test, init_y_train, init_y_test, attr_dict):
  """
    Restrict the train and test split to the classes listed in `attr_dict`.

    The classes to keep are the keys of `attr_dict` instead of a hard-coded index
    list; labels become one-hot rows of width len(attr_dict).

    Arguments:
        - init_x_train, init_x_test: sample arrays of the two splits
        - init_y_train, init_y_test: one-hot label arrays of shape (n, 1, n_original_classes)
        - attr_dict: dict {original class index: new class index}
    Returns:
        (x_train, y_train, x_test, y_test) for the selected classes only.
  """
  x_train_attr, y_train_attr = _select_and_relabel(init_x_train, init_y_train, attr_dict)
  x_test_attr, y_test_attr = _select_and_relabel(init_x_test, init_y_test, attr_dict)
  return x_train_attr,  y_train_attr, x_test_attr, y_test_attr

In [6]:
# Build the original-index -> new-label mappings and the new-label -> name lookup.
unseen_dict, seen_dict,total_dict,all_index_to_name = generate_label_dict(unseen_idx, seen_idx)
# Subset restricted to the seen classes (labels re-encoded with seen_dict).
x_train_feature, y_train_feature, x_test_feature, y_test_feature = seen_training_data(x_train, x_test, y_train, y_test, seen_dict)
# Subset with seen and unseen classes together (labels re-encoded with total_dict).
x_train_attr,  y_train_attr, x_test_attr, y_test_attr = feature_extraction_data_seen_and_unseen(x_train, x_test, y_train, y_test, total_dict)

print('unseen dictionary', unseen_dict)
print('seen dictionary', seen_dict)
print('total dictionary', total_dict)
print('index to name dictionary', all_index_to_name)

# Shapes of the seen + unseen subset.
print("shape of train windws: {}".format(x_train_attr.shape))
print("shape of train labels: {}".format(y_train_attr.shape))
print("shape of test windows: {}".format(x_test_attr.shape))
print("shape of test labels: {}".format(y_test_attr.shape))

# Shapes of the seen-only subset.
print("shape of train windws: {}".format(x_train_feature.shape))
print("shape of train labels: {}".format(y_train_feature.shape))
print("shape of test windows: {}".format(x_test_feature.shape))
print("shape of test labels: {}".format(y_test_feature.shape))

unseen dictionary {0: 5}
seen dictionary {24: 0, 15: 1, 6: 2, 14: 3, 2: 4}
total dictionary {0: 5, 24: 0, 15: 1, 6: 2, 14: 3, 2: 4}
index to name dictionary {0: 'Android Phone', 1: 'Light Bulbs LiFX Smart Bulb', 2: 'Withings Smart Baby Monitor', 3: 'Withings Aura smart sleep sensor', 4: 'Netatmo Welcome', 5: 'Smart Things'}
shape of train windws: (100049, 200, 8)
shape of train labels: (100049, 1, 6)
shape of test windows: (25208, 200, 8)
shape of test labels: (25208, 1, 6)
shape of train windws: (70837, 200, 8)
shape of train labels: (70837, 1, 5)
shape of test windows: (17966, 200, 8)
shape of test labels: (17966, 1, 5)


In [7]:
# From here on x_train / y_train / x_test / y_test refer to the seen + unseen subset.
# NOTE(review): this rebinds the names that held the full splits, so re-running the
# earlier cells out of order would operate on the subset instead of the full data.
x_train, y_train, x_test, y_test = x_train_attr,  y_train_attr, x_test_attr, y_test_attr

In [8]:
# Convert every one-hot label into its integer class index: the position of the
# first hot entry in the single row of each label.
y_train_labels = [np.where(onehot[0] == True)[0][0] for onehot in y_train]
y_test_labels = [np.where(onehot[0] == True)[0][0] for onehot in y_test]

In [9]:
def process_dataset(data):
    """
    Process the dataset by applying the following rules:
    1. Cap values of features 6 and 7: anything above 1024 becomes 1025.
    2. Bin the continuous features 0 and 3 using 1026 equal-width intervals.
    3. Change any -1 values to 0 for all features.

    The input array is left untouched; all rules are applied to a copy, so calling
    the function again on the same raw array gives the same result.

    Notes:
    - The bin edges are derived from the min/max of the array passed in, so two
      different arrays (e.g. a train and a test split) get different edges.
    - Bin indices start at 0 and the maximum value of a feature gets index 1026,
      because it is not smaller than the last edge.
      NOTE(review): this yields 1027 distinct indices -- confirm that is intended.
    - The array keeps its dtype; binned values are whole numbers stored in that dtype.

    Parameters:
    data (numpy.ndarray): The input dataset with shape (n_samples, 200, 8).

    Returns:
    numpy.ndarray: The processed dataset (a new array).
    """
    num_bins = 1026
    cap_threshold, cap_value = 1024, 1025

    # Work on a private copy so the caller's array is never modified.
    processed = np.array(data, copy=True)

    # Rule 1: Cap values for features 6 and 7 (the slice is a view into `processed`).
    for feature in (6, 7):
        column = processed[:, :, feature]
        column[column > cap_threshold] = cap_value

    # Rule 2: Bin the continuous features at indices 0 and 3.
    for feature in (0, 3):
        column = processed[:, :, feature]
        # Equal-width edges spanning the observed range of this feature.
        edges = np.linspace(column.min(), column.max(), num_bins + 1)
        # digitize is 1-based for in-range values; shift to 0-based bin indices.
        column[...] = np.digitize(column, edges) - 1

    # Rule 3: Change any -1 values to 0 for all features
    processed[processed == -1] = 0

    return processed

# Example usage
# Assuming 'your_dataset' is a numpy array with shape (1071459, 200, 8)
# processed_data = process_dataset(your_dataset)



In [10]:
# Cap, bin and clean the training windows.
# NOTE(review): process_dataset() derives its bin edges from the min/max of the array
# it receives, so the train and test splits are binned with separate edges.
x_train = process_dataset(x_train)

In [11]:
# Cap, bin and clean the test windows.
# NOTE(review): the bin edges come from this split's own min/max, not from the
# training split -- confirm that the resulting bin indices are comparable.
x_test = process_dataset(x_test)

In [12]:
# Peek at the first two rows of the second processed training window.
x_train[1][0:2]

array([[ 55.,   6.,   1.,   0.,   1.,   0.,   0., 443.],
       [ 55.,   6.,   0.,   0.,   0.,   1., 443.,   0.]], dtype=float32)

In [13]:
def flatten_data_and_add_paddings(data, new_length=2000):
    """
    Flatten every sample to one row and right-pad the rows with zeros.

    Parameters:
    data (numpy.ndarray): Input of shape (n_samples, ...); all trailing dimensions
        are flattened, e.g. (n_samples, 200, 8) becomes (n_samples, 1600).
    new_length (int): Row length after padding. Defaults to 2000.

    Returns:
    numpy.ndarray: float64 array of shape (n_samples, new_length); the flattened
        sample occupies the leading columns and the remaining columns are 0.

    Raises:
    ValueError: If a flattened sample is longer than new_length.
    """
    data = np.asarray(data)
    num_samples = data.shape[0]
    # Computed explicitly (instead of reshape(..., -1)) so empty inputs work as well.
    flat_length = int(np.prod(data.shape[1:]))
    if flat_length > new_length:
        raise ValueError(
            "flattened sample length {} exceeds new_length {}".format(flat_length, new_length))

    flattened_data = data.reshape(num_samples, flat_length)
    # Zero-filled target; copy the flattened samples into its leading columns.
    padded_data = np.zeros((num_samples, new_length))
    padded_data[:, :flat_length] = flattened_data
    return padded_data

In [14]:
# Flatten each processed window to one row and zero-pad it to the fixed row length
# used by flatten_data_and_add_paddings().
x_new_test = flatten_data_and_add_paddings(x_test)
x_new_train = flatten_data_and_add_paddings(x_train)

In [15]:
# Turn the integer class labels into NumPy arrays (one entry per sample).
y_train_labels = np.array(y_train_labels)
y_test_labels = np.array(y_test_labels)

In [16]:
# List the distinct class labels present in the training split.
np.unique(y_train_labels)

array([0, 1, 2, 3, 4, 5])

In [17]:
# Shapes of the flattened, padded inputs and of the integer label vectors.
print("shape of train windws: {}".format(x_new_train.shape))
print("shape of train labels: {}".format(y_train_labels.shape))
print("shape of test windows: {}".format(x_new_test.shape))
print("shape of test labels: {}".format(y_test_labels.shape))

shape of train windws: (100049, 2000)
shape of train labels: (100049,)
shape of test windows: (25208, 2000)
shape of test labels: (25208,)


In [18]:
def generate_flow_packet_embedding(data):
    """Return a float array of ones shaped like `data` and print that shape."""
    target_shape = data.shape
    segment_ids = np.full(target_shape, 1.0)

    # Report the shape so the caller can verify it
    print("Generated array shape:", segment_ids.shape)
    return segment_ids

In [19]:
# Segment ids for the test inputs: all ones, one per token. The shape is taken from
# the padded test array instead of being hard-coded, so it follows the data.
x_test_segments = np.ones(x_new_test.shape)

In [20]:
# Segment ids for the training inputs: all ones, one per token. The shape is taken
# from the padded training array instead of being hard-coded, so it follows the data.
x_train_segments = np.ones(x_new_train.shape)

In [21]:
class NetformerDatasetDownstream(Dataset):
    """
    Dataset that serves one token sequence, its segment ids and its class label per item.

    Each item is a dict of float32 tensors with the keys 'netformer_input',
    'netformer_label', 'netformer_idx', 'segment_label' and 'sequence_label'.
    With the default mask_ratio of 0 no token is masked: 'netformer_input' is the
    stored sequence and 'netformer_label' / 'netformer_idx' are all zeros.
    """

    def __init__(self, input_sequences, input_labels, input_segments, seq_len = 2000, mask_ratio = 0):
        """
        Arguments:
            - input_sequences: indexable collection of token sequences
            - input_labels: indexable collection with one class label per sequence
            - input_segments: indexable collection with the segment ids per sequence
            - seq_len: stored on the instance; not used by the methods of this class
            - mask_ratio: probability for each token to be selected for masking
              (0 disables masking, which is the original behaviour)
        """
        self.seq_len = seq_len
        self.session_flows = len(input_sequences)
        self.sessions = input_sequences
        self.segments = input_segments
        self.labels = input_labels
        self.special_token_dict =  {'PAD': 0, 'MASK': 1028}
        self.mask_ratio = mask_ratio


    def __len__(self):
        """Number of sequences in the dataset."""
        return self.session_flows

    def __getitem__(self,item):
        """Return the item at position `item` as a dict of float32 tensors."""

        ## step 1 : fetch the stored sequence, its segment ids and its class label
        s1, seg1,seq_label = self.get_session_flow(item)

        ## step 2: optionally mask tokens of the sequence
        s1_random, s1_label, s1_idx = self.random_word(s1)

        output = {"netformer_input": s1_random,
                  "netformer_label": s1_label,
                  "netformer_idx": s1_idx,
                  "segment_label": seg1,
                  "sequence_label": seq_label}

        # Go through NumPy first: building a tensor straight from a long Python list
        # of scalars is much slower than converting an array. Values are unchanged.
        # NOTE(review): 'sequence_label' is also cast to float32 -- presumably the
        # trainer converts it to an integer type before computing the loss; verify.
        return {key: torch.tensor(np.asarray(value), dtype=torch.float32) for key, value in output.items()}


    def random_word(self, sentence):
        """
        Randomly mask tokens of `sentence`.

        Each token is selected with probability `mask_ratio`. A selected token is
        replaced by the MASK id in 80% of the cases, by a random token of the dataset
        in 10% and left unchanged in 10%.

        Returns:
            Three lists of equal length (output, output_label, output_idx):
            the possibly altered tokens, the original token at selected positions
            (0 elsewhere) and a 1/0 flag marking the selected positions.
        """
        # Fast path: nothing can be selected, so skip the per-token loop entirely.
        if self.mask_ratio <= 0:
            length = len(sentence)
            return list(sentence), [0] * length, [0] * length

        output_label = []
        output = []
        output_idx =[]

        for token in sentence:
            prob = random.random()

            if prob < self.mask_ratio:
                # Rescale to [0, 1) to choose between mask / random token / keep.
                prob /= self.mask_ratio

                if prob < 0.8:
                    output.append(self.special_token_dict['MASK'])
                elif prob < 0.9:
                    output.append(self.random_selection(self.sessions))
                else:
                    output.append(token)

                output_label.append(token)
                output_idx.append(1)

            else:
                output.append(token)
                output_label.append(0)
                output_idx.append(0)


        assert len(output) == len(output_label)
        return output, output_label, output_idx


    def random_selection(self, input_sequences):
        """Return one token drawn uniformly from a uniformly drawn sequence."""
        rand_session = random.randrange(len(input_sequences))
        rand_flow = random.randrange(len(input_sequences[rand_session]))
        return input_sequences[rand_session][rand_flow]


    def get_session_flow(self, item):
        '''Return session data, segments and label stored at position `item`'''
        return self.sessions[item], self.segments[item],self.labels[item]

In [22]:
# Training dataset: padded token rows, integer class labels and all-ones segment ids.
train_data = NetformerDatasetDownstream(x_new_train,y_train_labels,x_train_segments, seq_len=2000)

In [23]:
# Shuffled mini-batches of 4 samples.
# NOTE(review): a batch size of 4 gives ~25k iterations per epoch on this dataset;
# confirm the small batch is intentional.
train_loader = DataLoader(train_data, batch_size=4, shuffle=True, pin_memory=True)

In [24]:
# sample_data = next(iter(train_loader))

In [25]:
# PositionalEmbedding(10,2000)(sample_data).shape
# a = sample_data['netformer_input']
# b = sample_data['segment_label']
# print(a.shape)
# print(b.shape)
# # Convert b to data type torch.long
# b = b.to(torch.long)
# embeddings = NetformerEmbedding(1030, 10)(a, b)

In [None]:
# Build the encoder and its classification head, both placed on the detected device.
# NetFormer, NetFormerLM and NetformerTrainer are not defined in this notebook;
# presumably they come from the notebook executed with %run at the top -- verify.
# NOTE(review): heads equals d_model here -- confirm this head size is intended.
NetFormer_model = NetFormer(
  feature_size = 1030,
  d_model=10,
  n_layers=2,
  heads=10,
  dropout=0.1,
  number_of_class=6,
).to(device)

net_lm = NetFormerLM(NetFormer_model, number_of_class=6).to(device)

# Hand the trainer the same device the models were moved to, as a plain string
# ('cuda' or 'cpu'), instead of a hard-coded 'cuda' that fails on CPU-only machines.
# torch.device(...) accepts both a torch.device and a string such as 'cpu'.
net_trainer = NetformerTrainer(net_lm, train_loader, device=torch.device(device).type)
epochs = 10

for epoch in range(epochs):
  net_trainer.train(epoch)

Total Parameters: 24957


EP_train:0:   0%|| 3/25013 [00:00<51:58,  8.02it/s]  

{'epoch': 0, 'iter': 0, 'loss': 1.6681149005889893}


EP_train:0:  40%|| 10003/25013 [11:33<17:22, 14.40it/s]

{'epoch': 0, 'iter': 10000, 'loss': 0.001857154187746346}


EP_train:0:  50%|| 12545/25013 [14:29<14:21, 14.46it/s]