In [20]:
import re
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import json
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report



In [21]:
def preprocess_label(label: str) -> list[str]:
	"""Normalize and split a JSON label into hierarchical components."""
	
	label = label.lower()
	label = re.sub(r'[\./_\-]+', '.', label)
	label = re.sub(r'[^a-z0-9\.]', '', label)
	components = [comp for comp in label.split('.') if comp]
	
	return components

In [22]:
print(preprocess_label("Device..Battery_Voltage-Max"))
print(preprocess_label("user/id"))
print(preprocess_label("  "))

['device', 'battery', 'voltage', 'max']
['user', 'id']
[]


In [23]:
class CharEmbeddingLayer(nn.Module):
	"""Converts characters to embeddings."""
	
	def __init__(self, vocab_size=128, embed_dim=400):
		nn.Module.__init__(self)
		self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)  # vocab size 128 covers ASCII + padding
	
	def forward(self, chars: torch.Tensor):
		return self.embed(chars)  # [batch, seq_len, embed_dim]


In [24]:
class BilstmCnn(nn.Module):
	def __init__(self, embed_dim=400, lstm_hidden=256, cnn_out=256, num_classes=3):
		super().__init__()
		self.char_embed = nn.Embedding(128, embed_dim, padding_idx=0)
		
		self.lstm1 = nn.LSTM(embed_dim, lstm_hidden, bidirectional=True, batch_first=True)
		self.lstm2 = nn.LSTM(lstm_hidden * 2, lstm_hidden, bidirectional=True, batch_first=True)
		self.conv = nn.Conv1d(lstm_hidden * 4, cnn_out, kernel_size=3, padding=1)
		
		self.attention_proj = nn.Linear(cnn_out, cnn_out)
		
		self.fc = nn.Linear(cnn_out, num_classes)
	
	def forward(self, x):
		if x.dim() == 2:
			x = x.unsqueeze(1)
		
		batch_size, num_components, seq_len = x.shape
		
		component_features = []
		for i in range(num_components):
			comp = x[:, i, :]  # [batch, seq_len]
			
			emb = self.char_embed(comp)
			
			h1, _ = self.lstm1(emb)
			h2, _ = self.lstm2(h1)
			h = torch.cat([h1, h2], dim=-1)
			
			h = h.permute(0, 2, 1)
			cnn_out = torch.relu(self.conv(h))
			pooled = torch.max(cnn_out, dim=2)[0]  # [batch, cnn_out]
			
			component_features.append(pooled)
		
		components = torch.stack(component_features, dim=1)
		
		# Attention mechanism
		attn_scores = self.attention_proj(components).squeeze(-1)  # [batch, num_components]
		attn_weights = torch.softmax(attn_scores, dim=1)  # Normalized weights
		aggregated = (components * attn_weights.unsqueeze(-1)).sum(dim=1)  # [batch, cnn_out]
		
		# aggregated = torch.mean(components, dim=1)  # [batch, cnn_out]
		
		return self.fc(aggregated)

In [25]:
model = BilstmCnn()
char_to_idx = {chr(i): i + 2 for i in range(128)}
input_tensor = torch.tensor([[char_to_idx[c] for c in "battery"[:20]]])  # [1, 20]
output = model(input_tensor)

In [26]:
input_tensor

tensor([[100,  99, 118, 118, 103, 116, 123]])

In [27]:
output

tensor([[[ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0.0751, -0.0865,  0.0432],
         [ 0

In [28]:
def label_to_embeddings(label: str, model: BilstmCnn, char_to_idx: dict, max_len=20):
	"""Convert a label to hierarchical embeddings."""
	
	components = preprocess_label(label)
	if not components:
		raise ValueError("Invalid label: empty after preprocessing")
	
	char_indices = []
	for comp in components:
		chars = list(comp)[:max_len]
		indices = [char_to_idx.get(c, 1) for c in chars]  # 1 = unknown token
		indices += [0] * (max_len - len(indices))
		char_indices.append(indices)
	
	char_tensor = torch.tensor(char_indices, dtype=torch.long)
	
	with torch.no_grad():
		component_embeddings = model(char_tensor)
	
	return {
		"components": components,
		"embeddings": component_embeddings,
		"leaf_embedding": component_embeddings[-1]
	}


model = BilstmCnn(embed_dim=400, lstm_hidden=256, cnn_out=256)

char_to_idx = {chr(i): i + 2 for i in range(128)}
char_to_idx.update({'.': 130, '_': 131, '-': 132})

label_embedding = label_to_embeddings(
	label="Device.Battery_Voltage",
	model=model,
	char_to_idx=char_to_idx
)

print(label_embedding["leaf_embedding"].shape)

torch.Size([256, 3])


In [29]:
def match_labels(label1: str, label2: str, model: BilstmCnn, char_to_idx: dict):
	"""Compare two labels hierarchically."""
	
	emb1 = label_to_embeddings(label1, model, char_to_idx)
	emb2 = label_to_embeddings(label2, model, char_to_idx)
	
	# 1. cmp leaf nodes (primary signal)
	leaf_sim = 1 - cosine_similarity(
		emb1["leaf_embedding"].numpy().reshape(1, -1),
		emb2["leaf_embedding"].numpy().reshape(1, -1)
	)[0][0]
	
	# 2. cmp parent nodes (context)
	parent_sim = 0.0
	min_depth = min(len(emb1["components"]), len(emb2["components"]))
	for i in range(min_depth - 1):  # Exclude leaf
		parent_sim += 1 - cosine_similarity(
			emb1["embeddings"][i].numpy().reshape(1, -1),
			emb2["embeddings"][i].numpy().reshape(1, -1)
		)[0][0]
	
	if min_depth > 1:
		parent_sim /= (min_depth - 1)
	
	return 0.7 * leaf_sim + 0.3 * parent_sim  # tune weights

In [30]:
class LabelDataset(Dataset):
	def __init__(self, json_path, char_to_idx, max_len=20):
		with open(json_path) as f:
			data = json.load(f)
		
		self.samples = []
		
		self.groups = {}
		self.char_to_idx = char_to_idx
		self.max_len = max_len
		
		self.label_to_group = {}
		group_ids = set()
		
		for item in data:
			if 'groups' in item:
				for group in item['groups']:
					group_id = group['id']
					group_ids.add(group_id)
					self.groups[group_id] = group['name']
			
			if 'labels' in item:
				for label in item['labels']:
					self.label_to_group[label['id']] = label['group_id']
					for sample in label['samples']:
						self.samples.append({
							'text': sample,
							'label_id': label['id'],
							'group_id': label['group_id']
						})
		
		self.group_to_idx = {gid: idx for idx, gid in enumerate(group_ids)}
	
	def __len__(self):
		return len(self.samples)
	
	def __getitem__(self, idx):
		sample = self.samples[idx]
		components = preprocess_label(sample['text'])
		
		char_indices = []
		for comp in components:
			chars = list(comp)[:self.max_len]
			indices = [self.char_to_idx.get(c, 1) for c in chars]  # 1=UNK
			indices += [0] * (self.max_len - len(indices))  # 0=PAD
			char_indices.append(indices)
		
		max_components = max(len(preprocess_label(s['text'])) for s in self.samples)
		while len(char_indices) < max_components:
			char_indices.append([0] * self.max_len)
		
		return {
			'char_indices': torch.tensor(char_indices, dtype=torch.long),
			'label_id': torch.tensor(sample['label_id'], dtype=torch.long),
			'group_idx': torch.tensor(self.group_to_idx[sample['group_id']], dtype=torch.long),
			'text': sample['text']
		}

In [31]:
def collate_fn(batch):
	return {
		'char_indices': torch.stack([item['char_indices'] for item in batch]),
		'label_ids': torch.stack([item['label_id'] for item in batch]),
		'group_indices': torch.stack([item['group_idx'] for item in batch]),
		'texts': [item['text'] for item in batch]
	}

In [32]:
dataset = LabelDataset('labels.json', char_to_idx)
train_loader = DataLoader(
	LabelDataset('labels.json', char_to_idx),
	batch_size=32,
	shuffle=True,
	collate_fn=collate_fn
)

In [33]:
num_classes = len(set(dataset.group_to_idx.values()))
print(f"Number of classes: {num_classes}")

Number of classes: 43


In [34]:
# init model and character mapping
model = BilstmCnn(embed_dim=400, lstm_hidden=256, cnn_out=256, num_classes=num_classes)
char_to_idx = {chr(i): i + 2 for i in range(128)}
char_to_idx.update({'.': 130, '_': 131, '-': 132})

similarity = match_labels(
	label1="sensor.voltage",
	label2="device.batt_v",
	model=model,
	char_to_idx=char_to_idx
)

print(f"Similarity: {similarity:.4f}")

Similarity: 0.0831


### Training Setup

In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

### Training Loop

In [36]:
for epoch in range(15):
	model.train()
	total_loss = 0
	
	for batch in train_loader:
		inputs = batch['char_indices'].to(device)  # [batch, num_components, seq_len]
		labels = batch['group_indices'].to(device)
		
		optimizer.zero_grad()
		outputs = model(inputs)
		loss = criterion(outputs, labels)
		loss.backward()
		optimizer.step()
		
		total_loss += loss.item()
	
	print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}')


RuntimeError: The size of tensor a (4) must match the size of tensor b (256) at non-singleton dimension 2

In [18]:
model.eval()
all_preds = []
all_labels = []
all_texts = []

with torch.no_grad():
	for batch in train_loader:
		inputs = batch['char_indices'].to(device)
		labels = batch['group_indices'].cpu().numpy()
		texts = batch['texts']
		
		outputs = model(inputs)
		preds = torch.argmax(outputs, dim=1).cpu().numpy()
		
		all_preds.extend(preds)
		all_labels.extend(labels)
		all_texts.extend(texts)

unique_labels = sorted(set(all_labels))
num_classes_eval = len(unique_labels)
target_names = [f"Group {i}" for i in unique_labels]

print(classification_report(all_labels, all_preds, target_names=target_names))

print("\nSample predictions:")
for text, pred, true in zip(all_texts[:5], all_preds[:5], all_labels[:5]):
	print(f"Text: {text:<30} | Predicted: {pred} | True: {true}")

              precision    recall  f1-score   support

     Group 0       1.00      1.00      1.00        27
     Group 1       1.00      1.00      1.00        59
     Group 2       1.00      1.00      1.00         9
     Group 3       1.00      1.00      1.00        13
     Group 4       1.00      1.00      1.00        10
     Group 5       1.00      1.00      1.00         9
     Group 6       1.00      1.00      1.00         3
     Group 7       1.00      1.00      1.00         5
     Group 8       1.00      0.83      0.91         6
     Group 9       1.00      1.00      1.00         1
    Group 10       1.00      1.00      1.00        10
    Group 11       1.00      1.00      1.00        27
    Group 12       1.00      1.00      1.00         6
    Group 13       1.00      1.00      1.00         6
    Group 14       1.00      1.00      1.00         2
    Group 15       1.00      1.00      1.00         1
    Group 16       1.00      1.00      1.00         5
    Group 17       1.00    

In [19]:
def classify_json_labels(json_data, model, char_to_idx):
	# extract all potential labels from json
	labels = set()
	
	def extract_strings(data):
		if isinstance(data, dict):
			for key, value in data.items():
				labels.add(key)
				extract_strings(value)
		elif isinstance(data, list):
			for item in data:
				extract_strings(item)
		elif isinstance(data, str):
			labels.add(data)
	
	extract_strings(json_data)
	
	results = []
	for label in labels:
		try:
			if not isinstance(label, str) or not label.strip():
				continue
			
			with torch.no_grad():
				# label to proper input format
				components = preprocess_label(label)
				if not components:
					continue
				
				# character indices tensor with correct type
				char_indices = []
				for comp in components:
					chars = list(comp)[:20]  # max_len=20
					indices = [char_to_idx.get(c, 1) for c in chars] + [0] * (20 - len(chars))
					char_indices.append(indices)
				
				inputs = torch.tensor(char_indices, dtype=torch.long).unsqueeze(0)  # add batch dim
				inputs = inputs.to(device)
				
				outputs = model(inputs)
				pred = torch.argmax(outputs, dim=1).item()
			
			results.append({
				'original_label': label,
				'predicted_group': pred,
				'components': components
			})
		except Exception as e:
			print(f"Error processing label '{label}': {str(e)}")
	
	return results


with open('adeunis--arf8123aa.json') as f:
	json_data = json.load(f)

classification_results = classify_json_labels(json_data, model, char_to_idx)

print("Label Classification Results:")
print("-" * 50)
for result in classification_results:
	print(f"Original: {result['original_label']}")
	print(f"Components: {result['components']}")
	print(f"Predicted Group: {result['predicted_group']} => {dataset.groups[result['predicted_group']]}")
	print("-" * 50)

Label Classification Results:
--------------------------------------------------
Original: lora
Components: ['lora']
Predicted Group: 25 => LoRaWAN
--------------------------------------------------
Original: EU868
Components: ['eu868']
Predicted Group: 11 => humidity_status
--------------------------------------------------
Original: East
Components: ['east']
Predicted Group: 19 => location
--------------------------------------------------
Original: sats
Components: ['sats']
Predicted Group: 19 => location
--------------------------------------------------
Original: location
Components: ['location']
Predicted Group: 19 => location
--------------------------------------------------
Original: deviceProfileId
Components: ['deviceprofileid']
Predicted Group: 27 => modification
--------------------------------------------------
Original: 1886c85a-96bb-4a7f-948d-a6fe8fc9ec1a
Components: ['1886c85a', '96bb', '4a7f', '948d', 'a6fe8fc9ec1a']
Predicted Group: 14 => pm2.5
----------------------