In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import joblib
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.amp import autocast, GradScaler
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Enable cuDNN benchmarking for faster training
torch.backends.cudnn.benchmark = True

In [3]:
# 1. Load dataset and remove rows with '-' in the tag column
df = pd.read_csv("figma_dataset.csv")

df = df[~df['tag'].str.contains(r'[-:]', regex=True)]
df = df[~df['tag'].str.contains(r'\b(CNX|ADDRESS|ASIDE|CANVAS|CITE|DD|DL|DT|ICON|S|VECTOR|DEL|LEGEND|BDI|LOGO|OBJECT|OPTGROUP|CENTER|CODE|BLOCKQUOTE|FRONT|Q|IFRAME|A|HR|SEARCH|DETAILS|FIELDSET|SLOT|SVG|AD|ADSLOT|AUDIO|BLINK|BOLD|COL|COLGROUP|COMMENTS|DATA|DIALOG|EMBED|EMPHASIS|FONT|H7|HGROUP|INS|INTERACTION|ITALIC|ITEMTEMPLATE|MARK|MATH|MENU|MI|MN|MO|MROW|MSUP|NOBR|OFFER|OPTION|PATH|PROGRESS|STRIKE|SWAL|TEXT|TFOOT|TITLE|TT|VAR|VEV|W|WBR|COUNTRY|ESI:INCLUDE|HTTPS:|LOGIN|NOCSRIPT|PERSONAL|STONG|CONTENT|DELIVERY|LEFT|MSUBSUP|KBD|ROOT|PARAGRAPH|BE|AI2SVELTEWRAP|BANNER|PHOTO1)\b', regex=True)]

# Define the regex pattern for matching
pattern = r'[-:]|\b(CNX|ADDRESS|ASIDE|CANVAS|CITE|DD|DL|DT|ICON|S|VECTOR|DEL|LEGEND|BDI|LOGO|OBJECT|OPTGROUP|CENTER|CODE|BLOCKQUOTE|FRONT|Q|IFRAME|SEARCH|DETAILS|FIELDSET|SLOT|AD|ADSLOT|AUDIO|BLINK|BOLD|COL|COLGROUP|COMMENTS|DATA|DIALOG|EMBED|EMPHASIS|FONT|H7|HGROUP|INS|INTERACTION|ITALIC|ITEMTEMPLATE|MARK|MATH|MENU|MI|MN|MO|MROW|MSUP|NOBR|OFFER|OPTION|PATH|PROGRESS|STRIKE|SWAL|TEXT|TFOOT|TITLE|TT|VAR|VEV|W|WBR|COUNTRY|ESI:INCLUDE|HTTPS:|LOGIN|NOCSRIPT|PERSONAL|STONG|CONTENT|DELIVERY|LEFT|MSUBSUP|KBD|ROOT|PARAGRAPH|BE|AI2SVELTEWRAP|BANNER|PHOTO1)\b'

# Apply the replacement conditionally
for col in ['prev_sibling_html_tag', 'child_1_html_tag', 'child_2_html_tag']:
    df[col] = np.where(df[col].str.contains(pattern, regex=True, na=False), 'DIV', df[col])

# Define mapping for tag replacements
tag_mapping = {
    "ARTICLE": "DIV", "DIV": "DIV", "FIGURE": "DIV", "FOOTER": "DIV", "HEADER": "DIV", "NAV": "DIV", "MAIN": "DIV",
    "BODY" : "DIV", "FORM" : "DIV", "OL" : "DIV", "UL" : "DIV", "TABLE": "DIV", "THEAD":"DIV" , "TBODY": "DIV", "SECTION" : "DIV",
    "H1": "P", "H2": "P", "H3": "P", "H4": "P", "H5": "P", "H6": "P","SUP": "P","SUB": "P", "BIG": "P",
    "P": "P", "CAPTION": "P", "FIGCAPTION": "P", "B": "P", "EM": "P", "I": "P", "TD": "P", "TH": "P", "TR": "P","PRE":"P",
    "U": "P", "TIME": "P", "TXT": "P", "ABBR": "P","SMALL": "P","STRONG": "P","SUMMARY": "P","SPAN": "P", "LABEL": "P","LI":"P",
    "PICTURE": "IMG" , "VIDEO": "IMG",
    "SELECT": "INPUT","TEXTAREA": "INPUT",
    "VECTOR": "SVG"
}

# df.loc[(df["tag"] == "LABEL") & ((df["type"] == "RECTANGLE") | (df["type"] == "GROUP")), "tag"] = "DIV"
df.loc[(df["tag"] == "SPAN") & ((df["type"] == "RECTANGLE") | (df["type"] == "GROUP")), "tag"] = "DIV"

# Replace any value in children tag columns that contains '-' with 'DIV'
children_cols = ['child_1_html_tag', 'child_2_html_tag']
for col in children_cols:
    df[col] = df[col].apply(lambda x: "DIV" if isinstance(x, str) and '-' in x else x)

# Convert tag and parent_tag_html columns to uppercase
df['tag'] = df['tag'].str.upper()
df['prev_sibling_html_tag'] = df['prev_sibling_html_tag'].str.upper()
df['child_1_html_tag'] = df['child_1_html_tag'].str.upper()
df['child_2_html_tag'] = df['child_2_html_tag'].str.upper()

# Apply mapping to 'tag' and 'parent_tag_html' columns
df['tag'] = df['tag'].replace(tag_mapping)
df['prev_sibling_html_tag'] = df['prev_sibling_html_tag'].replace(tag_mapping)
df['child_1_html_tag'] = df['child_1_html_tag'].replace(tag_mapping)
df['child_2_html_tag'] = df['child_2_html_tag'].replace(tag_mapping)



df = df[~df['tag'].str.contains(r'\b(P|IMG)\b', regex=True)]

# Print remaining unique tags
print(df['tag'].unique())

  df = df[~df['tag'].str.contains(r'\b(CNX|ADDRESS|ASIDE|CANVAS|CITE|DD|DL|DT|ICON|S|VECTOR|DEL|LEGEND|BDI|LOGO|OBJECT|OPTGROUP|CENTER|CODE|BLOCKQUOTE|FRONT|Q|IFRAME|A|HR|SEARCH|DETAILS|FIELDSET|SLOT|SVG|AD|ADSLOT|AUDIO|BLINK|BOLD|COL|COLGROUP|COMMENTS|DATA|DIALOG|EMBED|EMPHASIS|FONT|H7|HGROUP|INS|INTERACTION|ITALIC|ITEMTEMPLATE|MARK|MATH|MENU|MI|MN|MO|MROW|MSUP|NOBR|OFFER|OPTION|PATH|PROGRESS|STRIKE|SWAL|TEXT|TFOOT|TITLE|TT|VAR|VEV|W|WBR|COUNTRY|ESI:INCLUDE|HTTPS:|LOGIN|NOCSRIPT|PERSONAL|STONG|CONTENT|DELIVERY|LEFT|MSUBSUP|KBD|ROOT|PARAGRAPH|BE|AI2SVELTEWRAP|BANNER|PHOTO1)\b', regex=True)]
  df[col] = np.where(df[col].str.contains(pattern, regex=True, na=False), 'DIV', df[col])


['DIV' 'BUTTON' 'INPUT']


  df = df[~df['tag'].str.contains(r'\b(P|IMG)\b', regex=True)]


In [4]:
# 2. Separate features and target
y = df["tag"]
X = df.drop(columns=["tag"])

In [5]:
# 3. Identify categorical and continuous columns
categorical_cols = ['type','prev_sibling_html_tag','child_1_html_tag','child_2_html_tag']
continuous_cols = [col for col in X.columns if col not in categorical_cols]

In [6]:
# Process categorical features with OneHotEncoder instead of LabelEncoder
X[categorical_cols] = X[categorical_cols].astype(str).fillna('unknown')
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat_encoded = ohe.fit_transform(X[categorical_cols])
joblib.dump(ohe, "ohe_encoder.pkl")

['ohe_encoder.pkl']

In [7]:
# Better missing value handling with imputer
imputer = SimpleImputer(strategy='most_frequent')
X_continuous_imputed = imputer.fit_transform(X[continuous_cols])
joblib.dump(imputer, "imputer.pkl")

['imputer.pkl']

In [8]:
# Scale continuous features
scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X_continuous_imputed)
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [9]:
# Combine one-hot encoded categorical features with scaled continuous features
X_processed = np.concatenate([X_cat_encoded, X_continuous_scaled], axis=1)

In [10]:
# 4. Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [11]:
from collections import Counter

# Count occurrences of each class
class_counts = Counter(y_encoded)

# Find classes with only 1 sample
rare_classes = [cls for cls, count in class_counts.items() if count < 2]

# Duplicate rare class samples
for cls in rare_classes:
    idx = np.where(y_encoded == cls)[0][0]  # Get the index of the rare sample
    original_class_name = label_encoder.inverse_transform([cls])[0]  # Convert back to original label
    print(f"Duplicating class '{original_class_name}' (only 1 sample present).")

    X_processed = np.vstack([X_processed, X_processed[idx]])  # Duplicate features
    y_encoded = np.append(y_encoded, y_encoded[idx])  # Duplicate label

In [12]:
# 5. Train/test split - remove stratification if there are classes with too few samples
unique_counts = np.unique(y_encoded, return_counts=True)
min_samples = min(unique_counts[1])

if min_samples < 2:
    print(f"Warning: The least populated class has only {min_samples} sample(s). Removing stratification.")
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_encoded, test_size=0.2, random_state=42
    )
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

In [13]:
# Move GPU setup earlier
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [14]:
# Convert data to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [15]:
# Create dataset and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(
    train_dataset, 
    batch_size=512,  # Larger batch size
    shuffle=True, 
    num_workers=8,   # Parallel loading
    pin_memory=True,  # Faster data transfer to GPU
    prefetch_factor=2
)

In [16]:
# Compute class weights
print("Computing class weights...")
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

Computing class weights...


In [17]:
# 6. Define improved model architecture with proper input size
class ImprovedTagClassifier(nn.Module):
    def __init__(self, input_size, output_size, dropout_rate=0.05):
        super(ImprovedTagClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.dropout(self.relu(self.bn3(self.fc3(x))))
        logits = self.fc4(x)
        return logits

In [18]:
# Initialize model
print("Initializing model...")
input_size = X_train.shape[1]
output_size = len(label_encoder.classes_)
model = ImprovedTagClassifier(input_size, output_size).to(device)

Initializing model...


In [19]:
# 7. Define loss function and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-6)

In [20]:
# 8. Setup mixed precision training
scaler = GradScaler(device='cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
# 9. Training loop with timing and early stopping
print("Starting training...")
best_loss = float('inf')
patience = 10
counter = 0
early_stop = False
start_time = time.time()

num_epochs = 200
for epoch in range(num_epochs):
    epoch_start = time.time()
    model.train()
    epoch_loss = 0
    
    for batch_X, batch_y in train_loader:
        # Move batch to device
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        
        # Use mixed precision for faster training
        with torch.amp.autocast('cuda', enabled=device.type=='cuda'):
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
        
        # Scale gradients and optimize
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    scheduler.step()
    
    epoch_time = time.time() - epoch_start
    if (epoch + 1) % 5 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s")
    
    # Early stopping
    if avg_loss < best_loss:
        best_loss = avg_loss
        counter = 0
        torch.save(model.state_dict(), "best_tag_classifier.pth")
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            early_stop = True
    
    if early_stop:
        break

Starting training...
Epoch [5/200], Loss: 0.0861, Time: 5.03s
Epoch [10/200], Loss: 0.0644, Time: 5.30s
Epoch [15/200], Loss: 0.0574, Time: 6.75s
Epoch [20/200], Loss: 0.0483, Time: 4.90s
Epoch [25/200], Loss: 0.0419, Time: 6.85s
Epoch [30/200], Loss: 0.0343, Time: 4.91s
Epoch [35/200], Loss: 0.0316, Time: 4.82s
Epoch [40/200], Loss: 0.0239, Time: 4.91s
Epoch [45/200], Loss: 0.0207, Time: 6.01s
Epoch [50/200], Loss: 0.0218, Time: 4.83s
Epoch [55/200], Loss: 0.0198, Time: 6.13s
Epoch [60/200], Loss: 0.0235, Time: 5.37s
Epoch [65/200], Loss: 0.0256, Time: 5.21s
Early stopping at epoch 65


In [22]:
# Save the trained model
torch.save(model.state_dict(), "tag_classifier.pth")
total_time = time.time() - start_time
print(f"Total training time: {total_time:.2f} seconds")

Total training time: 339.48 seconds


In [23]:
# 10. Evaluation on the test set
print("Evaluating model...")
model.load_state_dict(torch.load("best_tag_classifier.pth"))  # Load best model
model.eval()

# Process test data in batches for memory efficiency
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=256)

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        _, predictions = torch.max(outputs, 1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

y_pred = np.array(all_predictions)
y_test_np = np.array(all_labels)

accuracy = accuracy_score(y_test_np, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

Evaluating model...


  model.load_state_dict(torch.load("best_tag_classifier.pth"))  # Load best model



Accuracy: 0.9935


In [24]:
print("\nClassification Report:")
print(classification_report(
    y_test_np, 
    y_pred,
    labels=np.unique(y_test_np),
    target_names=label_encoder.inverse_transform(np.unique(y_test_np))
))


Classification Report:
              precision    recall  f1-score   support

      BUTTON       0.84      0.98      0.90       569
         DIV       1.00      0.99      1.00     23019
       INPUT       0.77      0.97      0.86       148

    accuracy                           0.99     23736
   macro avg       0.87      0.98      0.92     23736
weighted avg       0.99      0.99      0.99     23736



In [25]:
# Print feature importances from the model
print("\nAnalyzing feature importance...")
with torch.no_grad():
    weights = model.fc1.weight.cpu().numpy()
    importance = np.abs(weights).mean(axis=0)
    
    # Get feature names (both categorical encoded and continuous)
    cat_feature_names = ohe.get_feature_names_out(categorical_cols)
    all_feature_names = np.concatenate([cat_feature_names, np.array(continuous_cols)])
    
    feature_importance = list(zip(all_feature_names, importance))
    feature_importance.sort(key=lambda x: x[1], reverse=True)
    
    print("\nTop 100 most important features:")
    for feature, imp in feature_importance[:100]:
        print(f"{feature}: {imp:.4f}")


Analyzing feature importance...

Top 100 most important features:
width: 0.1449
aspect_ratio: 0.1228
prev_sibling_html_tag_BUTTON: 0.1227
child_2_html_tag_SVG: 0.1197
child_1_html_tag_SVG: 0.1175
height: 0.1169
prev_sibling_html_tag_A: 0.1165
sibling_count: 0.1152
child_2_html_tag_INPUT: 0.1117
num_children_to_end: 0.1089
child_2_html_tag_P: 0.1083
type_ELLIPSE: 0.1080
child_1_html_tag_IMG: 0.1071
chars_count_to_end: 0.1069
prev_sibling_html_tag_INPUT: 0.1041
prev_sibling_html_tag_IMG: 0.1039
prev_sibling_html_tag_DIV: 0.1011
child_2_html_tag_DIV: 0.1009
child_1_html_tag_A: 0.1005
child_1_html_tag_INPUT: 0.1003
prev_sibling_html_tag_P: 0.1003
child_2_html_tag_A: 0.1000
child_1_html_tag_BUTTON: 0.0993
child_2_html_tag_nan: 0.0978
child_2_html_tag_IMG: 0.0968
child_1_html_tag_HR: 0.0956
child_1_html_tag_nan: 0.0934
child_1_html_tag_P: 0.0923
prev_sibling_html_tag_SVG: 0.0919
child_2_html_tag_BUTTON: 0.0911
child_1_html_tag_DIV: 0.0907
prev_sibling_html_tag_nan: 0.0867
type_FRAME: 0.0825