In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import joblib
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.amp import autocast, GradScaler
from torch.utils.data import TensorDataset, DataLoader
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Enable cuDNN benchmarking for faster training
torch.backends.cudnn.benchmark = True

In [3]:
# 1. Load dataset and remove rows with '-' in the tag column
df = pd.read_csv("../../feature_extraction/figma_dataset.csv")
df = df[~df['tag'].str.contains('-')]

unique_tags = df['tag'].unique().tolist()

# Define tag replacement rules
txt_tags = ['B', 'CAPTION', 'EM', 'FIGCAPTION', 'I', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 
            'LABEL', 'LI', 'TIME', 'TD', 'TH', 'U', 'P',  'SPAN', 'A', 'TXT', 'SMALL', 'ADDRESS', 'STRONG',
            'SUMMARY', 'SUP']

div_tags = ['ARTICLE', 'FIGURE', 'FOOTER', 'HEADER', 'MAIN', 'NAV', 'OL', 'UL', 'FORM', 'DETAILS', 'SECTION']


button_tags = ['SELECT']

# Apply replacements
df['tag'] = df['tag'].apply(
    lambda x: 'TEXT' if x in txt_tags else 
              'DIV' if x in div_tags else 
              'BUTTON' if x in div_tags else 
              x
)

allowed_tags = ["DIV", "BUTTON", "INPUT"]
df = df[df["tag"].isin(allowed_tags)]
df = df.reset_index(drop=True)


columns_to_drop = [
    'type', 'num_children', 'parent_tag', 'parent_tag_html', 'prev_sibling_tag', 'parent_prev_sibling_tag', 
    'sibling_count', 'has_background_color', 'text_length', 'nearest_text_distance', 'nearest_image_distance']

# Safely drop columns that exist
for col in columns_to_drop:
    if col in df.columns:
        df = df.drop(columns=[col])

# Save the modified dataframe to a new Excel file
df.to_csv('cleaned_data.csv', index=False)

In [4]:

# 4. Generate embeddings

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  
text_contents = df['nearest_text_content'].fillna('').tolist()


print("Generating embeddings for nearest_text_content...")
text_embeddings = sentence_model.encode(text_contents, show_progress_bar=True)
print(f"Embedding shape: {text_embeddings.shape}")  # Should be (n_samples, embedding_dim)

# 5. Create column names for the embeddings
embedding_dim = text_embeddings.shape[1]
embedding_cols = [f'text_embedding_{i}' for i in range(embedding_dim)]

# 6. Add embeddings to the dataframe
embedding_df = pd.DataFrame(text_embeddings, columns=embedding_cols)
df_with_embeddings = pd.concat([df, embedding_df], axis=1)

# 7. Drop the original text content column since we now have embeddings
df_with_embeddings = df_with_embeddings.drop(columns=['nearest_text_content'])

# 8. Optionally, handle the nearest_image_size column
# If it's not numeric, you might want to process it separately or drop it
if 'nearest_image_size' in df_with_embeddings.columns and df_with_embeddings['nearest_image_size'].dtype == 'object':
    # If it contains dimensions like '324x240', extract features
    try:
        # Try to convert to numeric directly
        df_with_embeddings['nearest_image_size'] = pd.to_numeric(df_with_embeddings['nearest_image_size'])
    except:
        # If that fails, it might be in the format of dimensions, so drop it or process it
        df_with_embeddings = df_with_embeddings.drop(columns=['nearest_image_size'])

# 9. Save the dataframe with embeddings
df = df_with_embeddings

# 10. Now continue with model training as before

Generating embeddings for nearest_text_content...


Batches: 100%|██████████| 1920/1920 [00:23<00:00, 82.81it/s] 


Embedding shape: (61420, 384)


In [5]:
# 2. Separate features and target
y = df["tag"]
X = df.drop(columns=["tag"])

In [6]:
# 3. Identify categorical and continuous columns
categorical_cols = []
continuous_cols = [col for col in X.columns if col not in categorical_cols]

In [7]:
# Process categorical features with OneHotEncoder instead of LabelEncoder
X[categorical_cols] = X[categorical_cols].astype(str).fillna('unknown')
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat_encoded = ohe.fit_transform(X[categorical_cols])
joblib.dump(ohe, "ohe_encoder.pkl")

['ohe_encoder.pkl']

In [8]:
# Better missing value handling with imputer
imputer = SimpleImputer(strategy='median')
X_continuous_imputed = imputer.fit_transform(X[continuous_cols])
joblib.dump(imputer, "imputer.pkl")

['imputer.pkl']

In [9]:
# Scale continuous features
scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X_continuous_imputed)
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [10]:
# Combine one-hot encoded categorical features with scaled continuous features
X_processed = np.concatenate([X_cat_encoded, X_continuous_scaled], axis=1)

In [11]:
# 4. Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [12]:
from collections import Counter

# Count occurrences of each class
class_counts = Counter(y_encoded)

# Find classes with only 1 sample
rare_classes = [cls for cls, count in class_counts.items() if count < 2]

# Duplicate rare class samples
for cls in rare_classes:
    idx = np.where(y_encoded == cls)[0][0]  # Get the index of the rare sample
    original_class_name = label_encoder.inverse_transform([cls])[0]  # Convert back to original label
    print(f"Duplicating class '{original_class_name}' (only 1 sample present).")

    X_processed = np.vstack([X_processed, X_processed[idx]])  # Duplicate features
    y_encoded = np.append(y_encoded, y_encoded[idx])  # Duplicate label

In [13]:
# 5. Train/test split - remove stratification if there are classes with too few samples
unique_counts = np.unique(y_encoded, return_counts=True)
min_samples = min(unique_counts[1])

if min_samples < 2:
    print(f"Warning: The least populated class has only {min_samples} sample(s). Removing stratification.")
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_encoded, test_size=0.2, random_state=42
    )
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

In [14]:
# Move GPU setup earlier
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [15]:
# Convert data to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [16]:
# Create dataset and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(
    train_dataset, 
    batch_size=256,  # Larger batch size
    shuffle=True, 
    num_workers=4,   # Parallel loading
    pin_memory=True  # Faster data transfer to GPU
)

In [17]:
# Compute class weights
print("Computing class weights...")
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

Computing class weights...


In [18]:
# 6. Define improved model architecture with proper input size
class ImprovedTagClassifier(nn.Module):
    def __init__(self, input_size, output_size, dropout_rate=0.3):
        super(ImprovedTagClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.dropout(self.relu(self.bn3(self.fc3(x))))
        logits = self.fc4(x)
        return logits

In [19]:
# Initialize model
print("Initializing model...")
input_size = X_train.shape[1]
output_size = len(label_encoder.classes_)
model = ImprovedTagClassifier(input_size, output_size).to(device)

Initializing model...


In [20]:
# 7. Define loss function and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

In [21]:
# 8. Setup mixed precision training
scaler = GradScaler(device='cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# 9. Training loop with timing and early stopping
print("Starting training...")
best_loss = float('inf')
patience = 10
counter = 0
early_stop = False
start_time = time.time()

num_epochs = 200
for epoch in range(num_epochs):
    epoch_start = time.time()
    model.train()
    epoch_loss = 0
    
    for batch_X, batch_y in train_loader:
        # Move batch to device
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        
        # Use mixed precision for faster training
        with torch.amp.autocast('cuda', enabled=device.type=='cuda'):
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
        
        # Scale gradients and optimize
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    scheduler.step(avg_loss)
    
    epoch_time = time.time() - epoch_start
    if (epoch + 1) % 5 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s")
    
    # Early stopping
    if avg_loss < best_loss:
        best_loss = avg_loss
        counter = 0
        torch.save(model.state_dict(), "best_tag_classifier.pth")
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            early_stop = True
    
    if early_stop:
        break

Starting training...
Epoch [5/200], Loss: 0.2753, Time: 6.59s
Epoch [10/200], Loss: 0.2147, Time: 5.70s
Epoch [15/200], Loss: 0.1849, Time: 5.73s
Epoch [20/200], Loss: 0.1714, Time: 7.34s
Epoch [25/200], Loss: 0.1581, Time: 6.30s
Epoch [30/200], Loss: 0.1434, Time: 6.02s
Epoch [35/200], Loss: 0.1414, Time: 6.10s
Epoch [40/200], Loss: 0.1365, Time: 6.47s
Epoch [45/200], Loss: 0.1132, Time: 7.43s
Epoch [50/200], Loss: 0.1134, Time: 7.42s
Epoch [55/200], Loss: 0.1020, Time: 7.17s
Epoch [60/200], Loss: 0.1086, Time: 5.97s
Epoch [65/200], Loss: 0.1011, Time: 6.20s
Epoch [70/200], Loss: 0.0992, Time: 6.67s
Epoch [75/200], Loss: 0.0933, Time: 6.75s
Epoch [80/200], Loss: 0.0933, Time: 5.82s
Epoch [85/200], Loss: 0.0853, Time: 7.24s
Epoch [90/200], Loss: 0.0921, Time: 5.59s
Epoch [95/200], Loss: 0.0786, Time: 5.20s
Epoch [100/200], Loss: 0.0760, Time: 5.20s
Epoch [105/200], Loss: 0.0714, Time: 5.63s
Epoch [110/200], Loss: 0.0737, Time: 5.33s
Epoch [115/200], Loss: 0.0715, Time: 5.63s
Epoch [120

In [23]:
# Save the trained model
torch.save(model.state_dict(), "tag_classifier.pth")
total_time = time.time() - start_time
print(f"Total training time: {total_time:.2f} seconds")

Total training time: 890.18 seconds


In [24]:
# 10. Evaluation on the test set
print("Evaluating model...")
model.load_state_dict(torch.load("best_tag_classifier.pth"))  # Load best model
model.eval()

# Process test data in batches for memory efficiency
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=256)

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        _, predictions = torch.max(outputs, 1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

y_pred = np.array(all_predictions)
y_test_np = np.array(all_labels)

accuracy = accuracy_score(y_test_np, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

Evaluating model...

Accuracy: 0.9399


In [25]:
print("\nClassification Report:")
print(classification_report(
    y_test_np, 
    y_pred,
    labels=np.unique(y_test_np),
    target_names=label_encoder.inverse_transform(np.unique(y_test_np))
))


Classification Report:
              precision    recall  f1-score   support

      BUTTON       0.26      0.56      0.35       336
         DIV       0.99      0.95      0.97     11831
       INPUT       0.74      0.89      0.81       117

    accuracy                           0.94     12284
   macro avg       0.66      0.80      0.71     12284
weighted avg       0.96      0.94      0.95     12284



In [None]:
# Print feature importances from the model
print("\nAnalyzing feature importance...")
with torch.no_grad():
    weights = model.fc1.weight.cpu().numpy()
    importance = np.abs(weights).mean(axis=0)
    
    # Get feature names (both categorical encoded and continuous)
    cat_feature_names = ohe.get_feature_names_out(categorical_cols)
    all_feature_names = np.concatenate([cat_feature_names, np.array(continuous_cols)])
    
    feature_importance = list(zip(all_feature_names, importance))
    feature_importance.sort(key=lambda x: x[1], reverse=True)
    
    print("\nTop 30 most important features:")
    for feature, imp in feature_importance[:30]:
        print(f"{feature}: {imp:.4f}")


Analyzing feature importance...

Top 30 most important features:
width: 0.4580
depth: 0.3321
is_leaf: 0.2601
aspect_ratio: 0.2353
distinct_background: 0.2286
nearest_image_size: 0.1840
total_descendants: 0.1440
total_text_nodes: 0.1341
total_text_length: 0.0990
text_embedding_319: 0.0886
text_embedding_376: 0.0779
text_embedding_145: 0.0774
text_embedding_213: 0.0768
text_embedding_290: 0.0768
text_embedding_0: 0.0767
text_embedding_82: 0.0765
text_embedding_222: 0.0761
text_embedding_94: 0.0758
text_embedding_226: 0.0754
text_embedding_199: 0.0753
text_embedding_128: 0.0753
text_embedding_190: 0.0751
text_embedding_32: 0.0748
text_embedding_101: 0.0745
text_embedding_291: 0.0744
text_embedding_184: 0.0743
text_embedding_52: 0.0741
text_embedding_312: 0.0740
text_embedding_20: 0.0740
text_embedding_162: 0.0740


: 