In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import joblib  # For saving encoders and scalers
from sentence_transformers import SentenceTransformer


In [17]:
# 1. Load dataset and remove rows with '-' in the tag column
df = pd.read_csv("../../feature_extraction/figma_dataset.csv")
df = df[~df['tag'].str.contains('-')]

unique_tags = df['tag'].unique().tolist()

# Define tag replacement rules
txt_tags = ['B', 'CAPTION', 'EM', 'FIGCAPTION', 'I', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 
            'LABEL', 'LI', 'TIME', 'TD', 'TH', 'U', 'P',  'SPAN', 'A', 'TXT', 'SMALL', 'ADDRESS', 'STRONG',
            'SUMMARY', 'SUP']

div_tags = ['ARTICLE', 'FIGURE', 'FOOTER', 'HEADER', 'MAIN', 'NAV', 'OL', 'UL', 'FORM', 'DETAILS', 'SECTION']


button_tags = ['SELECT']

# Apply replacements
df['tag'] = df['tag'].apply(
    lambda x: 'TEXT' if x in txt_tags else 
              'DIV' if x in div_tags else 
              'BUTTON' if x in div_tags else 
              x
)

allowed_tags = ["DIV", "BUTTON", "INPUT"]
df = df[df["tag"].isin(allowed_tags)]
df = df.reset_index(drop=True)


columns_to_drop = [
    'type', 'num_children', 'parent_tag', 'parent_tag_html', 'prev_sibling_tag', 'parent_prev_sibling_tag', 
    'sibling_count', 'has_background_color', 'text_length', 'nearest_text_distance', 'nearest_image_distance']

# Safely drop columns that exist
for col in columns_to_drop:
    if col in df.columns:
        df = df.drop(columns=[col])

# Save the modified dataframe to a new Excel file
df.to_csv('cleaned_data.csv', index=False)

In [18]:

# 4. Generate embeddings

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  
text_contents = df['nearest_text_content'].fillna('').tolist()


print("Generating embeddings for nearest_text_content...")
text_embeddings = sentence_model.encode(text_contents, show_progress_bar=True)
print(f"Embedding shape: {text_embeddings.shape}")  # Should be (n_samples, embedding_dim)

# 5. Create column names for the embeddings
embedding_dim = text_embeddings.shape[1]
embedding_cols = [f'text_embedding_{i}' for i in range(embedding_dim)]

# 6. Add embeddings to the dataframe
embedding_df = pd.DataFrame(text_embeddings, columns=embedding_cols)
df_with_embeddings = pd.concat([df, embedding_df], axis=1)

# 7. Drop the original text content column since we now have embeddings
df_with_embeddings = df_with_embeddings.drop(columns=['nearest_text_content'])

# 8. Optionally, handle the nearest_image_size column
# If it's not numeric, you might want to process it separately or drop it
if 'nearest_image_size' in df_with_embeddings.columns and df_with_embeddings['nearest_image_size'].dtype == 'object':
    # If it contains dimensions like '324x240', extract features
    try:
        # Try to convert to numeric directly
        df_with_embeddings['nearest_image_size'] = pd.to_numeric(df_with_embeddings['nearest_image_size'])
    except:
        # If that fails, it might be in the format of dimensions, so drop it or process it
        df_with_embeddings = df_with_embeddings.drop(columns=['nearest_image_size'])

# 9. Save the dataframe with embeddings
df = df_with_embeddings

# 10. Now continue with model training as before

Generating embeddings for nearest_text_content...


Batches: 100%|██████████| 1920/1920 [00:23<00:00, 80.41it/s] 


Embedding shape: (61420, 384)


In [19]:
# 2. Separate features and target
y = df["tag"]
X = df.drop(columns=["tag"])

In [20]:
# 3. Identify categorical and continuous columns
categorical_cols = []
continuous_cols = [col for col in X.columns if col not in categorical_cols]

In [21]:
# Process categorical features with LabelEncoder
for col in categorical_cols:
    X[col] = X[col].astype(str)
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    # If you need to save individual encoders, consider saving them in a dictionary.

In [22]:
# Fill missing values in continuous columns and scale them
X[continuous_cols] = X[continuous_cols].fillna(0)
scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X[continuous_cols])
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [23]:
# Replace continuous columns in X with their scaled values
X_scaled = X.copy()
X_scaled[continuous_cols] = X_continuous_scaled

In [24]:
# 4. Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [25]:
# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [26]:
# Convert data to PyTorch tensors
# Note: X_train and X_test are DataFrames, so use .values to convert to NumPy arrays.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [27]:
# 7. Define the Neural Network Model with non-linear activations between linear layers
class TagClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(TagClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  # First hidden layer
        self.fc2 = nn.Linear(64, 128)           # Second hidden layer
        self.fc3 = nn.Linear(128, 256)            # Third hidden layer
        self.fc4 = nn.Linear(256, 512)            # Fourth hidden layer
        self.fc5 = nn.Linear(512, 512)            # Fifth hidden layer
        self.fc6 = nn.Linear(512, 512)            # Sixth hidden layer
        self.fc7 = nn.Linear(512, 256)            # Seventh hidden layer
        self.fc8 = nn.Linear(256, 128)           # Eighth hidden layer
        self.fc9 = nn.Linear(128, output_size)   # Output layer
        self.relu = nn.ReLU()                  # Non-linear activation

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.relu(self.fc5(x))
        x = self.relu(self.fc6(x))
        x = self.relu(self.fc7(x))
        x = self.relu(self.fc8(x))
        logits = self.fc9(x)  # No activation here: CrossEntropyLoss expects raw logits.
        return logits

In [28]:
# Initialize model
input_size = X_train_tensor.shape[1]
output_size = len(label_encoder.classes_)
model = TagClassifier(input_size, output_size)

In [29]:
# 8. Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Internally applies softmax on logits
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# 9. Training loop



num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [10/100], Loss: 0.1395
Epoch [20/100], Loss: 0.1375
Epoch [30/100], Loss: 0.1355
Epoch [40/100], Loss: 0.1336
Epoch [50/100], Loss: 0.1317
Epoch [60/100], Loss: 0.1296
Epoch [70/100], Loss: 0.1276
Epoch [80/100], Loss: 0.1254
Epoch [90/100], Loss: 0.1233
Epoch [100/100], Loss: 0.1211


: 

In [31]:
# Save the trained model
torch.save(model.state_dict(), "tag_classifier.pth")

In [32]:
# 10. Evaluation on the test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_pred = torch.argmax(outputs, dim=1).numpy()

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 0.9604


In [33]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            labels=np.unique(y_test),
                            target_names=label_encoder.inverse_transform(np.unique(y_test))))


Classification Report:
              precision    recall  f1-score   support

      BUTTON       0.00      0.00      0.00       358
         DIV       0.96      1.00      0.98     11798
       INPUT       0.00      0.00      0.00       128

    accuracy                           0.96     12284
   macro avg       0.32      0.33      0.33     12284
weighted avg       0.92      0.96      0.94     12284



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
