In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import joblib  # For saving encoders and scalers

In [2]:
# 1. Load dataset and remove rows with '-' in the tag column
df = pd.read_csv("figma_full_dataset.csv")
df = df[~df['tag'].str.contains('-')]

In [3]:
# 2. Separate features and target
y = df["tag"]
X = df.drop(columns=["tag"])

In [4]:
# 3. Identify categorical and continuous columns
categorical_cols = ['type','parent_tag','parent_tag_html']
continuous_cols = [col for col in X.columns if col not in categorical_cols]

In [5]:
# Process categorical features with LabelEncoder
for col in categorical_cols:
    X[col] = X[col].astype(str)
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    # If you need to save individual encoders, consider saving them in a dictionary.

In [6]:
# Fill missing values in continuous columns and scale them
X[continuous_cols] = X[continuous_cols].fillna(0)
scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X[continuous_cols])
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [7]:
# Replace continuous columns in X with their scaled values
X_scaled = X.copy()
X_scaled[continuous_cols] = X_continuous_scaled

In [8]:
# 4. Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [9]:
# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [10]:
# Convert data to PyTorch tensors
# Note: X_train and X_test are DataFrames, so use .values to convert to NumPy arrays.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [11]:
class TagClassifier(nn.Module):
    def __init__(self, input_size, output_size, dropout_rate=0.2):
        super(TagClassifier, self).__init__()
        # Encoder path
        self.fc1 = nn.Linear(input_size, 64)    # First hidden layer
        self.fc2 = nn.Linear(64, 128)           # Second hidden layer
        self.fc3 = nn.Linear(128, 256)          # Third hidden layer
        self.fc4 = nn.Linear(256, 512)          # Fourth hidden layer
        
        # Bottleneck
        self.fc5 = nn.Linear(512, 512)          # Fifth hidden layer
        self.fc6 = nn.Linear(512, 512)          # Sixth hidden layer
        
        # Decoder path
        self.fc7 = nn.Linear(512, 256)          # Seventh hidden layer
        self.fc8 = nn.Linear(256, 128)          # Eighth hidden layer
        self.fc9 = nn.Linear(128, output_size)  # Output layer
        
        self.relu = nn.ReLU()                   # Non-linear activation
        self.dropout = nn.Dropout(dropout_rate) # Dropout layer
        self.dropout_heavy = nn.Dropout(0.5)    # Heavier dropout for bottleneck

    def forward(self, x):
        # Encoder path with light dropout
        x = self.relu(self.fc1(x))
        
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        
        x = self.relu(self.fc4(x))
        x = self.dropout(x)
        
        # Bottleneck with heavier dropout
        x = self.relu(self.fc5(x))
        x = self.dropout_heavy(x)
        
        x = self.relu(self.fc6(x))
        x = self.dropout_heavy(x)
        
        # Decoder path with decreasing dropout
        x = self.relu(self.fc7(x))
        x = self.dropout(x)
        
        x = self.relu(self.fc8(x))
        
        logits = self.fc9(x)  # No activation or dropout here
        return logits

In [12]:
# Initialize model
input_size = X_train_tensor.shape[1]
output_size = len(label_encoder.classes_)
model = TagClassifier(input_size, output_size)

In [13]:
# 8. Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Internally applies softmax on logits
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [14]:
# Create a TensorDataset from the training tensors
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

# Define a DataLoader with a chosen batch size (e.g., 64)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:

# Check if GPU is available and move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model to the device
model = model.to(device)

# Move data to the device
X_train_tensor = X_train_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

# Save the model state_dict
torch.save(model.state_dict(), "tag_classifier.pth")

# If you need to use the model for evaluation on CPU later
# model = model.to("cpu")

Using device: cuda
Epoch [10/100], Loss: 2.8165
Epoch [20/100], Loss: 2.2848


In [None]:
# Save the trained model
torch.save(model.state_dict(), "tag_classifier.pth")

In [None]:
# 10. Evaluation on the test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_pred = torch.argmax(outputs, dim=1).numpy()

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 0.5560


In [None]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            labels=np.unique(y_test),
                            target_names=label_encoder.inverse_transform(np.unique(y_test))))


Classification Report:
              precision    recall  f1-score   support

           A       0.00      0.00      0.00      6999
        ABBR       0.00      0.00      0.00       244
     ADDRESS       0.00      0.00      0.00        21
     ARTICLE       0.00      0.00      0.00       261
       ASIDE       0.00      0.00      0.00        12
           B       0.00      0.00      0.00       152
  BLOCKQUOTE       0.00      0.00      0.00         1
        BODY       0.00      0.00      0.00        75
      BUTTON       0.00      0.00      0.00      1994
      CANVAS       0.00      0.00      0.00         5
     CAPTION       0.00      0.00      0.00        35
         CNX       0.00      0.00      0.00         2
          DD       0.00      0.00      0.00         1
         DEL       0.00      0.00      0.00        10
     DETAILS       0.00      0.00      0.00        85
      DIALOG       0.00      0.00      0.00         1
         DIV       0.60      0.98      0.74     47326
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
