In [113]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import joblib  # For saving encoders and scalers

In [114]:
# 1. Load dataset and remove rows with '-' in the tag column
df = pd.read_csv("figma_dataset.csv")
df = df[~df['tag'].str.contains('-')]

In [115]:
# 2. Separate features and target
y = df["tag"]
X = df.drop(columns=["tag"])

In [116]:
# 3. Identify categorical and continuous columns
categorical_cols = []
continuous_cols = [col for col in X.columns if col not in categorical_cols]

In [117]:
# Process categorical features with LabelEncoder
for col in categorical_cols:
    X[col] = X[col].astype(str)
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    # If you need to save individual encoders, consider saving them in a dictionary.

In [118]:
# Fill missing values in continuous columns and scale them
X[continuous_cols] = X[continuous_cols].fillna(0)
scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X[continuous_cols])
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [119]:
# Replace continuous columns in X with their scaled values
X_scaled = X.copy()
X_scaled[continuous_cols] = X_continuous_scaled

In [120]:
# 4. Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [121]:
# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [122]:
# Convert data to PyTorch tensors
# Note: X_train and X_test are DataFrames, so use .values to convert to NumPy arrays.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [123]:
# 7. Define the Neural Network Model with non-linear activations between linear layers
class TagClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(TagClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)  # First hidden layer
        self.fc2 = nn.Linear(64, 128)           # Second hidden layer
        self.fc3 = nn.Linear(128, 256)            # Third hidden layer
        # self.fc4 = nn.Linear(256, 512)            # Fourth hidden layer
        # self.fc5 = nn.Linear(512, 512)            # Fifth hidden layer
        # self.fc6 = nn.Linear(512, 512)            # Sixth hidden layer
        # self.fc7 = nn.Linear(512, 256)            # Seventh hidden layer
        self.fc8 = nn.Linear(256, 128)           # Eighth hidden layer
        self.fc9 = nn.Linear(128, output_size)   # Output layer
        self.relu = nn.ReLU()                  # Non-linear activation

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        # x = self.relu(self.fc4(x))
        # x = self.relu(self.fc5(x))
        # x = self.relu(self.fc6(x))
        # x = self.relu(self.fc7(x))
        x = self.relu(self.fc8(x))
        logits = self.fc9(x)  # No activation here: CrossEntropyLoss expects raw logits.
        return logits

In [124]:
# Initialize model
input_size = X_train_tensor.shape[1]
output_size = len(label_encoder.classes_)
model = TagClassifier(input_size, output_size)

In [125]:
# 8. Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Internally applies softmax on logits
optimizer = optim.Adam(model.parameters(), lr=0.00001)

In [126]:
# Create a TensorDataset from the training tensors
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

# Define a DataLoader with a chosen batch size (e.g., 64)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [127]:
# 9. Training loop

# Create a DataLoader with mini-batches
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() * batch_X.size(0)  # Accumulate loss weighted by batch size

    # Compute average loss for the epoch
    avg_loss = epoch_loss / len(train_dataset)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")


Epoch [10/100], Loss: 0.0763
Epoch [20/100], Loss: 0.0068
Epoch [30/100], Loss: 0.0008
Epoch [40/100], Loss: 0.0001
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000


In [128]:
# Save the trained model
torch.save(model.state_dict(), "tag_classifier.pth")

In [129]:
# 10. Evaluation on the test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    y_pred = torch.argmax(outputs, dim=1).numpy()

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 0.9997


In [130]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            labels=np.unique(y_test),
                            target_names=label_encoder.inverse_transform(np.unique(y_test))))


Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      3118
     ADDRESS       1.00      1.00      1.00        16
     ARTICLE       1.00      1.00      1.00       178
       ASIDE       1.00      0.83      0.91        12
           B       1.00      1.00      1.00         6
        BODY       1.00      1.00      1.00        22
      BUTTON       1.00      1.00      1.00       357
      CANVAS       1.00      1.00      1.00         2
        DATA       1.00      1.00      1.00         1
          DD       1.00      1.00      1.00         1
         DEL       1.00      1.00      1.00         7
     DETAILS       1.00      1.00      1.00         1
         DIV       1.00      1.00      1.00     10568
          DL       1.00      1.00      1.00         2
          DT       1.00      1.00      1.00         2
          EM       1.00      1.00      1.00        12
  FIGCAPTION       1.00      1.00      1.00        22
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
