In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Load dataset with appropriate encoding to avoid decoding errors
df_all = pd.read_csv('figma_dataset.csv')

df = df_all[~df_all['tag'].str.contains('-', na=False)]



In [None]:
# Assume 'tag' is the target variable
y = df["tag"]
X = df.drop(columns=["tag"])


In [4]:
# Identify categorical and numerical columns
categorical_cols = []  # adjust as needed
numerical_cols = [col for col in X.columns if col not in categorical_cols]

In [5]:
# Create a preprocessor:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        (
            "cat",
            Pipeline(steps=[
                ("ord", OrdinalEncoder()),
                ("scaler", StandardScaler())
            ]),
            categorical_cols
        )
    ]
)

In [6]:
# Apply the preprocessor to our features
X_processed = preprocessor.fit_transform(X)

In [7]:
# Encode target labels
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

In [8]:
# Save the preprocessor and target encoder for reuse later
joblib.dump(preprocessor, "preprocessor.pkl")
joblib.dump(target_encoder, "target_encoder.pkl")

['target_encoder.pkl']

In [9]:
# Train/test split
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.2, random_state=42)

In [10]:
# Convert to PyTorch tensors.
# Note: OneHotEncoder returns a sparse matrix by default, so we convert to dense.
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
    X_test = X_test.toarray()

In [11]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [12]:
# Define Neural Network Model with dropout for improved regularization
class TagClassifier(nn.Module):
    def __init__(self, input_size, output_size):
        super(TagClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(32, output_size)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

In [13]:
input_size = X_train_tensor.shape[1]
output_size = len(target_encoder.classes_)
model = TagClassifier(input_size, output_size)

In [14]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [15]:
# Training loop
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

# Save the model state_dict
torch.save(model.state_dict(), "tag_classifier.pth")

Epoch [10/200], Loss: 4.1646
Epoch [20/200], Loss: 3.8527
Epoch [30/200], Loss: 3.3253
Epoch [40/200], Loss: 2.5774
Epoch [50/200], Loss: 2.0029
Epoch [60/200], Loss: 1.8045
Epoch [70/200], Loss: 1.6527
Epoch [80/200], Loss: 1.5380
Epoch [90/200], Loss: 1.4441
Epoch [100/200], Loss: 1.3683
Epoch [110/200], Loss: 1.3044
Epoch [120/200], Loss: 1.2516
Epoch [130/200], Loss: 1.2105
Epoch [140/200], Loss: 1.1735
Epoch [150/200], Loss: 1.1406
Epoch [160/200], Loss: 1.1158
Epoch [170/200], Loss: 1.0974
Epoch [180/200], Loss: 1.0758
Epoch [190/200], Loss: 1.0624
Epoch [200/200], Loss: 1.0467


In [16]:

# Evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    y_pred_classes = torch.argmax(y_pred, dim=1).numpy()

accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Accuracy: {accuracy:.4f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes, target_names=target_encoder.inverse_transform(np.unique(y_test))))

Accuracy: 0.7520

Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      3118
     ADDRESS       0.00      0.00      0.00        16
     ARTICLE       0.00      0.00      0.00       178
       ASIDE       0.00      0.00      0.00        12
           B       0.00      0.00      0.00         6
        BODY       0.00      0.00      0.00        22
      BUTTON       1.00      0.07      0.14       357
      CANVAS       0.00      0.00      0.00         2
        DATA       0.00      0.00      0.00         1
          DD       0.00      0.00      0.00         1
         DEL       0.00      0.00      0.00         7
     DETAILS       0.00      0.00      0.00         1
         DIV       0.75      0.99      0.85     10568
          DL       0.00      0.00      0.00         2
          DT       0.00      0.00      0.00         2
          EM       0.00      0.00      0.00        12
  FIGCAPTION       0.00      0.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
