In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the dataset CSV into a DataFrame. No explicit `encoding` argument is
# passed, so pandas' default decoding is used.
DATASET_PATH = 'figma_optimized_dataset.csv'
df = pd.read_csv(DATASET_PATH)

# A previously used cleaning step is disabled: it filtered out rows whose 'tag'
# contains '-' or 'BODY' (starting from a frame named `df_all`) and wrote the
# result to 'cleaned_figma_dataset.csv'.

In [3]:
# Print the first five rows as plain text to inspect the column layout
preview_rows = df.head(5)
print(preview_rows)

         PC1        PC2       PC8       PC7      PC10      PC11       PC4  \
0 -10.456620  19.955786 -1.254755 -4.230132  1.764622  1.261263  5.250890   
1  -1.346738   1.510911 -3.105211  1.343486 -0.117786 -1.069568 -2.537830   
2  -9.924718  19.835341 -1.896483 -4.941873  2.530880  1.099762  5.110145   
3 -11.506617  19.986284  1.212070 -3.108289  0.270314  0.775446  5.773297   
4  -2.393656   1.039023 -1.591040  1.379942 -0.078820 -1.282428 -2.578143   

        PC6       PC5       PC3   tag  
0  0.086313 -0.323922 -4.935746  BODY  
1 -0.378217  0.027428  1.778215   DIV  
2 -0.038850 -0.356529 -4.959775   DIV  
3  0.263192 -0.209072 -5.113504   DIV  
4 -0.558794  0.218324  1.756876   DIV  


In [4]:
# Column groups consumed by the preprocessing cells.
# NOTE(review): the df.head() preview shows PC-prefixed columns plus 'tag',
# none of which are listed here, and in the visible cells `numerical_cols` is
# only referenced from commented-out code. Confirm these names are still needed.
categorical_cols = []  # no categorical feature columns are configured
numerical_cols = [
    'width',
    'height',
    'has_text',
    'depth',
    'num_children',
    'sibling_count',
    'is_leaf',
    'has_background_color',
    'text_length',
    'x_quarter',
    'y_quarter',
    'aspect_ratio',
    'area',
]

In [5]:
# Disabled: coerce non-numeric entries in the numerical columns to NaN (they are not dropped) to prevent conversion errors
# df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

In [6]:
# Disabled: fill missing values in the numerical columns with each column's median
# df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

In [7]:
# Cast every configured categorical column to str so the label encoders
# always receive string values. With an empty `categorical_cols` this does nothing.
for column_name in categorical_cols:
    df[column_name] = df[column_name].astype(str)

In [8]:
# Replace each categorical feature column with integer codes and keep the
# fitted encoder per column so the same mapping can be reused later.
label_encoders = {}
for column_name in categorical_cols:
    column_encoder = LabelEncoder().fit(df[column_name])
    df[column_name] = column_encoder.transform(df[column_name])
    label_encoders[column_name] = column_encoder

In [9]:
# Convert the 'tag' target column to integer class codes.
# The column is overwritten in place, so running this cell a second time
# would fit the encoder on the integer codes instead of the original tag names.
tag_encoder = LabelEncoder()
encoded_tags = tag_encoder.fit_transform(df['tag'])
df['tag'] = encoded_tags

In [10]:
# Separate the encoded target from the feature matrix
y = df['tag']               # target: encoded tag
X = df.drop('tag', axis=1)  # features: every remaining column

In [11]:
# Standardize every feature column; X becomes a NumPy array from here on.
# (A disabled variant scaled only `numerical_cols`.)
# NOTE(review): the scaler is fit on the full feature matrix before the
# train/test split, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
TEST_FRACTION = 0.2
SPLIT_SEED = 42
split_parts = train_test_split(X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Reuse previously saved artifacts when ALL of them are present; otherwise
# train a new model and persist it together with the preprocessing objects.
model_filename = "models/html_tag_model.pkl"
scaler_filename = "models/scaler.pkl"
tag_encoder_filename = "models/tag_encoder.pkl"
label_encoders_filename = "models/label_encoders.pkl"
artifact_paths = (model_filename, scaler_filename, tag_encoder_filename, label_encoders_filename)

try:
    # Load into a temporary tuple first: if any file is missing, none of the
    # in-memory objects (scaler, tag_encoder, label_encoders) gets replaced by
    # a partially loaded, possibly stale artifact before retraining.
    # SECURITY: joblib.load unpickles its input and can execute arbitrary
    # code; only load artifact files written by this notebook.
    loaded_artifacts = tuple(joblib.load(path) for path in artifact_paths)
except FileNotFoundError:
    print("No pre-trained model found, training a new one.")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Save the trained model, scaler, and encoders. joblib.dump does not
    # create missing directories, so make sure the target folder exists.
    for path, artifact in zip(artifact_paths, (model, scaler, tag_encoder, label_encoders)):
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        joblib.dump(artifact, path)
    print("Model saved for future use.")
else:
    # NOTE(review): X and y were already transformed with the freshly fitted
    # scaler/encoders above; the loaded ones are assumed to match them.
    # Delete the files under models/ after changing the dataset or preprocessing.
    model, scaler, tag_encoder, label_encoders = loaded_artifacts
    print("Loaded pre-trained model.")

No pre-trained model found, training a new one.
Model saved for future use.


In [14]:
# Predict on the held-out rows and report overall accuracy plus per-class metrics
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Restrict the report to classes that actually occur in the test split and
# map their integer codes back to the original tag names.
present_labels = np.unique(y_test)
report = classification_report(
    y_test,
    y_pred,
    labels=present_labels,
    target_names=tag_encoder.inverse_transform(present_labels),
    # Classes that are never predicted have undefined precision; report them
    # as 0 explicitly instead of emitting a warning per metric.
    zero_division=0,
)
print(report)

Accuracy: 0.8841
              precision    recall  f1-score   support

           A       0.89      0.84      0.86      3118
     ADDRESS       0.88      0.88      0.88        16
     ARTICLE       0.97      0.87      0.91       178
       ASIDE       0.00      0.00      0.00        12
           B       1.00      0.17      0.29         6
        BODY       1.00      0.82      0.90        22
      BUTTON       0.90      0.72      0.80       357
      CANVAS       0.00      0.00      0.00         2
        DATA       1.00      1.00      1.00         1
          DD       1.00      1.00      1.00         1
         DEL       1.00      0.71      0.83         7
     DETAILS       1.00      1.00      1.00         1
         DIV       0.88      0.96      0.92     10568
          DL       0.00      0.00      0.00         2
          DT       0.50      0.50      0.50         2
          EM       1.00      0.08      0.15        12
  FIGCAPTION       1.00      0.41      0.58        22
      FIGU

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
