In [108]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [109]:
# Load the raw dataset (pandas' default encoding is used; no explicit encoding is passed).
df_all = pd.read_csv('figma_dataset.csv')

# Rows to discard: any tag containing a hyphen, and any tag containing 'BODY'.
# Missing tags (NaN) count as "no match" and are therefore kept.
# NOTE(review): these are substring matches, so 'BODY' also removes tags such as
# 'TBODY' -- confirm that is intended.
drop_mask = (
    df_all['tag'].str.contains('-', na=False, regex=False)
    | df_all['tag'].str.contains('BODY', na=False, regex=False)
)

# .copy() makes df an independent frame: later cells assign columns into df, and doing
# that on a filtered slice of df_all triggers SettingWithCopyWarning.
df = df_all[~drop_mask].copy()

# Persist the filtered rows for reuse.
df.to_csv('cleaned_figma_dataset.csv', index=False)

In [110]:
# Preview the first five rows to check the structure of the cleaned frame
print(df.head(5))

   tag         type     width    height characters  has_text  depth  \
1  DIV        GROUP  0.990183  0.000000        NaN         0      1   
2  DIV        GROUP  0.990183  1.000000        NaN         0      1   
3  DIV        GROUP  0.990183  1.000000        NaN         0      2   
4  DIV        GROUP  0.990183  0.000000        NaN         0      3   
5    A  LINK_UNFURL  0.090969  0.004207        NaN         0      4   

   num_children parent_tag  sibling_count  ...  text_length  word_count  \
1      0.000000      FRAME       0.117647  ...          0.0         0.0   
2      0.055556      FRAME       0.117647  ...          0.0         0.0   
3      0.444444      GROUP       0.000000  ...          0.0         0.0   
4      0.055556      GROUP       0.411765  ...          0.0         0.0   
5      0.055556      GROUP       0.000000  ...          0.0         0.0   

   contains_number contains_special_chars  x_quarter  y_quarter  aspect_ratio  \
1                0                      0

In [111]:
# Column groups used by the preprocessing cells below.
# categorical_cols are cast to str and label-encoded; numerical_cols are coerced to
# numbers, median-filled and standardized. Adjust both lists if the dataset schema changes.
categorical_cols = [
    "type",
    "parent_tag",
    "characters",
    "font_name",
]
numerical_cols = [
    "width", "height", "has_text", "depth", "num_children", "sibling_count", "is_leaf",
    "font_size", "has_font_size",
    "has_text_color", "color_r", "color_g", "color_b",
    "has_background_color", "background_r", "background_g", "background_b",
    "border_radius", "border_r", "border_g", "border_b", "border_weight",
    "has_shadow", "shadow_r", "shadow_g", "shadow_b", "shadow_radius",
    "text_length", "word_count", "contains_number", "contains_special_chars",
    "has_border", "border_opacity",
    "x_quarter", "y_quarter", "aspect_ratio", "area",
    "normalized_width", "normalized_height",
]

In [112]:
# Coerce every numerical column to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce') instead of raising a conversion error.
df[numerical_cols] = df[numerical_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [113]:
# Replace missing numeric values with the median of their own column.
# The medians are computed over every row currently in df.
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(column_medians)

In [114]:
# Cast the categorical columns to str so each one holds a single type before encoding
categorical_as_text = df[categorical_cols].astype(str)
df[categorical_cols] = categorical_as_text

In [115]:
# Label-encode each categorical feature, keeping one fitted encoder per column
# (keyed by column name) so the same mapping can be reused later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

In [116]:
# Fit an encoder on the tag labels, then replace the labels with their integer class ids
tag_encoder = LabelEncoder()
tag_encoder.fit(df['tag'])
df['tag'] = tag_encoder.transform(df['tag'])

In [117]:
# Separate the encoded target column from the feature matrix
y = df['tag']  # Target variable
X = df.drop('tag', axis=1)  # Features: every remaining column

In [118]:
# Normalize numerical features to improve model performance
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [119]:
# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [120]:
# Reuse previously saved artifacts when all four files exist; otherwise train a new
# model and persist it together with the preprocessing objects fitted above.
model_filename = "models/html_tag_model.pkl"
scaler_filename = "models/scaler.pkl"
tag_encoder_filename = "models/tag_encoder.pkl"
label_encoders_filename = "models/label_encoders.pkl"

try:
    # All four loads are evaluated before any name is rebound. If one file is missing,
    # the freshly fitted scaler/encoders (the ones that produced X_train) stay in place,
    # so they are what gets saved next to the retrained model.
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
    # NOTE(review): loaded artifacts are not checked against the preprocessing that
    # produced X_test in this session -- confirm they come from the same dataset.
    model, scaler, tag_encoder, label_encoders = (
        joblib.load(model_filename),
        joblib.load(scaler_filename),
        joblib.load(tag_encoder_filename),
        joblib.load(label_encoders_filename),
    )
    print("Loaded pre-trained model.")
except FileNotFoundError:
    print("No pre-trained model found, training a new one.")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Save the trained model, scaler, and encoders. The parent directory is created
    # first because joblib.dump does not create missing directories.
    for artifact, artifact_path in (
        (model, model_filename),
        (scaler, scaler_filename),
        (tag_encoder, tag_encoder_filename),
        (label_encoders, label_encoders_filename),
    ):
        Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(artifact, artifact_path)
    print("Model saved for future use.")

No pre-trained model found, training a new one.
Model saved for future use.


In [121]:
# Predict on the held-out rows and report overall accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Per-class report restricted to the classes that actually occur in y_test, with the
# encoded ids mapped back to their tag names for display.
test_labels = np.unique(y_test)
print(classification_report(
    y_test,
    y_pred,
    labels=test_labels,
    target_names=tag_encoder.inverse_transform(test_labels),
    # Classes that are never predicted have an undefined precision; report it as 0
    # (the same value the default produces) without emitting a warning per metric.
    zero_division=0,
))

Accuracy: 0.9566
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      3140
        ABBR       1.00      0.50      0.67         2
     ADDRESS       0.85      1.00      0.92        11
     ARTICLE       0.99      0.93      0.96       196
       ASIDE       1.00      0.27      0.43        11
           B       1.00      0.50      0.67         6
  BLOCKQUOTE       0.00      0.00      0.00         1
      BUTTON       0.95      0.94      0.95       331
      CANVAS       0.00      0.00      0.00         1
        CITE       0.00      0.00      0.00         1
          DD       1.00      1.00      1.00         1
         DEL       1.00      1.00      1.00         8
     DETAILS       1.00      1.00      1.00         2
         DIV       0.94      0.99      0.97     10672
          DL       0.00      0.00      0.00         3
          DT       1.00      1.00      1.00         1
          EM       0.80      0.27      0.40        15
    FIELDS

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
