In [45]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [46]:
# Load dataset with appropriate encoding to avoid decoding errors
df = pd.read_csv('figma_dataset.csv')

# df = df_all[~df_all['tag'].str.contains('-', na=False)]
# df = df[~df['tag'].str.contains('BODY', na=False)]

# df.to_csv('cleaned_figma_dataset.csv', index=False)

In [47]:
# Display first few rows to understand the structure
print(df.head())

   has_text  num_children  text_length  is_leaf  tag_A  tag_ABBR  \
0         0             3            0        0      0         0   
1         0             0            0        1      0         0   
2         0             1            0        0      0         0   
3         0             8            0        0      0         0   
4         0             1            0        0      0         0   

   tag_ACTIVATE-FEATURE  tag_AD-EVENT-TRACKER  tag_AD-UNIT  \
0                     0                     0            0   
1                     0                     0            0   
2                     0                     0            0   
3                     0                     0            0   
4                     0                     0            0   

   tag_AD-UNIT-MANAGER  ...  parent_tag_html_THEAD.1  \
0                    0  ...                        0   
1                    0  ...                        0   
2                    0  ...                       

In [48]:
# Define column categories based on the dataset attributes
categorical_cols = []  # Adjust as needed
numerical_cols = ['width', 'height', 'has_text', 'depth', 'num_children', 'sibling_count', 'is_leaf',
                   'has_background_color',  'text_length', 'x_quarter', 'y_quarter', 'aspect_ratio', 'area']

In [49]:
# Drop non-numeric values from numerical columns to prevent conversion errors
# df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

In [50]:
# Fill missing values with appropriate replacements
# df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

In [51]:
# Ensure all categorical columns are treated as strings before encoding
df[categorical_cols] = df[categorical_cols].astype(str)

In [52]:
# Encode categorical features using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [53]:
# Encode target variable
tag_encoder = LabelEncoder()
df['tag'] = tag_encoder.fit_transform(df['tag'])

In [54]:
# Split dataset into features and target
X = df.drop(columns=['tag'])  # Features
y = df['tag']  # Target variable

In [55]:
# Normalize numerical features to improve model performance
scaler = StandardScaler()
# X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
X = scaler.fit_transform(X)


In [56]:
# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
# Check if a pre-trained model exists
model_filename = "models/html_tag_model.pkl"
scaler_filename = "models/scaler.pkl"
tag_encoder_filename = "models/tag_encoder.pkl"
label_encoders_filename = "models/label_encoders.pkl"

try:
    model = joblib.load(model_filename)
    scaler = joblib.load(scaler_filename)
    tag_encoder = joblib.load(tag_encoder_filename)
    label_encoders = joblib.load(label_encoders_filename)
    print("Loaded pre-trained model.")
except FileNotFoundError:
    print("No pre-trained model found, training a new one.")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Save the trained model, scaler, and encoder
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
    joblib.dump(tag_encoder, tag_encoder_filename)
    joblib.dump(label_encoders, label_encoders_filename)
    print("Model saved for future use.")

No pre-trained model found, training a new one.
Model saved for future use.


In [58]:
# Predict and evaluate model performance
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred, labels=np.unique(y_test), target_names=tag_encoder.inverse_transform(np.unique(y_test))))

Accuracy: 0.9990
                                           precision    recall  f1-score   support

                                        A       1.00      1.00      1.00      3120
                                     ABBR       0.67      0.67      0.67         3
                                  ADDRESS       0.94      1.00      0.97        17
ADS-HOME-PAGE-EDITORIAL-SPOTLIGHT-MANAGER       0.00      0.00      0.00         1
     ADS-HOME-PAGE-FEATURED-MEDIA-MANAGER       0.00      0.00      0.00         1
                                  ARTICLE       0.99      1.00      1.00       183
                                    ASIDE       0.90      1.00      0.95         9
          AUTH-FLOW-GOOGLE-ONE-TAP-PROMPT       0.00      0.00      0.00         1
                                        B       1.00      0.86      0.92         7
                                     BODY       1.00      1.00      1.00        20
                                   BUTTON       1.00      1.00      1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
