In [15]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# Load dataset with appropriate encoding to avoid decoding errors
df = pd.read_csv('figma_dataset2.csv', encoding='latin1')

In [17]:
# Display first few rows to understand the structure
print(df.head())

      tag   type       x       y  width   height characters  depth  \
0    BODY  FRAME       0       0  800.0  10577.0        NaN      0   
1     DIV  GROUP       0       0  800.0      0.0        NaN      1   
2     DIV  GROUP -100000 -100000  800.0    600.0        NaN      2   
3  IFRAME  GROUP -100000 -100000  800.0    600.0        NaN      3   
4     DIV  GROUP       0       0  800.0  10577.0        NaN      1   

   num_children parent_tag  ...  x_normalized  y_normalized  x_center  \
0             3        NaN  ...        100000        100000     400.0   
1             1       BODY  ...        100000        100000     400.0   
2             1        DIV  ...             0             0  -99600.0   
3             0        DIV  ...             0             0  -99600.0   
4             1       BODY  ...        100000        100000     400.0   

  y_center x_quarter y_quarter  aspect_ratio       area normalized_width  \
0   5288.5       0.5  0.500000      0.075636  8461600.0         

In [18]:
# Define column categories based on the dataset attributes
categorical_cols = ['type', 'characters', 'parent_tag', 'text_type', 'visibility', 'border_type', 'border_pattern', 'shadow_type']
numerical_cols = ['x', 'y', 'width', 'height', 'depth', 'num_children', 'sibling_count', 'is_leaf', 'font_size', 'font_weight',
                   'color', 'background_color', 'border_radius', 'border_color', 'border_opacity', 'border_weight', 'shadow_color',
                   'shadow_offset', 'shadow_radius', 'text_length', 'word_count', 'contains_number', 'contains_special_chars',
                   'x_normalized', 'y_normalized', 'x_center', 'y_center', 'x_quarter', 'y_quarter', 'aspect_ratio', 'area',
                   'normalized_width', 'normalized_height']

In [19]:
# Drop non-numeric values from numerical columns to prevent conversion errors
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

In [20]:
# Fill missing values with appropriate replacements
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

In [21]:
# Ensure all categorical columns are treated as strings before encoding
df[categorical_cols] = df[categorical_cols].astype(str)

In [22]:
# Encode categorical features using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [23]:
# Encode target variable
tag_encoder = LabelEncoder()
df['tag'] = tag_encoder.fit_transform(df['tag'])

In [24]:
# Split dataset into features and target
X = df.drop(columns=['tag'])  # Features
y = df['tag']  # Target variable

In [25]:
# Replace infinite values with NaN
X[numerical_cols] = X[numerical_cols].replace([np.inf, -np.inf], np.nan)

# Normalize numerical features
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [26]:
# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Check if a pre-trained model exists
model_filename = "models/html_tag_model.pkl"
scaler_filename = "models/scaler.pkl"
tag_encoder_filename = "models/tag_encoder.pkl"
label_encoders_filename = "models/label_encoders.pkl"

try:
    model = joblib.load(model_filename)
    scaler = joblib.load(scaler_filename)
    tag_encoder = joblib.load(tag_encoder_filename)
    label_encoders = joblib.load(label_encoders_filename)
    print("Loaded pre-trained model.")
except FileNotFoundError:
    print("No pre-trained model found, training a new one.")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Save the trained model, scaler, and encoder
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
    joblib.dump(tag_encoder, tag_encoder_filename)
    joblib.dump(label_encoders, label_encoders_filename)
    print("Model saved for future use.")

No pre-trained model found, training a new one.
Model saved for future use.


In [28]:
# Predict and evaluate model performance
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred, labels=np.unique(y_test), target_names=tag_encoder.inverse_transform(np.unique(y_test))))

Accuracy: 0.9532
                                           precision    recall  f1-score   support

                                        A       1.00      1.00      1.00      2412
                                  ADDRESS       1.00      0.89      0.94        28
                                  ARTICLE       0.99      0.89      0.94       136
                                    ASIDE       0.00      0.00      0.00         4
                                        B       1.00      0.36      0.53        14
                               BLOCKQUOTE       0.00      0.00      0.00         1
                                     BODY       1.00      0.94      0.97        16
                                   BUTTON       0.95      0.96      0.95       242
                                   CANVAS       1.00      1.00      1.00        14
                                     CITE       1.00      1.00      1.00         1
                                       DD       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
