In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb

In [2]:
# Read the whole CSV into memory; latin1 decoding is used so that bytes which
# are not valid UTF-8 do not abort the load.
df = pd.read_csv('figma_dataset3.csv', encoding='latin1')
# Report the number of rows that were read.
print(df.shape[0])

  df = pd.read_csv('figma_dataset3.csv', encoding='latin1')


7641391


In [3]:
# Peek at the first five rows to see which columns the dataset provides
print(df.head(n=5))

    tag   type  x  y  width   height characters  depth  num_children  \
0  BODY  FRAME  0  0  800.0  11403.0        NaN      0             3   
1   DIV  GROUP  0  0  800.0      0.0        NaN      1             0   
2   DIV  GROUP  0  0  800.0  11403.0        NaN      1             1   
3   DIV  GROUP  0  0  800.0  11403.0        NaN      2             8   
4   DIV  GROUP  0  0  800.0      0.0        NaN      3             1   

  parent_tag  ...  x_normalized  y_normalized  x_center y_center x_quarter  \
0        NaN  ...             1          9999     400.0   5701.5       0.5   
1       BODY  ...             1          9999     400.0      0.0       0.5   
2       BODY  ...             1          9999     400.0   5701.5       0.5   
3        DIV  ...             1          9999     400.0   5701.5       0.5   
4        DIV  ...             1          9999     400.0      0.0       0.5   

  y_quarter  aspect_ratio       area normalized_width normalized_height  
0       0.5      0.07015

In [4]:
# Feature columns grouped by how they are preprocessed further down.
# Categorical columns are cast to str and label-encoded.
categorical_cols = [
    'type', 'characters', 'parent_tag', 'is_leaf',
    'font_weight', 'color', 'contains_special_chars',
    'background_color', 'border_color', 'shadow_color',
    'contains_number', 'text_type', 'visibility',
    'border_type', 'border_pattern', 'shadow_type', 'shadow_offset',
]
# Numerical columns are coerced to numbers, cleaned and standardized.
numerical_cols = [
    'x', 'y', 'width', 'height',
    'depth', 'num_children', 'sibling_count',
    'font_size', 'border_radius', 'border_opacity', 'border_weight',
    'shadow_radius', 'text_length', 'word_count',
    'x_normalized', 'y_normalized', 'x_center', 'y_center',
    'x_quarter', 'y_quarter', 'aspect_ratio', 'area',
    'normalized_width', 'normalized_height',
]

In [5]:
# Coerce every numerical column to a numeric dtype. Entries that cannot be
# parsed are turned into NaN (errors='coerce') rather than raising; no rows
# are dropped here.
for num_col in numerical_cols:
    df[num_col] = pd.to_numeric(df[num_col], errors='coerce')

In [6]:
# Cast the categorical columns to str so that each label encoder sees one
# uniform type per column.
str_dtypes = {name: str for name in categorical_cols}
df[categorical_cols] = df[categorical_cols].astype(str_dtypes)

In [7]:
# Fit one LabelEncoder per categorical column and replace the column by its
# integer codes; the fitted encoders are kept, keyed by column name.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

In [8]:
# Map the target tags to integer class ids. The fitted encoder is kept so the
# ids can be translated back to tag names when reporting results.
tag_encoder = LabelEncoder()
encoded_tags = tag_encoder.fit_transform(df['tag'])
df['tag'] = encoded_tags

In [9]:
# Separate the encoded target from the feature columns.
# NOTE(review): X keeps every column of df except 'tag', not only the columns
# listed in categorical_cols / numerical_cols — confirm that is intended.
y = df['tag']               # target: encoded tag ids
X = df.drop('tag', axis=1)  # features: all remaining columns

In [10]:
# Turn +/-inf into NaN, then fill every NaN in the numerical columns with 0
finite_numeric = X[numerical_cols].replace([np.inf, -np.inf], np.nan)
X[numerical_cols] = finite_numeric.fillna(0)

In [11]:
# print(X[numerical_cols].isnull().sum())
# print(df[numerical_cols].median())
# print(X[numerical_cols].std())

In [12]:
# Standardize the numerical features with a StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split that happens further down, so the held-out rows contribute
# to the scaling statistics. Consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(X[numerical_cols])
X[numerical_cols] = scaler.transform(X[numerical_cols])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Reuse a previously trained model when one is on disk; otherwise train a new
# LightGBM classifier and persist it together with the preprocessing objects.
import os

model_filename = "models/html_tag_model.pkl"
scaler_filename = "models/scaler.pkl"
tag_encoder_filename = "models/tag_encoder.pkl"
label_encoders_filename = "models/label_encoders.pkl"

try:
    # NOTE(review): joblib.load unpickles the file; only load files you trust.
    # NOTE(review): only the model is loaded here. The scaler and encoders in
    # memory were re-fitted by the cells above — confirm they match the ones
    # the stored model was trained with.
    model = joblib.load(model_filename)
    print("Loaded pre-trained model.")
except FileNotFoundError:
    print("No pre-trained model found, training a new one.")

    # Create the output directories *before* the long training run, so a
    # missing folder cannot make joblib.dump fail after training has finished.
    for artifact_path in (model_filename, scaler_filename,
                          tag_encoder_filename, label_encoders_filename):
        os.makedirs(os.path.dirname(artifact_path) or ".", exist_ok=True)

    model = lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.1,
        random_state=42,
        class_weight='balanced',  # Helps to address the class imbalance
        force_col_wise=True
    )
    model.fit(X_train, y_train)

    # Save the model and the fitted preprocessing objects
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
    joblib.dump(tag_encoder, tag_encoder_filename)
    joblib.dump(label_encoders, label_encoders_filename)
    print("Model saved for future use.")

In [None]:
# Score the held-out rows: overall accuracy plus a per-tag report
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Restrict the report to the classes that actually occur in y_test and show
# them under their original tag names.
present_labels = np.unique(y_test)
report = classification_report(
    y_test,
    y_pred,
    labels=present_labels,
    target_names=tag_encoder.inverse_transform(present_labels),
)
print(report)

In [None]:
import dask.dataframe as dd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
import numpy as np
import pandas as pd

# Feature columns used by the Dask / Keras pipeline below.
# Categorical columns are one-hot encoded.
categorical_cols = [
    'type', 'characters', 'parent_tag', 'is_leaf',
    'font_weight', 'color', 'contains_special_chars',
    'background_color', 'border_color', 'shadow_color',
    'contains_number', 'text_type', 'visibility',
    'border_type', 'border_pattern', 'shadow_type', 'shadow_offset',
]
# Numerical columns are standardized.
numerical_cols = [
    'x', 'y', 'width', 'height',
    'depth', 'num_children', 'sibling_count',
    'font_size', 'border_radius', 'border_opacity', 'border_weight',
    'shadow_radius', 'text_length', 'word_count',
    'x_normalized', 'y_normalized', 'x_center', 'y_center',
    'x_quarter', 'y_quarter', 'aspect_ratio', 'area',
    'normalized_width', 'normalized_height',
]

# Function to load the data with Dask and preprocess it for the neural network
def load_and_preprocess_in_chunks(file_path, chunk_size=100000):
    """Load the CSV with Dask, build the feature matrix and split it.

    Categorical columns are one-hot encoded, numerical columns are cleaned
    (inf/NaN -> 0) and standardized, and the ``tag`` column is label-encoded.
    The scaler is fitted on the training rows only and then applied to the
    validation rows.

    Args:
        file_path: Path of the CSV file to read (decoded as latin1).
        chunk_size: Kept for backward compatibility; it is not used. Dask
            chooses the partitioning of the file itself.

    Returns:
        A tuple ``(X_train, X_val, y_train, y_val, tag_encoder)`` where the
        ``X_*`` values are float32 pandas DataFrames, the ``y_*`` values are
        arrays of encoded tag ids and ``tag_encoder`` is the fitted
        ``LabelEncoder`` for the ``tag`` column.
    """
    feature_cols = categorical_cols + numerical_cols

    # Read only the columns that are needed and pin the text-like columns to
    # object dtype. Dask infers dtypes from a sample at the start of the
    # file, so a column that is empty there but holds text later would
    # otherwise make the read fail with a dtype mismatch.
    # NOTE(review): numerical columns still rely on Dask's dtype inference.
    ddf = dd.read_csv(
        file_path,
        encoding='latin1',
        usecols=feature_cols + ['tag'],
        dtype={col: 'object' for col in categorical_cols + ['tag']},
    )

    # Fix the column order
    ddf = ddf[feature_cols + ['tag']]

    # Convert categorical columns to string and then to category type
    for col in categorical_cols:
        ddf[col] = ddf[col].astype('str')  # Treat every value as a string
        ddf[col] = ddf[col].astype('category')  # Convert to categorical
        ddf[col] = ddf[col].cat.as_known()  # get_dummies needs known categories

    # One-hot encode the categorical columns (still lazy)
    ddf_categorical = dd.get_dummies(ddf[categorical_cols], dummy_na=True)

    # Combine the one-hot block with the *raw* numerical columns. Scaling is
    # deliberately postponed: running StandardScaler.fit_transform through
    # map_partitions would refit the scaler on every partition separately.
    X_processed = dd.concat([ddf_categorical, ddf[numerical_cols]], axis=1)

    # Materialize features and target as pandas objects. Both come from the
    # same Dask frame, so their rows are in the same order; the per-partition
    # index is replaced by a plain positional one.
    X_processed = X_processed.compute().reset_index(drop=True)
    tags = ddf['tag'].compute().reset_index(drop=True)

    # Encode the target on the materialized Series. LabelEncoder returns a
    # NumPy array, so there is nothing left to .compute() afterwards.
    tag_encoder = LabelEncoder()
    y_encoded = tag_encoder.fit_transform(tags)

    # Same cleanup as the pandas pipeline: inf -> NaN -> 0
    X_processed[numerical_cols] = (
        X_processed[numerical_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
    )

    # Train-test split
    X_train, X_val, y_train, y_val = train_test_split(
        X_processed, y_encoded, test_size=0.2, random_state=42
    )

    # Use one numeric dtype for all columns (dummy columns may be bool or
    # uint8); astype also returns fresh frames that are safe to assign into.
    X_train = X_train.astype(np.float32)
    X_val = X_val.astype(np.float32)

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols]).astype(np.float32)
    X_val[numerical_cols] = scaler.transform(X_val[numerical_cols]).astype(np.float32)

    return X_train, X_val, y_train, y_val, tag_encoder


# Create and compile model
def create_model(input_shape, num_classes):
    """Build and compile a small fully-connected softmax classifier.

    Args:
        input_shape: Shape of one input sample, passed to the input layer.
        num_classes: Number of units of the final softmax layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    layers = tf.keras.layers
    stack = [
        layers.InputLayer(input_shape=input_shape),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ]

    network = tf.keras.Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Function to evaluate the model
def evaluate_model(model, X_test, y_test, tag_encoder):
    """Print the accuracy and a per-tag classification report.

    Args:
        model: Model whose ``predict`` returns one score per class for each row.
        X_test: Features to predict on.
        y_test: Encoded true labels for ``X_test``.
        tag_encoder: Fitted encoder used to turn label ids back into tag names.
    """
    class_scores = model.predict(X_test)
    # The predicted class is the index of the highest score in each row
    y_pred_classes = np.argmax(class_scores, axis=1)

    print(f'Accuracy: {accuracy_score(y_test, y_pred_classes):.4f}')

    # Only report classes that occur in y_test, under their original names
    present_labels = np.unique(y_test)
    report = classification_report(
        y_test,
        y_pred_classes,
        labels=present_labels,
        target_names=tag_encoder.inverse_transform(present_labels),
    )
    print("Classification Report:")
    print(report)

# Train and evaluate model
def train_and_evaluate_model(file_path, epochs=10, model_path='tag_predictor_model.h5'):
    """Train the Keras tag classifier on a CSV file, save it and evaluate it.

    Args:
        file_path: Path of the CSV file handed to
            ``load_and_preprocess_in_chunks``.
        epochs: Number of training epochs (default 10, the previously
            hard-coded value).
        model_path: Where the trained model is saved (default
            ``'tag_predictor_model.h5'``, the previously hard-coded path).

    Returns:
        A tuple ``(model, tag_encoder)`` with the trained model and the
        fitted label encoder of the target column.
    """
    # Load and preprocess the data
    X_train, X_val, y_train, y_val, tag_encoder = load_and_preprocess_in_chunks(file_path)

    # One output unit per tag known to the encoder
    num_classes = len(tag_encoder.classes_)
    model = create_model(input_shape=(X_train.shape[1],), num_classes=num_classes)

    # Train the model, monitoring the validation split after every epoch
    model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val))

    # Save the model
    model.save(model_path)
    print(f"Model saved as '{model_path}'")

    # Evaluate the model on the validation set
    evaluate_model(model, X_val, y_val, tag_encoder)

    return model, tag_encoder

In [None]:
# Run the full Dask/Keras pipeline on the dataset
train_and_evaluate_model(file_path='figma_dataset3.csv')