In [None]:
import os
import gc
import psutil
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import h5py
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import ast
import warnings
from torch.cuda.amp import autocast, GradScaler

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Process-memory reporting helper
def print_memory_usage():
    """Print and return the resident set size (RSS) of the current process.

    Returns:
        float: RSS of this Python process expressed in GB (bytes / 1024**3).
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    memory_gb = rss_bytes / 1024**3
    print(f"Current memory usage: {memory_gb:.2f} GB")
    return memory_gb

# Set memory limit and device
# MEMORY_LIMIT_GB is a soft threshold: the training loop compares the process RSS
# against it and only prints a warning and frees caches when it is exceeded.
MEMORY_LIMIT_GB = 7.5  # Leave some buffer
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print_memory_usage()

# Force garbage collection
gc.collect()
if torch.cuda.is_available():
    # Release cached CUDA blocks held by PyTorch's allocator.
    torch.cuda.empty_cache()

# Initialize GradScaler for mixed precision
# Module-level scaler; safe_training_step reads it as a global.
scaler = GradScaler()

In [None]:
class MemoryEfficientFigmaDataset(Dataset):
    """In-memory dataset of Figma node sequences.

    The data file is a flat list of ``(feature_vector, tag)`` records. A record
    whose tag equals ``SEQUENCE_END_TAG`` closes the current sequence; any
    records left after the last end tag form a final, unterminated sequence.

    Sequence ids of the form ``'seq_<n>'`` are resolved to the ``n``-th sequence
    in file order. Resolving by the number in the id (rather than by the id's
    position in ``sequence_ids``) is what keeps a train split and a validation
    split disjoint: with positional lookup both splits would load the first
    sequences of the file. Ids that do not follow the ``'seq_<n>'`` pattern fall
    back to positional lookup.

    Args:
        data_path: Path to a ``.csv``, ``.h5`` or ``.hdf5`` file with
            ``feature_vector`` and ``tag`` columns/datasets.
        sequence_ids: Ids of the sequences this dataset should expose.
        label_encoder: Object with a ``transform(list_of_tags)`` method that
            returns integer labels (e.g. a fitted sklearn ``LabelEncoder``).
        load_in_memory: When True (default) all requested sequences are loaded
            eagerly. Lazy loading is not implemented.
    """

    SEQUENCE_END_TAG = 'E_WEBSITE'
    _SEQ_ID_PREFIX = 'seq_'

    def __init__(self, data_path, sequence_ids, label_encoder, load_in_memory=True):
        self.data_path = data_path
        self.sequence_ids = list(sequence_ids)
        self.label_encoder = label_encoder
        self.load_in_memory = load_in_memory

        if load_in_memory:
            self._load_sequences()
        else:
            self.sequences = None

    @classmethod
    def _sequence_index(cls, seq_id, position):
        """Return the file-order index for ``seq_id``.

        ``'seq_<n>'`` maps to ``n``; any other id maps to ``position`` (its
        index inside ``sequence_ids``).
        """
        if isinstance(seq_id, str) and seq_id.startswith(cls._SEQ_ID_PREFIX):
            suffix = seq_id[len(cls._SEQ_ID_PREFIX):]
            if suffix.isdigit():
                return int(suffix)
        return position

    def _read_records(self):
        """Read all records from ``data_path``; returns ``(feature_vectors, tags)``.

        Raises:
            ValueError: If the file extension is not .csv, .h5 or .hdf5.
        """
        if self.data_path.endswith('.csv'):
            df = pd.read_csv(self.data_path)
            feature_vectors = [np.array(ast.literal_eval(raw)) for raw in df['feature_vector']]
            tags = df['tag'].tolist()
        elif self.data_path.endswith(('.hdf5', '.h5')):
            with h5py.File(self.data_path, 'r') as f:
                feature_vectors = f['feature_vector'][:]
                tags = [s.decode('utf-8') for s in f['tag'][:]]
        else:
            raise ValueError(
                f"Unsupported data format: {self.data_path!r} (expected .csv, .h5 or .hdf5)"
            )
        return feature_vectors, tags

    def _build_sequences(self, feature_vectors, tags):
        """Slice flat records into sequences and keep those named in ``sequence_ids``.

        Populates ``self.sequences`` (seq_id -> {'features', 'labels'} tensors).
        Ids that do not resolve to a sequence in the data are reported and
        removed from ``self.sequence_ids`` so ``len(self)`` matches what can
        actually be indexed.
        """
        tags = list(tags)

        # (start, end) record offsets of every sequence, in file order.
        bounds = []
        start = 0
        for position, tag in enumerate(tags):
            if tag == self.SEQUENCE_END_TAG:
                bounds.append((start, position + 1))
                start = position + 1
        if start < len(tags):
            # Trailing records without a closing end tag still form a sequence.
            bounds.append((start, len(tags)))

        self.sequences = {}
        resolved_ids = []
        missing_ids = []
        for position, seq_id in enumerate(self.sequence_ids):
            index = self._sequence_index(seq_id, position)
            if not 0 <= index < len(bounds):
                missing_ids.append(seq_id)
                continue

            first, last = bounds[index]
            features = np.stack(list(feature_vectors[first:last]))
            labels = np.asarray(self.label_encoder.transform(tags[first:last]))

            self.sequences[seq_id] = {
                'features': torch.as_tensor(features, dtype=torch.float32),
                'labels': torch.as_tensor(labels, dtype=torch.long)
            }
            resolved_ids.append(seq_id)

        if missing_ids:
            print(f"Warning: {len(missing_ids)} sequence ids not found in data; dropping them")
            self.sequence_ids = resolved_ids

    def _load_sequences(self):
        """Load sequences into memory."""
        print("Loading sequences into memory...")

        feature_vectors, tags = self._read_records()
        self._build_sequences(feature_vectors, tags)

        del feature_vectors, tags
        gc.collect()
        print(f"Loaded {len(self.sequences)} sequences")
        print_memory_usage()

    def __len__(self):
        return len(self.sequence_ids)

    def __getitem__(self, idx):
        """Return ``{'features', 'labels', 'seq_id'}`` for the idx-th sequence id.

        Raises:
            NotImplementedError: If the dataset was created with
                ``load_in_memory=False``.
        """
        seq_id = self.sequence_ids[idx]

        if self.load_in_memory:
            return {
                'features': self.sequences[seq_id]['features'],
                'labels': self.sequences[seq_id]['labels'],
                'seq_id': seq_id
            }
        else:
            raise NotImplementedError("Lazy loading not implemented in this example")

def memory_efficient_collate_fn(batch, max_seq_length=500):
    """Pad (and, if needed, truncate) a list of sequence items into one batch.

    Args:
        batch: List of dicts with keys ``'features'`` (tensor of shape
            ``(seq_len, feature_dim)``), ``'labels'`` (tensor of shape
            ``(seq_len,)``) and ``'seq_id'``.
        max_seq_length: Upper bound on the padded length; longer sequences are
            cut to this many steps. Defaults to 500. To use another value with
            a ``DataLoader``, wrap this function with ``functools.partial``.

    Returns:
        dict with
            ``'features'``: float32 tensor ``(batch, max_len, feature_dim)``, zero padded;
            ``'labels'``: long tensor ``(batch, max_len)``, padded with -100;
            ``'lengths'``: long tensor with each sequence's (possibly truncated) length;
            ``'seq_ids'``: list of the items' ids, in batch order.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("Cannot collate an empty batch")

    features = [item['features'] for item in batch]
    labels = [item['labels'] for item in batch]
    seq_ids = [item['seq_id'] for item in batch]

    # Work with plain Python ints so tensor sizes and slice bounds are never
    # 0-dim tensors.
    raw_lengths = [int(f.shape[0]) for f in features]
    max_len = max(raw_lengths)

    if max_len > max_seq_length:
        max_len = max_seq_length
        print(f"Warning: Truncating sequences to {max_seq_length} tokens")

    feature_dim = features[0].shape[1]

    padded_features = torch.zeros((len(batch), max_len, feature_dim), dtype=torch.float32)
    # -100 marks padded label positions.
    padded_labels = torch.full((len(batch), max_len), -100, dtype=torch.long)

    kept_lengths = []
    for i, (f, l) in enumerate(zip(features, labels)):
        seq_len = min(raw_lengths[i], max_len)
        padded_features[i, :seq_len] = f[:seq_len]
        padded_labels[i, :seq_len] = l[:seq_len]
        kept_lengths.append(seq_len)

    return {
        'features': padded_features,
        'labels': padded_labels,
        'lengths': torch.tensor(kept_lengths),
        'seq_ids': seq_ids
    }

# Cell footer: confirm the definitions above were executed and report memory.
print("Memory-efficient dataset classes defined")
print_memory_usage()

In [None]:
class OptimizedFigmaBLSTM(nn.Module):
    """Bidirectional LSTM tagger that emits one class-logit vector per time step.

    Pipeline: packed BiLSTM -> dropout -> BatchNorm1d over the flattened
    (batch * time) positions -> linear projection to ``output_dim``.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: LSTM hidden size per direction (the LSTM output is
            ``2 * hidden_dim`` wide because it is bidirectional).
        output_dim: Number of classes.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the LSTM output, and between
            LSTM layers when ``num_layers > 1``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.3):
        super(OptimizedFigmaBLSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            # Inter-layer dropout is only applied when there is more than one layer.
            dropout=dropout if num_layers > 1 else 0
        )

        self.attention = None  # Disabled for memory efficiency

        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        """Compute per-step logits.

        Args:
            x: Float tensor of shape ``(batch, seq_len, input_dim)``.
            lengths: 1-D tensor with the number of valid steps of each sequence.

        Returns:
            Tensor of shape ``(batch, seq_len, output_dim)``. The time dimension
            always equals that of ``x``, even when ``x`` is padded beyond the
            longest entry in ``lengths``.
        """
        batch_size, seq_len, _ = x.size()

        packed_input = nn.utils.rnn.pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        packed_output, _ = self.lstm(packed_input)
        # total_length pins the unpacked time dimension to seq_len; without it
        # the output is only max(lengths) long and the reshape below breaks
        # whenever x carries extra padding.
        output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=seq_len
        )

        output = self.dropout(output)

        flat = output.reshape(-1, output.size(-1))
        # In training mode BatchNorm1d needs more than one sample to compute
        # batch statistics; in eval mode it uses running statistics and must
        # always be applied so inference matches what the model was trained with.
        if not (self.training and flat.size(0) < 2):
            flat = self.batch_norm(flat)
        output = flat.reshape(batch_size, seq_len, -1)

        logits = self.fc(output)

        return logits

# Cell footer: confirm the model class was defined and report memory.
print("Optimized model architecture defined")
print_memory_usage()

In [None]:
def load_and_split_data(data_path, test_size=0.2, random_state=42, max_sequences=None):
    """Load the record file, fit the label encoder and split sequence ids.

    Records are grouped into sequences in file order: a record tagged
    ``'E_WEBSITE'`` closes the current sequence, and records left after the
    last end tag form one final sequence. Sequence ``i`` gets the id
    ``'seq_<i>'``.

    Args:
        data_path: Path to a ``.csv``, ``.h5`` or ``.hdf5`` file with
            ``feature_vector`` and ``tag`` columns/datasets.
        test_size: Fraction of sequences assigned to validation.
        random_state: Seed forwarded to ``train_test_split``.
        max_sequences: If truthy, only the first ``max_sequences`` sequences
            are considered.

    Returns:
        tuple: ``(train_seq_ids, val_seq_ids, label_encoder, num_classes, input_dim)``.

    Raises:
        ValueError: If the file extension is unsupported or the file holds no
            records.
    """
    print(f"Loading data from {data_path}...")

    if data_path.endswith('.csv'):
        chunk_size = 10000
        chunks = []
        for chunk in pd.read_csv(data_path, chunksize=chunk_size):
            chunk['feature_vector'] = chunk['feature_vector'].apply(ast.literal_eval).apply(np.array)
            chunks.append(chunk)
        df = pd.concat(chunks, ignore_index=True)
        del chunks
        gc.collect()
    elif data_path.endswith(('.hdf5', '.h5')):
        with h5py.File(data_path, 'r') as f:
            feature_vectors = f['feature_vector'][:]
            tags = [s.decode('utf-8') for s in f['tag'][:]]
            df = pd.DataFrame({
                'feature_vector': list(feature_vectors),
                'tag': tags
            })
    else:
        raise ValueError(
            f"Unsupported data format: {data_path!r} (expected .csv, .h5 or .hdf5)"
        )

    print(f"Loaded {len(df)} records")
    print_memory_usage()

    if df.empty:
        raise ValueError(f"No records found in {data_path!r}")

    # Only the number of sequences is needed here, so count end tags instead
    # of materialising every row as a pandas Series.
    num_sequences = 0
    open_records = 0
    for tag in df['tag']:
        open_records += 1
        if tag == 'E_WEBSITE':
            num_sequences += 1
            open_records = 0
            if max_sequences and num_sequences >= max_sequences:
                break

    if open_records:
        # Trailing records without a closing end tag count as one more sequence.
        num_sequences += 1

    print(f"Found {num_sequences} sequences")

    if max_sequences:
        num_sequences = min(num_sequences, max_sequences)
        print(f"Limited to {num_sequences} sequences for memory efficiency")

    # The encoder is fitted on every tag in the file, not only on the tags of
    # the sequences kept above.
    all_tags = df['tag'].unique()
    label_encoder = LabelEncoder()
    label_encoder.fit(all_tags)
    print(f"Found {len(label_encoder.classes_)} unique tags")

    sequence_ids = [f'seq_{i}' for i in range(num_sequences)]

    train_seq_ids, val_seq_ids = train_test_split(
        sequence_ids, test_size=test_size, random_state=random_state
    )

    print(f"Split: {len(train_seq_ids)} train, {len(val_seq_ids)} validation sequences")

    # Feature width is read from the first record of the file.
    input_dim = np.asarray(df['feature_vector'].iloc[0]).shape[0]
    print(f"Input feature dimension: {input_dim}")

    del df
    gc.collect()
    print_memory_usage()

    return train_seq_ids, val_seq_ids, label_encoder, len(label_encoder.classes_), input_dim

# Dataset location; load_and_split_data branches on the .csv / .h5 / .hdf5 extension.
DATA_PATH = "figma_dataset_custom.h5"  # Update this path
# Upper bound on the number of sequences that are split into train/validation ids.
MAX_SEQUENCES = 1000  # Adjust as needed

# Produces the id lists for both splits, the fitted label encoder, the number
# of classes and the feature-vector width used to size the model.
train_seq_ids, val_seq_ids, label_encoder, num_classes, input_dim = load_and_split_data(
    DATA_PATH, max_sequences=MAX_SEQUENCES
)

print(f"Data loading completed. Memory usage:")
print_memory_usage()

In [None]:
# Hyperparameters shared by the model constructor, the optimizer, the data
# loaders and the training loop; the dict is also stored in the checkpoint.
model_config = {
    'hidden_dim': 128,
    'num_layers': 1,
    'dropout': 0.3,
    'learning_rate': 0.001,
    'batch_size': 8,
    'epochs': 10
}

print("Creating datasets and data loaders...")

# Each dataset is constructed with the default load_in_memory=True, so each
# one reads DATA_PATH in its constructor.
train_dataset = MemoryEfficientFigmaDataset(DATA_PATH, train_seq_ids, label_encoder)
val_dataset = MemoryEfficientFigmaDataset(DATA_PATH, val_seq_ids, label_encoder)

# Training batches are reshuffled every epoch; loading happens in the main
# process (num_workers=0) without pinned host memory.
train_loader = DataLoader(
    train_dataset,
    batch_size=model_config['batch_size'],
    shuffle=True,
    collate_fn=memory_efficient_collate_fn,
    num_workers=0,
    pin_memory=False
)

# Validation keeps a fixed order (shuffle=False).
val_loader = DataLoader(
    val_dataset,
    batch_size=model_config['batch_size'],
    shuffle=False,
    collate_fn=memory_efficient_collate_fn,
    num_workers=0,
    pin_memory=False
)

print(f"Data loaders created:")
print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print_memory_usage()

In [None]:
# Smoke test: pull a single training batch and print its tensor shapes.
print("\nTesting data loader...")
try:
    test_batch = next(iter(train_loader))
    print(f"Batch features shape: {test_batch['features'].shape}")
    print(f"Batch labels shape: {test_batch['labels'].shape}")
    print("Data loader test successful!")
    # Drop the batch right away so it does not stay resident.
    del test_batch
    gc.collect()
except Exception as e:
    # Any failure is reported but not re-raised, so later cells still run.
    print(f"Data loader test failed: {e}")

print_memory_usage()

In [None]:
# Instantiate the tagger with the dimensions returned by load_and_split_data
# and the hyperparameters from model_config.
print("Building model...")
model = OptimizedFigmaBLSTM(
    input_dim=input_dim,
    hidden_dim=model_config['hidden_dim'],
    output_dim=num_classes,
    num_layers=model_config['num_layers'],
    dropout=model_config['dropout']
)

model = model.to(device)
# Parameter counts: all parameters vs. those with requires_grad=True.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model built:")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print_memory_usage()

In [None]:
# ignore_index=-100 matches the value the collate function writes into padded
# label positions, so padding does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.Adam(model.parameters(), lr=model_config['learning_rate'])
# mode='min', factor=0.5, patience=2: the scheduler multiplies the learning rate
# by 0.5 once the value passed to scheduler.step() has stopped decreasing for
# more than `patience` consecutive calls.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# Lowest validation loss seen so far; starts at +inf so any finite loss improves on it.
best_val_loss = float('inf')
# Per-epoch metric history.
history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}
# Directory for checkpoints; created if it does not exist.
output_dir = './models'
os.makedirs(output_dir, exist_ok=True)

print("Training setup completed")
print_memory_usage()

In [None]:
def safe_training_step(model, batch, criterion, optimizer, device):
    """Run one mixed-precision optimisation step on ``batch``.

    Uses the module-level ``scaler`` for loss scaling and clips the gradient
    norm to 1.0 before the optimizer step.

    Returns:
        float | None: The batch loss, or ``None`` when the step hit an
        "out of memory" ``RuntimeError`` (the CUDA cache is emptied in that
        case). Any other ``RuntimeError`` is re-raised.
    """
    try:
        inputs = batch['features'].to(device)
        targets = batch['labels'].to(device)
        seq_lengths = batch['lengths']

        optimizer.zero_grad()

        with autocast():
            logits = model(inputs, seq_lengths)
            # Collapse (batch, time) into one axis so each position is scored
            # as an independent sample.
            _, _, n_classes = logits.size()
            flat_logits = logits.reshape(-1, n_classes)
            flat_targets = targets.reshape(-1)
            loss = criterion(flat_logits, flat_targets)

        scaler.scale(loss).backward()
        # Gradients must be unscaled before clipping so the 1.0 threshold
        # applies to their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        del inputs, targets, logits, flat_logits, flat_targets
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return loss.item()

    except RuntimeError as e:
        if "out of memory" not in str(e):
            raise e

        print(f"WARNING: Out of memory error occurred: {e}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return None

def safe_validation_step(model, batch, criterion, device):
    """Evaluate one batch under mixed precision without tracking gradients.

    Returns:
        tuple: ``(loss, correct, total)`` where ``loss`` is the batch loss,
        ``correct`` the number of correctly predicted non-padding positions and
        ``total`` the number of non-padding positions. On an "out of memory"
        ``RuntimeError`` the CUDA cache is emptied and ``(None, 0, 0)`` is
        returned; any other ``RuntimeError`` is re-raised.
    """
    try:
        with torch.no_grad():
            features = batch['features'].to(device)
            labels = batch['labels'].to(device)
            lengths = batch['lengths']

            with autocast():
                outputs = model(features, lengths)
                batch_size, seq_len, num_classes = outputs.size()
                outputs_flat = outputs.reshape(-1, num_classes)
                labels_flat = labels.reshape(-1)
                loss = criterion(outputs_flat, labels_flat)

            # Padded positions carry the label -100 and are excluded from accuracy.
            mask = (labels_flat != -100)
            correct = 0
            total = 0

            if mask.sum() > 0:
                predicted = torch.argmax(outputs_flat[mask], dim=1)
                correct = (predicted == labels_flat[mask]).sum().item()
                total = mask.sum().item()

            del features, labels, outputs, outputs_flat, labels_flat
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            return loss.item(), correct, total

    except RuntimeError as e:
        if "out of memory" in str(e):
            print(f"WARNING: Validation OOM error: {e}")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            return None, 0, 0
        else:
            raise


print("Starting training...")
print("=" * 50)

for epoch in range(model_config['epochs']):
    mem_usage = print_memory_usage()
    if mem_usage > MEMORY_LIMIT_GB:
        print(f"WARNING: Memory usage ({mem_usage:.2f} GB) approaching limit!")
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # ---- Training phase ----
    model.train()
    train_loss = 0.0
    train_batches = 0
    successful_batches = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{model_config['epochs']} [Train]")

    for batch_idx, batch in enumerate(progress_bar):
        loss = safe_training_step(model, batch, criterion, optimizer, device)

        if loss is not None:
            train_loss += loss
            successful_batches += 1
            progress_bar.set_postfix({'loss': loss, 'mem': f'{print_memory_usage():.1f}GB'})
        else:
            print(f"Skipped batch {batch_idx} due to memory error")

        train_batches += 1

        if batch_idx % 10 == 0:
            gc.collect()

    if successful_batches > 0:
        # Average over the batches that actually produced a loss.
        avg_train_loss = train_loss / successful_batches
        history['train_loss'].append(avg_train_loss)
        print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f} ({successful_batches}/{train_batches} successful batches)")
    else:
        print(f"Epoch {epoch+1} - No successful training batches!")
        break

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # ---- Validation phase ----
    # This must live inside the epoch loop: it drives checkpointing, the LR
    # scheduler and the validation history for every epoch.
    model.eval()
    val_loss = 0.0
    val_batches = 0
    successful_val_batches = 0
    total_correct = 0
    total_samples = 0

    progress_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{model_config['epochs']} [Val]")

    for batch_idx, batch in enumerate(progress_bar):
        loss, correct, samples = safe_validation_step(model, batch, criterion, device)

        if loss is not None:
            val_loss += loss
            total_correct += correct
            total_samples += samples
            successful_val_batches += 1

            current_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0
            progress_bar.set_postfix({
                'loss': loss,
                'acc': f'{current_acc:.1f}%',
                'mem': f'{print_memory_usage():.1f}GB'
            })

        val_batches += 1

        if batch_idx % 5 == 0:
            gc.collect()

    if successful_val_batches > 0:
        avg_val_loss = val_loss / successful_val_batches
        val_accuracy = total_correct / total_samples if total_samples > 0 else 0

        history['val_loss'].append(avg_val_loss)
        history['val_accuracy'].append(val_accuracy)

        print(f"Epoch {epoch+1} - Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")

        if avg_val_loss < best_val_loss:
            # Keep only the checkpoint with the lowest validation loss; it
            # bundles everything needed to rebuild the model for inference.
            best_val_loss = avg_val_loss
            model_path = os.path.join(output_dir, "figma_blstm_model.pt")

            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': best_val_loss,
                'label_encoder': label_encoder,
                'model_config': model_config,
                'input_dim': input_dim,
                'num_classes': num_classes
            }, model_path)
            print(f"  Best model saved to {model_path}")

        scheduler.step(avg_val_loss)

    else:
        print(f"Epoch {epoch+1} - No successful validation batches!")

    print("-" * 50)

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("Training completed!")
print("=" * 50)
print_memory_usage()

In [None]:
def load_trained_model(model_path, device):
    """Rebuild the tagger from a checkpoint file and put it in eval mode.

    Args:
        model_path: Path of the checkpoint to read.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        tuple: ``(model, label_encoder)`` restored from the checkpoint.
    """
    print(f"Loading model from {model_path}...")

    # NOTE(review): weights_only=False unpickles arbitrary Python objects (the
    # checkpoint stores the label encoder object). Only load checkpoints from
    # a trusted source.
    ckpt = torch.load(model_path, map_location=device, weights_only=False)

    # Pull every required entry up front so a malformed checkpoint fails
    # before any model is constructed.
    hparams, in_dim, n_classes, encoder = (
        ckpt[key] for key in ('model_config', 'input_dim', 'num_classes', 'label_encoder')
    )

    network = OptimizedFigmaBLSTM(
        input_dim=in_dim,
        hidden_dim=hparams['hidden_dim'],
        output_dim=n_classes,
        num_layers=hparams['num_layers'],
        dropout=hparams['dropout']
    )
    network.load_state_dict(ckpt['model_state_dict'])
    network = network.to(device)
    network.eval()

    print("Model loaded successfully!")
    return network, encoder

def predict_batch_safe(model, data_loader, label_encoder, device, max_batches=None):
    """Predict tags for every sequence yielded by ``data_loader``.

    Batches that raise an "out of memory" ``RuntimeError`` are skipped; any
    other ``RuntimeError`` propagates.

    Args:
        model: Callable taking ``(features, lengths)`` and returning logits of
            shape ``(batch, time, classes)``.
        data_loader: Iterable of batches with keys ``'features'``, ``'labels'``,
            ``'lengths'`` and ``'seq_ids'``.
        label_encoder: Object whose ``classes_`` maps a class index to its tag.
        device: Device the features are moved to.
        max_batches: If truthy, stop after this many successfully processed batches.

    Returns:
        dict: ``seq_id -> {'predicted_tags': [...], 'true_tags': [...]}``, each
        list limited to the sequence's length. Indices outside
        ``label_encoder.classes_`` are rendered as ``"UNKNOWN"``.
    """
    def to_tag(index):
        # Any index outside the known classes (including the -100 padding
        # value) becomes "UNKNOWN".
        known = label_encoder.classes_
        return known[index] if 0 <= index < len(known) else "UNKNOWN"

    results = {}
    model.eval()
    processed_batches = 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(data_loader, desc="Predicting")):
            try:
                inputs = batch['features'].to(device)
                gold = batch['labels']
                lengths = batch['lengths']
                seq_ids = batch['seq_ids']

                logits = model(inputs, lengths)
                predicted = torch.argmax(logits, dim=2)

                for row, seq_id in enumerate(seq_ids):
                    n_valid = lengths[row].item()
                    pred_indices = predicted[row, :n_valid].cpu().numpy()
                    true_indices = gold[row, :n_valid].cpu().numpy()

                    results[seq_id] = {
                        'predicted_tags': [to_tag(i) for i in pred_indices],
                        'true_tags': [to_tag(i) for i in true_indices]
                    }

                del inputs, logits, predicted
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                processed_batches += 1
                if max_batches and processed_batches >= max_batches:
                    break

            except RuntimeError as e:
                if "out of memory" not in str(e):
                    raise e

                print(f"Skipping batch {batch_idx} due to memory error")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                continue

            # Periodic collection; not reached for skipped batches.
            if batch_idx % 5 == 0:
                gc.collect()

    return results

# Load the checkpoint written by the training loop and predict on the
# validation loader. On any failure `results` is set to an empty dict so the
# analysis below can still be called.
try:
    model_path = os.path.join(output_dir, "figma_blstm_model.pt")
    trained_model, trained_label_encoder = load_trained_model(model_path, device)

    print("Making predictions on validation data...")
    # max_batches=50 caps how many validation batches are scored.
    results = predict_batch_safe(
        trained_model, val_loader, trained_label_encoder, device, max_batches=50
    )

    print(f"Generated predictions for {len(results)} sequences")
    print_memory_usage()

except FileNotFoundError:
    # No checkpoint file at model_path.
    print("No trained model found. Please run the training cells first.")
    results = {}
except Exception as e:
    # NOTE(review): every other error is reported by message only and not re-raised.
    print(f"Error during prediction: {e}")
    results = {}

def analyze_results(results, max_sequences_to_show=3):
    """Print an accuracy report for a dict of per-sequence predictions.

    Args:
        results: Mapping ``seq_id -> {'true_tags': [...], 'predicted_tags': [...]}``.
        max_sequences_to_show: Number of sequences listed in the sample section
            (the first 10 positions of each are shown).

    The report contains overall and mean per-sequence accuracy, per-class
    accuracy keyed by the true tag, and sample predictions. Nothing is
    returned; an empty ``results`` prints a short notice and exits early.
    """
    if not results:
        print("No results to analyze.")
        return

    heavy_rule = "=" * 50
    light_rule = "-" * 30

    def count_matches(expected, predicted):
        # Pairs are compared position by position.
        return sum(1 for t, p in zip(expected, predicted) if t == p)

    print(heavy_rule)
    print("EVALUATION RESULTS")
    print(heavy_rule)

    # ---- Aggregate over all sequences ----
    flat_true = []
    flat_pred = []
    per_sequence_accuracy = []
    correct_predictions = 0
    total_predictions = 0

    for entry in results.values():
        expected = entry['true_tags']
        predicted = entry['predicted_tags']

        flat_true.extend(expected)
        flat_pred.extend(predicted)

        hits = count_matches(expected, predicted)
        size = len(expected)
        per_sequence_accuracy.append(hits / size if size > 0 else 0)

        correct_predictions += hits
        total_predictions += size

    if total_predictions > 0:
        overall_accuracy = correct_predictions / total_predictions
    else:
        overall_accuracy = 0

    if per_sequence_accuracy:
        avg_sequence_accuracy = sum(per_sequence_accuracy) / len(per_sequence_accuracy)
    else:
        avg_sequence_accuracy = 0

    print(f"Overall Accuracy: {overall_accuracy:.4f} ({overall_accuracy*100:.2f}%)")
    print(f"Average Sequence Accuracy: {avg_sequence_accuracy:.4f} ({avg_sequence_accuracy*100:.2f}%)")
    print(f"Total Predictions: {total_predictions}")
    print(f"Correct Predictions: {correct_predictions}")
    print(f"Number of Sequences: {len(results)}")

    # ---- Per-class breakdown ----
    print("\n" + light_rule)
    print("PER-CLASS ACCURACY")
    print(light_rule)

    # true tag -> [hits, occurrences]
    tally = {}
    for true_tag, pred_tag in zip(flat_true, flat_pred):
        record = tally.setdefault(true_tag, [0, 0])
        record[1] += 1
        if true_tag == pred_tag:
            record[0] += 1

    for class_name in sorted(tally):
        correct, total = tally[class_name]
        accuracy = correct / total if total > 0 else 0
        print(f"{class_name:>15}: {accuracy:.4f} ({accuracy*100:.2f}%) - {correct}/{total}")

    # ---- Sample listing ----
    print("\n" + heavy_rule)
    print("SAMPLE PREDICTIONS")
    print(heavy_rule)

    for shown, (seq_id, entry) in enumerate(results.items()):
        if shown >= max_sequences_to_show:
            break

        expected = entry['true_tags']
        predicted = entry['predicted_tags']

        hits = count_matches(expected, predicted)
        seq_accuracy = hits / len(expected) if expected else 0

        print(f"\nSequence {shown + 1} (ID: {seq_id}):")
        print(f"Sequence Accuracy: {seq_accuracy:.4f} ({seq_accuracy*100:.2f}%)")
        print(f"Correct/Total: {hits}/{len(expected)}")

        print("Predictions:")
        for j in range(min(10, len(expected))):
            marker = "✓" if expected[j] == predicted[j] else "✗"
            print(f"  {marker} Node {j+1}: True={expected[j]}, Predicted={predicted[j]}")

        hidden = len(expected) - 10
        if hidden > 0:
            print(f"  ... and {hidden} more predictions")

    print("\n" + "="*50)
    print("MODEL TRAINING AND EVALUATION COMPLETED!")
    print("="*50)

# Print the evaluation report for the predictions gathered above (lists the
# first 3 sequences as samples), then report memory.
analyze_results(results, max_sequences_to_show=3)
print_memory_usage()

In [None]:
def analyze_sequence_lengths(data_path, max_sequences=None):
    """Print length statistics of the sequences stored in ``data_path``.

    Sequences are delimited exactly as elsewhere in this notebook: a record
    tagged ``'E_WEBSITE'`` closes the current sequence, and records left after
    the last end tag count as one final sequence. Only the ``tag`` column is
    read, because feature vectors are not needed to measure lengths.

    Args:
        data_path: Path to a ``.csv``, ``.h5`` or ``.hdf5`` file with a ``tag``
            column/dataset.
        max_sequences: If truthy, stop after this many complete sequences.

    Raises:
        ValueError: If the file extension is not .csv, .h5 or .hdf5.
    """
    print(f"Analyzing sequence lengths from {data_path}...")

    if data_path.endswith('.csv'):
        tags = pd.read_csv(data_path, usecols=['tag'])['tag'].tolist()
    elif data_path.endswith(('.hdf5', '.h5')):
        with h5py.File(data_path, 'r') as f:
            tags = [s.decode('utf-8') for s in f['tag'][:]]
    else:
        raise ValueError(
            f"Unsupported data format: {data_path!r} (expected .csv, .h5 or .hdf5)"
        )

    sequence_lengths = []
    current_length = 0
    for tag in tags:
        current_length += 1
        if tag == 'E_WEBSITE':
            sequence_lengths.append(current_length)
            current_length = 0
            if max_sequences and len(sequence_lengths) >= max_sequences:
                break

    if current_length:
        # Trailing records without a closing end tag.
        sequence_lengths.append(current_length)

    print(f"Number of sequences: {len(sequence_lengths)}")

    if not sequence_lengths:
        # min()/max()/percentile are undefined for an empty list.
        print("No sequences found; nothing to analyze.")
    else:
        print(f"Min sequence length: {min(sequence_lengths)}")
        print(f"Max sequence length: {max(sequence_lengths)}")
        print(f"Mean sequence length: {np.mean(sequence_lengths):.2f}")
        print(f"Median sequence length: {np.median(sequence_lengths)}")
        print(f"95th percentile: {np.percentile(sequence_lengths, 95)}")
        print(f"99th percentile: {np.percentile(sequence_lengths, 99)}")

    del tags, sequence_lengths
    gc.collect()
    
# Analyze sequence lengths to help choose MAX_SEQ_LENGTH in the collate function
# (comment this call out to skip the extra pass over the data file).
analyze_sequence_lengths(DATA_PATH, max_sequences=1000)