In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from sklearn.model_selection import KFold
from rouge import Rouge
import gc
import os
import warnings
import json  # Added for saving results

# psutil is an optional dependency. HAS_PSUTIL records whether the import
# succeeded; the memory helpers in this file check it before touching psutil.
try:
    import psutil
    HAS_PSUTIL = True
except ImportError:
    print("Warning: psutil not installed. Basic memory monitoring will be used instead.")
    HAS_PSUTIL = False

# Hyperparameters
BATCH_SIZE = 8          # default mini-batch size handed to create_tf_dataset
EMBEDDING_DIM = 768     # last-axis size of both model inputs and of the output Dense layer
LSTM_UNITS = 128        # encoder LSTM units per direction (halved by build_model when low_memory=True)
DENSE_UNITS = 64        # width of the ReLU Dense layer after the decoder (halved when low_memory=True)
DROPOUT_RATE = 0.3      # rate of the Dropout layer between the two decoder Dense layers
LEARNING_RATE = 1e-4    # learning rate given to the Adam optimizer
EPOCHS = 3              # passes over the training indices per fold
K_FOLDS = 5             # n_splits for KFold cross-validation
MAX_TITLE_LEN = 30      # titles are truncated to this many positions before use
MAX_SEQ_LEN = 512       # NOTE(review): not referenced in the visible code; the encoder length is read from the data file

# Process memory probe used by the reporting and cleanup helpers
def get_memory_usage():
    """Return this process's resident set size in megabytes.

    Returns:
        The RSS reported by psutil for the current PID divided by 1024 * 1024,
        or 0 when psutil could not be imported.
    """
    if not HAS_PSUTIL:
        return 0

    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)  # bytes -> MB

def print_memory_usage(label="Current"):
    """Print one line describing the current process memory usage.

    Args:
        label: Text placed in front of "memory usage:" in the printed line.
    """
    if HAS_PSUTIL:
        reading = f"{get_memory_usage():.2f} MB"
    else:
        reading = "Not available (psutil not installed)"
    print(f"{label} memory usage: {reading}")

# Memory cleanup function
def clean_memory():
    """Force a garbage-collection pass, reset Keras' session state, and report memory.

    Prints one "After cleanup memory usage: ..." line via print_memory_usage.
    """
    gc.collect()
    # NOTE(review): callers in this file invoke clean_memory() between fit()
    # chunks and between evaluation batches, i.e. while the model is still in
    # use, so clear_session() runs with a live model - confirm that is intended.
    tf.keras.backend.clear_session()
    print_memory_usage("After cleanup")

# Keras callback that reports memory after each epoch and cleans up on demand
class MemoryCleanupCallback(tf.keras.callbacks.Callback):
    """Report process memory at the end of every epoch and clean up past a limit.

    Args:
        memory_threshold_mb: Memory level in MB above which clean_memory() is
            called. A falsy value (None or 0) disables the threshold check.
    """

    def __init__(self, memory_threshold_mb=None):
        super().__init__()
        self.memory_threshold_mb = memory_threshold_mb

    def _report_and_maybe_clean(self, epoch):
        """Print the current usage and run clean_memory() if the limit is exceeded."""
        used_mb = get_memory_usage()
        print(f"Memory usage after epoch {epoch+1}: {used_mb:.2f} MB")

        limit_mb = self.memory_threshold_mb
        if limit_mb and used_mb > limit_mb:
            print(f"Memory usage ({used_mb:.2f} MB) exceeded threshold ({limit_mb} MB), cleaning up...")
            clean_memory()

    def on_epoch_end(self, epoch, logs=None):
        # Memory figures are only meaningful when psutil is importable.
        if HAS_PSUTIL:
            self._report_and_maybe_clean(epoch)

        # A plain garbage-collection pass runs after every epoch regardless.
        gc.collect()

# Multi-head attention wrapper
class MultiHeadAttentionWrapper(layers.Layer):
    """Multi-head attention that also accepts a rank-2 query.

    A query without a sequence axis has one inserted at axis 1 before it is
    passed, together with ``value``, to the wrapped MultiHeadAttention layer.

    Args:
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        dropout: Dropout rate given to the wrapped attention layer.
        **kwargs: Standard Keras layer arguments (name, dtype, trainable, ...).
            Forwarding them is required so the layer can be rebuilt from the
            configuration stored in a saved model.
    """

    def __init__(self, num_heads=4, key_dim=64, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.dropout_rate = dropout
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=key_dim,
            dropout=dropout
        )

    def call(self, query, value):
        # Attention expects a sequence axis; add one when the query has none.
        if len(query.shape) == 2:
            query = tf.expand_dims(query, axis=1)
        return self.attention(query, value)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update({
            "num_heads": self.num_heads,
            "key_dim": self.key_dim,
            "dropout": self.dropout_rate,
        })
        return config

# Build the model
def build_model(max_seq_len, max_title_len, low_memory=False):
    """Assemble the encoder/decoder Keras model that maps text embeddings to title embeddings.

    Args:
        max_seq_len: Number of positions in the encoder input.
        max_title_len: Title length; the decoder input has ``max_title_len - 1`` positions.
        low_memory: When True, LSTM and Dense widths are halved, the attention
            block is skipped, and the residual connection to the decoder
            input is omitted.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and
        producing one EMBEDDING_DIM-sized vector per decoder position.
    """
    lstm_units = LSTM_UNITS // 2 if low_memory else LSTM_UNITS
    dense_units = DENSE_UNITS // 2 if low_memory else DENSE_UNITS
    
    # Encoder
    encoder_inputs = layers.Input(shape=(max_seq_len, EMBEDDING_DIM))
    x = layers.LayerNormalization()(encoder_inputs)

    encoder_lstm = layers.Bidirectional(
        layers.LSTM(
            lstm_units,
            return_sequences=True,
            return_state=True,
            dropout=0.1,
            kernel_regularizer=tf.keras.regularizers.l2(0.01)
        )
    )
    # The cell states (forward_c, backward_c) are unpacked but not used below.
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(x)
    # Final hidden states of both directions joined into one summary vector.
    state_h = layers.Concatenate()([forward_h, backward_h])

    # Attention
    if not low_memory:
        # The encoder summary is the query; the per-position encoder outputs are the values.
        attention = MultiHeadAttentionWrapper(num_heads=2, key_dim=32)
        context_vector = attention(state_h, encoder_outputs)
        # Residual connection: add the summary (with an axis inserted at position 1)
        # back onto the attention output.
        context_vector = layers.Lambda(
            lambda inputs: inputs[0] + tf.expand_dims(inputs[1], axis=1)
        )([context_vector, state_h])
        context_vector = layers.LayerNormalization()(context_vector)
    else:
        # No attention: the summary itself, given the same extra axis, is the context.
        context_vector = layers.Lambda(lambda x: tf.expand_dims(x, axis=1))(state_h)

    # Decoder
    decoder_inputs = layers.Input(shape=(max_title_len-1, EMBEDDING_DIM))
    y = layers.LayerNormalization()(decoder_inputs)

    # Twice the encoder's per-direction width - presumably so the decoder state
    # matches the concatenated forward/backward encoder state.
    decoder_lstm = layers.LSTM(
        lstm_units * 2,
        return_sequences=True,
        dropout=0.1,
        kernel_regularizer=tf.keras.regularizers.l2(0.01)
    )
    
    # Drop the axis added above so the context can be used as an LSTM state.
    context_squeezed = layers.Lambda(lambda x: tf.squeeze(x, axis=1))(context_vector)
    zeros_tensor = layers.Lambda(lambda x: tf.zeros_like(x))(context_squeezed)
    
    # The context is passed as the first initial state and zeros as the second
    # (for a Keras LSTM: hidden state, then cell state).
    decoder_outputs = decoder_lstm(
        y,
        initial_state=[context_squeezed, zeros_tensor]
    )

    decoder_outputs = layers.Dense(dense_units, activation='relu')(decoder_outputs)
    decoder_outputs = layers.Dropout(DROPOUT_RATE)(decoder_outputs)
    outputs = layers.Dense(EMBEDDING_DIM, activation='linear')(decoder_outputs)
    
    if not low_memory:
        # Residual connection: the projection is added to the raw decoder input.
        outputs = layers.Add()([outputs, decoder_inputs])

    return models.Model([encoder_inputs, decoder_inputs], outputs)

# Wrap in-memory arrays in a tf.data input pipeline
def create_tf_dataset(text_data, title_data, batch_size, buffer_size=1000):
    """Build a shuffled, batched, prefetching dataset of ((text, title_in), title_out).

    Args:
        text_data: Array of encoder inputs, indexed by sample on axis 0.
        title_data: 3-D array of title embeddings; axis 1 is the title position.
        batch_size: Number of samples per batch.
        buffer_size: Upper bound for the shuffle buffer; the buffer never
            exceeds the number of samples.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``((text, title[:-1]), title[1:])`` batches.
    """
    # Shift along the position axis: the input at step t is paired with the
    # title embedding at step t + 1 as its target.
    shifted_in = title_data[:, :-1, :]
    shifted_out = title_data[:, 1:, :]

    shuffle_window = min(buffer_size, len(text_data))

    return (
        tf.data.Dataset.from_tensor_slices(((text_data, shifted_in), shifted_out))
        .shuffle(buffer_size=shuffle_window)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

def _unit_rows(matrix):
    """Scale every row of a 2-D array to unit L2 norm; all-zero rows stay zero."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Dividing a zero row by its zero norm would fill it with NaN, and NaN
    # similarities make argmax pick that row/column unconditionally.
    return matrix / np.where(norms > 0, norms, 1.0)


# Convert embeddings to text
def embeddings_to_text(embeddings, vocab_data, special_tokens=None, batch_size=50):
    """Map each embedding to its nearest vocabulary token by cosine similarity.

    Args:
        embeddings: 2-D array with one embedding per row.
        vocab_data: Tuple ``(vocab_embeddings, vocab_tokens)`` where row ``i``
            of ``vocab_embeddings`` belongs to ``vocab_tokens[i]``.
        special_tokens: Mapping whose keys are tokens to drop from the output.
            Defaults to ``[PAD]``, ``[UNK]``, ``[CLS]`` and ``[SEP]``.
        batch_size: Number of embeddings compared against the vocabulary at once.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        there are no embeddings or every token was filtered out.
    """
    if special_tokens is None:
        special_tokens = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}

    vocab_embeddings, vocab_tokens = vocab_data

    # The vocabulary does not change between batches, so normalize it once.
    norm_vocab = _unit_rows(vocab_embeddings)

    kept_tokens = []
    for start in range(0, len(embeddings), batch_size):
        norm_embs = _unit_rows(embeddings[start:start + batch_size])

        # Rows are unit length, so the dot product is the cosine similarity.
        similarities = np.dot(norm_embs, norm_vocab.T)
        nearest_indices = np.argmax(similarities, axis=1)

        nearest_tokens = (vocab_tokens[idx] for idx in nearest_indices)
        kept_tokens.extend(t for t in nearest_tokens if t not in special_tokens)

        del similarities, nearest_indices, norm_embs
        gc.collect()

    # Joining once over all batches avoids stray spaces from batches that
    # contained only special tokens.
    return ' '.join(kept_tokens)

def _zero_rouge_scores():
    """Return a fresh all-zero ROUGE-1/ROUGE-2 score dictionary."""
    return {
        'rouge-1': {'f': 0, 'p': 0, 'r': 0},
        'rouge-2': {'f': 0, 'p': 0, 'r': 0}
    }


# Modified evaluate_rouge_scores function with improved logging
def evaluate_rouge_scores(model, data_files, vocab_data, indices=None, num_samples=50, prefix=""):
    """Score model output against reference titles with ROUGE-1 and ROUGE-2.

    Both the generated embeddings and the reference title embeddings are
    converted to text with embeddings_to_text before scoring.

    Args:
        model: Model whose ``predict`` accepts ``[text, decoder_input]``.
        data_files: Tuple ``(text_file, title_file)`` of ``.npy`` paths, opened
            memory-mapped.
        vocab_data: Tuple ``(vocab_embeddings, vocab_tokens)``, or None to skip
            evaluation entirely.
        indices: Candidate sample indices. None means "draw from the whole file".
        num_samples: Maximum number of samples to score.
        prefix: Label placed in front of the printed progress/result headers.

    Returns:
        ``(avg_scores, examples)``. ``avg_scores`` maps ``'rouge-1'`` and
        ``'rouge-2'`` to dictionaries with mean ``'f'``, ``'p'`` and ``'r'``;
        ``examples`` holds up to five scored reference/generated pairs. When
        evaluation is skipped or nothing could be scored, the scores are all
        zero and ``examples`` is empty.
    """
    # Skip ROUGE evaluation if vocab_data is None
    if vocab_data is None:
        print(f"\n{prefix} ROUGE evaluation skipped - vocabulary files not available")
        return _zero_rouge_scores(), []

    text_file, title_file = data_files

    rouge = Rouge(metrics=['rouge-1', 'rouge-2'])

    text_data = np.load(text_file, mmap_mode='r')
    title_data = np.load(title_file, mmap_mode='r')

    if indices is None:
        # len() reads the sample count from the array shape; no need to
        # iterate over every row. Clamp so sampling without replacement
        # cannot ask for more samples than exist.
        total_samples = len(text_data)
        indices = np.random.choice(total_samples, min(num_samples, total_samples), replace=False)
    elif len(indices) > num_samples:
        indices = np.random.choice(indices, num_samples, replace=False)

    print(f"\n{prefix} Evaluating ROUGE-1 and ROUGE-2 scores on {len(indices)} samples...")

    all_scores = []
    examples = []

    # NOTE(review): the decoder is fed all-zero inputs here, whereas
    # create_tf_dataset feeds it the shifted title embeddings during training.
    # Confirm that this train/inference mismatch is intended.
    decoder_input = np.zeros((1, MAX_TITLE_LEN-1, EMBEDDING_DIM))

    batch_size = 5
    for start in range(0, len(indices), batch_size):
        stop = min(start + batch_size, len(indices))
        print(f"Processing samples {start}-{stop}/{len(indices)}")

        batch_indices = indices[start:stop]
        batch_text = np.array(text_data[batch_indices])
        # Truncate while reading so only the first MAX_TITLE_LEN positions
        # of each title are copied out of the memory-mapped file.
        batch_titles = np.array(title_data[batch_indices, :MAX_TITLE_LEN, :])

        for j in range(len(batch_indices)):
            input_text = batch_text[j:j+1]

            # Best effort per sample: one failing sample must not abort the run.
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    generated_emb = model.predict([input_text, decoder_input], verbose=0)

                reference = embeddings_to_text(batch_titles[j], vocab_data)
                generated = embeddings_to_text(generated_emb[0], vocab_data)
            except Exception as e:
                print(f"Error in prediction: {e}")
                continue

            # Skip pairs where either side decoded to nothing.
            if not generated.strip() or not reference.strip():
                continue

            try:
                scores = rouge.get_scores(generated, reference)[0]
            except Exception as e:
                print(f"Error calculating ROUGE: {e}")
                continue

            all_scores.append(scores)
            if len(examples) < 5:
                examples.append({
                    'reference': reference,
                    'generated': generated,
                    'scores': scores
                })

        del batch_text, batch_titles
        clean_memory()

    if not all_scores:
        print("Warning: No valid ROUGE scores calculated!")
        return _zero_rouge_scores(), []

    # Plain floats keep the result directly JSON-serializable.
    avg_scores = {
        metric: {
            stat: float(np.mean([s[metric][stat] for s in all_scores]))
            for stat in ('f', 'p', 'r')
        }
        for metric in ('rouge-1', 'rouge-2')
    }

    # Print in a very clear, obvious way
    print("\n" + "="*50)
    print(f"{prefix} ROUGE SCORE RESULTS:")
    print("="*50)
    print(f"ROUGE-1 F1: {avg_scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-1 Precision: {avg_scores['rouge-1']['p']:.4f}")
    print(f"ROUGE-1 Recall: {avg_scores['rouge-1']['r']:.4f}")
    print("-"*50)
    print(f"ROUGE-2 F1: {avg_scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-2 Precision: {avg_scores['rouge-2']['p']:.4f}")
    print(f"ROUGE-2 Recall: {avg_scores['rouge-2']['r']:.4f}")
    print("="*50)

    print("\nExample Generations:")
    for i, example in enumerate(examples):
        print(f"\nExample {i+1}:")
        print(f"Reference: {example['reference']}")
        print(f"Generated: {example['generated']}")
        print(f"ROUGE-1 F1: {example['scores']['rouge-1']['f']:.4f}")
        print(f"ROUGE-2 F1: {example['scores']['rouge-2']['f']:.4f}")

    return avg_scores, examples

def _load_vocabulary(create_test_vocab=False):
    """Load the vocabulary used for ROUGE decoding, or return None if it is missing.

    When the files are absent and ``create_test_vocab`` is True, a random
    1000-token vocabulary is generated, saved to the two vocabulary paths,
    and returned instead.
    """
    try:
        vocab_embeddings = np.load('vocab_embeddings.npy')
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load a
        # vocab_tokens.npy that comes from a trusted source.
        vocab_tokens = np.load('vocab_tokens.npy', allow_pickle=True)
    except FileNotFoundError:
        print("WARNING: Vocabulary files not found. ROUGE evaluation will be skipped.")
        print("Please ensure that vocab_embeddings.npy and vocab_tokens.npy exist in the current directory.")
        print("These files are required for calculating ROUGE scores.")
        print("Training will continue without ROUGE evaluation.")
    else:
        print(f"Loaded vocabulary with {len(vocab_tokens)} tokens")
        return vocab_embeddings, vocab_tokens

    if not create_test_vocab:
        return None

    print("Creating minimal vocabulary files for testing purposes")
    test_vocab_size = 1000
    vocab_embeddings = np.random.randn(test_vocab_size, EMBEDDING_DIM).astype(np.float32)
    vocab_tokens = np.array([f"token_{i}" for i in range(test_vocab_size)], dtype=object)

    np.save('vocab_embeddings.npy', vocab_embeddings)
    np.save('vocab_tokens.npy', vocab_tokens)

    print(f"Created test vocabulary with {len(vocab_tokens)} tokens")
    return vocab_embeddings, vocab_tokens


def _train_one_epoch(model, text_file, title_file, train_idx, batch_size, memory_threshold_mb, chunk_size=500):
    """Fit ``model`` once over ``train_idx``, loading ``chunk_size`` samples at a time."""
    np.random.shuffle(train_idx)

    # Ceiling division, so an exact multiple of chunk_size is not over-counted.
    num_chunks = (len(train_idx) + chunk_size - 1) // chunk_size

    for chunk_number, chunk_start in enumerate(range(0, len(train_idx), chunk_size), start=1):
        chunk_end = min(chunk_start + chunk_size, len(train_idx))
        chunk_indices = train_idx[chunk_start:chunk_end]

        print(f"Training on chunk {chunk_number}/{num_chunks} (samples {chunk_start}-{chunk_end-1})")

        # The files are re-opened memory-mapped for every chunk and only the
        # selected rows are copied into RAM.
        X_train_chunk = np.array(np.load(text_file, mmap_mode='r')[chunk_indices])
        y_train_chunk = np.array(np.load(title_file, mmap_mode='r')[chunk_indices, :MAX_TITLE_LEN, :])

        train_dataset = create_tf_dataset(X_train_chunk, y_train_chunk, batch_size)

        model.fit(
            train_dataset,
            epochs=1,
            verbose=1,
            callbacks=[MemoryCleanupCallback(memory_threshold_mb)]
        )

        del X_train_chunk, y_train_chunk, train_dataset
        clean_memory()


def _save_json(path, payload):
    """Write ``payload`` to ``path`` as indented JSON."""
    with open(path, "w") as f:
        json.dump(payload, f, indent=4)


def _save_fold_model(model, fold_number):
    """Save the fold's model as .keras, falling back to .h5; failures are only reported."""
    try:
        model.save(f"nepali_headline_generator_fold_{fold_number}.keras")
        print(f"Model for fold {fold_number} saved successfully.")
    except Exception as e:
        print(f"Error saving model: {e}")
        try:
            model.save(f"nepali_headline_generator_fold_{fold_number}.h5")
            print("Model saved in h5 format instead.")
        except Exception as e2:
            print(f"Could not save model in any format: {e2}")


# Modified training function with improved ROUGE tracking
def train_with_memory_monitoring(memory_threshold_mb=8000, create_test_vocab=False):
    """Run K-fold training of the headline model with chunked loading and memory reports.

    Reads ``text_embeddings.npy`` and ``title_embeddings.npy`` from the
    current directory. For every fold it builds a fresh model, trains it for
    EPOCHS epochs in 500-sample chunks, evaluates ROUGE after each epoch and
    at the end of the fold, and saves the model. ROUGE results are written to
    JSON files only when a vocabulary is available.

    Any exception raised during the run is caught, printed with its
    traceback, and not re-raised.

    Args:
        memory_threshold_mb: Threshold in MB passed to MemoryCleanupCallback.
        create_test_vocab: When True and the vocabulary files are missing,
            generate and save a random test vocabulary instead of skipping
            ROUGE evaluation.
    """
    print("Starting training with memory monitoring")
    print_memory_usage("Initial")

    use_low_memory_mode = False
    if HAS_PSUTIL:
        total_memory = psutil.virtual_memory().total / (1024 * 1024)
        use_low_memory_mode = total_memory < 16000

    # A local batch size is used so the module-level BATCH_SIZE constant is
    # not silently rewritten for later runs.
    batch_size = BATCH_SIZE
    if use_low_memory_mode:
        print("LOW MEMORY MODE ENABLED: Using simplified model architecture")
        batch_size = min(BATCH_SIZE, 4)

    # Dictionary to store all ROUGE scores
    all_rouge_scores = {
        "epochs": {},
        "folds": {}
    }

    try:
        text_file = 'text_embeddings.npy'
        title_file = 'title_embeddings.npy'

        text_size_mb = os.path.getsize(text_file) / (1024 * 1024)
        title_size_mb = os.path.getsize(title_file) / (1024 * 1024)
        print(f"Text embeddings file size: {text_size_mb:.2f} MB")
        print(f"Title embeddings file size: {title_size_mb:.2f} MB")

        text_sample = np.load(text_file, mmap_mode='r')[:1]
        title_sample = np.load(title_file, mmap_mode='r')[:1]

        # The encoder length comes from the data, not from a constant.
        max_seq_len = text_sample.shape[1]

        print(f"Text embedding dimensions: {text_sample.shape}")
        print(f"Title embedding dimensions: {title_sample.shape}")

        # Training continues without ROUGE evaluation when this is None.
        vocab_data = _load_vocabulary(create_test_vocab)

        # The sample count is the length of the first axis; no iteration needed.
        total_samples = len(np.load(text_file, mmap_mode='r'))
        print(f"Total samples available: {total_samples}")

        indices = np.arange(total_samples)

        kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

        for fold, (train_idx, val_idx) in enumerate(kf.split(indices)):
            fold_number = fold + 1

            print(f"\n{'='*50}")
            print(f"Fold {fold_number}/{K_FOLDS}")
            print(f"{'='*50}")
            print_memory_usage("Before fold training")

            print(f"Training samples: {len(train_idx)}")
            print(f"Validation samples: {len(val_idx)}")

            model = build_model(max_seq_len, MAX_TITLE_LEN, low_memory=use_low_memory_mode)
            model.compile(
                optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
                loss='mse'
            )

            model.summary()

            # Dictionary to store epoch scores for this fold
            epoch_scores = {}

            for epoch in range(EPOCHS):
                epoch_number = epoch + 1
                print(f"\nEpoch {epoch_number}/{EPOCHS}")

                _train_one_epoch(
                    model, text_file, title_file, train_idx, batch_size, memory_threshold_mb
                )

                # Quick per-epoch check on a small random validation subset.
                print(f"\nEvaluating ROUGE scores for epoch {epoch_number}...")
                val_subset_idx = np.random.choice(val_idx, min(50, len(val_idx)), replace=False)
                epoch_rouge_scores, _ = evaluate_rouge_scores(
                    model,
                    (text_file, title_file),
                    vocab_data,  # This can be None, evaluate_rouge_scores will handle it
                    indices=val_subset_idx,
                    num_samples=min(20, len(val_subset_idx)),
                    prefix=f"[Fold {fold_number}, Epoch {epoch_number}]"
                )

                epoch_scores[f"epoch_{epoch_number}"] = epoch_rouge_scores

                # Save scores to file after each epoch if ROUGE evaluation was performed
                if vocab_data is not None:
                    epoch_path = f"rouge_scores_fold{fold_number}_epoch{epoch_number}.json"
                    _save_json(epoch_path, epoch_rouge_scores)
                    print(f"ROUGE scores for Fold {fold_number}, Epoch {epoch_number} saved to {epoch_path}")

            # Store all epoch scores for this fold
            all_rouge_scores["epochs"][f"fold_{fold_number}"] = epoch_scores

            _save_fold_model(model, fold_number)

            # Final evaluation with more samples
            print(f"\nEvaluating FINAL ROUGE scores for fold {fold_number}...")
            fold_rouge_scores, examples = evaluate_rouge_scores(
                model,
                (text_file, title_file),
                vocab_data,  # This can be None, evaluate_rouge_scores will handle it
                indices=val_idx,
                num_samples=min(100, len(val_idx)),
                prefix=f"[FINAL FOLD {fold_number}]"
            )

            # Store the fold's final scores
            all_rouge_scores["folds"][f"fold_{fold_number}"] = fold_rouge_scores

            # Save fold scores to file if ROUGE evaluation was performed
            if vocab_data is not None:
                final_path = f"rouge_scores_fold{fold_number}_final.json"
                _save_json(final_path, fold_rouge_scores)
                print(f"Final ROUGE scores for Fold {fold_number} saved to {final_path}")

            del model
            clean_memory()

        # Calculate and save average scores across all folds if ROUGE evaluation was performed
        if vocab_data is not None:
            fold_results = list(all_rouge_scores["folds"].values())
            avg_rouge1_f1 = np.mean([scores["rouge-1"]["f"] for scores in fold_results])
            avg_rouge2_f1 = np.mean([scores["rouge-2"]["f"] for scores in fold_results])

            print("\n" + "="*60)
            print("FINAL AVERAGE ROUGE SCORES ACROSS ALL FOLDS:")
            print("="*60)
            print(f"ROUGE-1 F1: {avg_rouge1_f1:.4f}")
            print(f"ROUGE-2 F1: {avg_rouge2_f1:.4f}")
            print("="*60)

            # Save all scores
            _save_json("all_rouge_scores.json", all_rouge_scores)
            print("\nAll ROUGE scores saved to all_rouge_scores.json")
        else:
            print("\n" + "="*60)
            print("ROUGE EVALUATION WAS SKIPPED DUE TO MISSING VOCABULARY FILES")
            print("="*60)

        print("\nTraining complete!")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback instead
        # of propagating it to the caller.
        print(f"Error during training: {e}")
        import traceback
        traceback.print_exc()

# Entry point: run the full K-fold training with the default memory threshold
# when this module is executed directly rather than imported.
if __name__ == "__main__":
    train_with_memory_monitoring()

Starting training with memory monitoring
Initial memory usage: 439.81 MB
Text embeddings file size: 14524.50 MB
Title embeddings file size: 14524.50 MB
Text embedding dimensions: (1, 512, 768)
Title embedding dimensions: (1, 512, 768)
Please ensure that vocab_embeddings.npy and vocab_tokens.npy exist in the current directory.
These files are required for calculating ROUGE scores.
Training will continue without ROUGE evaluation.
Total samples available: 9683

Fold 1/5
Before fold training memory usage: 439.84 MB
Training samples: 7746
Validation samples: 1937




Epoch 1/3
Training on chunk 1/16 (samples 0-499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417ms/step - loss: 19.7849Memory usage after epoch 1: 3209.24 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 421ms/step - loss: 19.7637
After cleanup memory usage: 1573.13 MB
Training on chunk 2/16 (samples 500-999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455ms/step - loss: 14.6108Memory usage after epoch 1: 3254.49 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 458ms/step - loss: 14.5948
After cleanup memory usage: 1618.12 MB
Training on chunk 3/16 (samples 1000-1499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 423ms/step - loss: 10.7219Memory usage after epoch 1: 3305.06 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 426ms/step - loss: 10.7099
After cleanup memory usage: 1592.96 MB
Training on chunk 4/16 (samples 1500-1999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m


Epoch 1/3
Training on chunk 1/16 (samples 0-499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step - loss: 19.7876Memory usage after epoch 1: 3379.72 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 294ms/step - loss: 19.7663
After cleanup memory usage: 1743.34 MB
Training on chunk 2/16 (samples 500-999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - loss: 14.6050Memory usage after epoch 1: 3449.49 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 298ms/step - loss: 14.5890
After cleanup memory usage: 1812.51 MB
Training on chunk 3/16 (samples 1000-1499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300ms/step - loss: 10.7218Memory usage after epoch 1: 3401.57 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 303ms/step - loss: 10.7098
After cleanup memory usage: 1765.47 MB
Training on chunk 4/16 (samples 1500-1999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m


Epoch 1/3
Training on chunk 1/16 (samples 0-499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step - loss: 19.8058Memory usage after epoch 1: 3482.36 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 314ms/step - loss: 19.7845
After cleanup memory usage: 1846.24 MB
Training on chunk 2/16 (samples 500-999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step - loss: 14.6259Memory usage after epoch 1: 3539.32 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 334ms/step - loss: 14.6099
After cleanup memory usage: 1900.73 MB
Training on chunk 3/16 (samples 1000-1499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 324ms/step - loss: 10.7363Memory usage after epoch 1: 3545.27 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 329ms/step - loss: 10.7244
After cleanup memory usage: 1909.16 MB
Training on chunk 4/16 (samples 1500-1999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m


Epoch 1/3
Training on chunk 1/16 (samples 0-499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 311ms/step - loss: 19.8149Memory usage after epoch 1: 3582.45 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 315ms/step - loss: 19.7936
After cleanup memory usage: 1946.33 MB
Training on chunk 2/16 (samples 500-999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 338ms/step - loss: 14.6363Memory usage after epoch 1: 3665.18 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 342ms/step - loss: 14.6202
After cleanup memory usage: 2029.07 MB
Training on chunk 3/16 (samples 1000-1499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step - loss: 10.7513Memory usage after epoch 1: 3671.16 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 338ms/step - loss: 10.7393
After cleanup memory usage: 1960.66 MB
Training on chunk 4/16 (samples 1500-1999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m


Epoch 1/3
Training on chunk 1/16 (samples 0-499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step - loss: 19.7843Memory usage after epoch 1: 3686.11 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 302ms/step - loss: 19.7629
After cleanup memory usage: 2050.00 MB
Training on chunk 2/16 (samples 500-999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318ms/step - loss: 14.6068Memory usage after epoch 1: 3867.50 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 323ms/step - loss: 14.5908
After cleanup memory usage: 2230.20 MB
Training on chunk 3/16 (samples 1000-1499)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313ms/step - loss: 10.7196Memory usage after epoch 1: 3868.75 MB
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 317ms/step - loss: 10.7077
After cleanup memory usage: 2232.64 MB
Training on chunk 4/16 (samples 1500-1999)
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m