In [1]:
!pip install torch transformers datasets evaluate sacrebleu rouge-score numpy accelerate tensorboard scikit-learn

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Colle

In [1]:
import json
import gc
import torch
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import evaluate
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
from datetime import datetime

# Configuration
MODEL_NAME = "google/flan-t5-base"
BATCH_SIZE = 4  # Reduced batch size to prevent memory issues
MAX_LENGTH = 256
MAX_TARGET_LENGTH = 512
NUM_EPOCHS = 5
LEARNING_RATE = 1e-5
OUTPUT_DIR = "flan-t5-house-model"
DATASET_PATH = "/teamspace/studios/this_studio/dataset/train.jsonl"
WARMUP_STEPS = 100
GRADIENT_ACCUMULATION_STEPS = 4

# Create output directory with timestamp
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
OUTPUT_DIR = f"{OUTPUT_DIR}-{timestamp}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Disable wandb if not needed
os.environ["WANDB_DISABLED"] = "true"

# Modified load dataset function with better error handling
def load_dataset_from_jsonl(file_path):
    examples = []
    print(f"Reading data from {file_path}...")
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            if not line.strip():
                continue
            try:
                example = json.loads(line)
                if "nl_description" not in example or "house_json" not in example:
                    print(f"Skipping line {i}: missing required keys")
                    continue

                # Process the target as before:
                house_json = example["house_json"]
                for room in house_json.get("rooms", []):
                    for obj in room.get("objects", []):
                        if "position" in obj:
                            pos = obj["position"]
                            pos["x"] = round(pos["x"], 2)
                            pos["y"] = round(pos["y"], 2)
                            pos["z"] = round(pos["z"], 2)

                examples.append({
                    "source": example["nl_description"],
                    "target": json.dumps(house_json)
                })
            except Exception as e:
                print(f"Error processing line {i}: {e}")
                continue

    print(f"Successfully loaded {len(examples)} examples")
    return Dataset.from_list(examples)

# Initialize tokenizer and model
print(f"Loading model and tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Enable gradient checkpointing for memory efficiency
print("Enabling gradient checkpointing...")
# model.gradient_checkpointing_enable()

# Load and prepare datasets with better error handling
print(f"Loading dataset from: {DATASET_PATH}")
try:
    dataset = load_dataset_from_jsonl(DATASET_PATH)

    print(f"Dataset size: {len(dataset)} examples")
    if len(dataset) > 0:
        print(f"Sample example source: {dataset[0]['source'][:200]}...")
        print(f"Sample example target: {dataset[0]['target'][:200]}...")
    else:
        raise ValueError("Dataset is empty. Please check your data file.")
except Exception as e:
    print(f"Failed to load dataset: {e}")
    raise

# Split dataset (80% train, 10% validation, 10% test)
print("Splitting dataset...")
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
test_valid_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

dataset_dict = DatasetDict({
    "train": train_test_split["train"],
    "validation": test_valid_split["train"],
    "test": test_valid_split["test"]
})

print(f"Train size: {len(dataset_dict['train'])}")
print(f"Validation size: {len(dataset_dict['validation'])}")
print(f"Test size: {len(dataset_dict['test'])}")

# Improved preprocess function with better error handling
def preprocess_function(examples):
    # For debugging
    # for i, (source, target) in enumerate(zip(examples["source"][:3], examples["target"][:3])):
    #     print(f"Example {i} - Input: {source[:50]}...")
    #     print(f"Example {i} - Target: {target[:50]}...")

    inputs = examples["source"]
    targets = examples["target"]

    # Process inputs
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )

    # Process targets
    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        padding="max_length",
        truncation=True,
    )

    # Process labels to handle invalid tokens
    model_inputs["labels"] = labels["input_ids"].copy()

    # Replace padding token id with -100 so it's ignored in loss calculation
    for i in range(len(model_inputs["labels"])):
        model_inputs["labels"][i] = [
            -100 if token == tokenizer.pad_token_id else token
            for token in model_inputs["labels"][i]
        ]

    return model_inputs

# Apply preprocessing with verbosity for debugging
print("Tokenizing datasets...")
tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
    batch_size=8  # Process in smaller batches
)

print("\nSample tokenized example structure:")
if len(tokenized_datasets["train"]) > 0:
    print(f"Keys: {list(tokenized_datasets['train'][0].keys())}")
    print(f"Input_ids shape: {len(tokenized_datasets['train'][0]['input_ids'])}")
    print(f"Labels shape: {len(tokenized_datasets['train'][0]['labels'])}")

    # Check for valid tokens in labels
    labels = tokenized_datasets['train'][0]['labels']
    valid_tokens = [t for t in labels if t != -100]
    print(f"Valid label tokens: {len(valid_tokens)} out of {len(labels)}")

    # Check if any tokens are outside vocabulary range
    if valid_tokens:
        max_token = max(valid_tokens)
        print(f"Max token ID: {max_token}, Vocab size: {tokenizer.vocab_size}")
        if max_token >= tokenizer.vocab_size:
            print(f"WARNING: Found token ID {max_token} >= vocab size {tokenizer.vocab_size}")

# Define data collator with proper handling of tensors
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="max_length",
    max_length=MAX_LENGTH,
    label_pad_token_id=-100
)

# Improved metric calculation with error handling and verbose output
def compute_metrics(eval_pred):
    print("\nComputing evaluation metrics...")
    predictions, labels = eval_pred

    print(f"Predictions - type: {type(predictions)}, shape: {getattr(predictions, 'shape', 'N/A')}, dtype: {getattr(predictions, 'dtype', 'N/A')}")
    print(f"Labels - type: {type(labels)}, shape: {getattr(labels, 'shape', 'N/A')}, dtype: {getattr(labels, 'dtype', 'N/A')}")

    # --- Debugging: Inspect raw predictions and labels ---
    try:
        # Ensure they are numpy arrays for inspection
        if isinstance(predictions, tuple): # Sometimes predictions might be nested
             predictions = predictions[0]
        if not isinstance(predictions, np.ndarray):
            predictions = np.array(predictions)
        if not isinstance(labels, np.ndarray):
            labels = np.array(labels)

        print(f"Predictions (as np array) - shape: {predictions.shape}, dtype: {predictions.dtype}")
        print(f"Labels (as np array) - shape: {labels.shape}, dtype: {labels.dtype}")

        # Check min/max values BEFORE any processing
        # Handle potential empty arrays
        if predictions.size > 0:
            pred_min, pred_max = np.min(predictions), np.max(predictions)
            print(f"Raw Predictions - min: {pred_min}, max: {pred_max}")
            # Check for problematic floats if dtype is float
            if np.issubdtype(predictions.dtype, np.floating):
                 print(f"Raw Predictions - contains NaN: {np.isnan(predictions).any()}, contains Inf: {np.isinf(predictions).any()}")
                 # Attempt to convert NaNs/Infs or problematic floats if needed
                 # predictions = np.nan_to_num(predictions).astype(np.int64) # Example fix
        else:
             print("Raw Predictions - array is empty")

        if labels.size > 0:
            # Check labels *before* replacing -100
            label_min_raw, label_max_raw = np.min(labels), np.max(labels)
            print(f"Raw Labels - min: {label_min_raw}, max: {label_max_raw}")
        else:
            print("Raw Labels - array is empty")

        # Check pad token ID
        pad_token_id = tokenizer.pad_token_id
        print(f"Tokenizer pad_token_id: {pad_token_id} (type: {type(pad_token_id)})")
        if pad_token_id is None:
            print("WARNING: tokenizer.pad_token_id is None!")
            # Handle this case, maybe assign 0 or handle differently?
            # pad_token_id = 0 # Example fallback

        # Replace -100 in the labels AFTER inspection
        labels = np.where(labels != -100, labels, pad_token_id)

        # Check labels AFTER replacing -100
        if labels.size > 0:
             label_min_mod, label_max_mod = np.min(labels), np.max(labels)
             print(f"Modified Labels - min: {label_min_mod}, max: {label_max_mod}, dtype: {labels.dtype}")
        else:
             print("Modified Labels - array is empty")

        # --- End Debugging ---

        # Load metrics
        bleu_metric = evaluate.load("sacrebleu")
        rouge_metric = evaluate.load("rouge")

        # --- Explicitly Clean Predictions Before Decoding ---
        # Clip predictions to be within the valid vocab range might help
        # Ensure predictions are integers
        if predictions.size > 0:
            # Handle potential floats resulting from model errors? (Unlikely but possible)
            if np.issubdtype(predictions.dtype, np.floating):
                print("WARNING: Predictions have float dtype. Converting NaNs/Infs and casting to int.")
                predictions = np.nan_to_num(predictions) # Convert NaN to 0, Inf to large floats
                predictions = predictions.astype(np.int64) # Cast to integer

            # Clip to valid token ID range [0, vocab_size - 1]
            vocab_size = tokenizer.vocab_size
            predictions = np.clip(predictions, 0, vocab_size - 1)
            print(f"Cleaned Predictions - min: {np.min(predictions)}, max: {np.max(predictions)}, dtype: {predictions.dtype}")


        # Decode predictions
        print("Decoding predictions...")
        # Add error handling specifically around decoding
        try:
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        except Exception as decode_err:
             print(f"ERROR during prediction decoding: {decode_err}")
             print(f"Problematic predictions (first 10 of first batch): {predictions[0, :10] if predictions.ndim > 1 else predictions[:10]}")
             decoded_preds = ["DECODING_ERROR"] * len(predictions) # Fallback

        # Decode labels
        print("Decoding labels...")
        try:
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        except Exception as decode_err:
            print(f"ERROR during label decoding: {decode_err}")
            print(f"Problematic labels (first 10 of first batch): {labels[0, :10] if labels.ndim > 1 else labels[:10]}")
            decoded_labels = ["DECODING_ERROR"] * len(labels) # Fallback


        # Print some examples for debugging
        print("Decoded examples (first 3):")
        for i in range(min(3, len(decoded_preds))):
            print(f"\nExample {i}:")
            print(f"Prediction: {decoded_preds[i][:100]}...")
            print(f"Reference: {decoded_labels[i][:100]}...")

        # Convert decoded outputs to format expected by metrics
        decoded_preds_for_bleu = decoded_preds
        decoded_refs_for_bleu = [[label] for label in decoded_labels]

        # Compute BLEU score
        print("Computing BLEU...")
        try:
            bleu_result = bleu_metric.compute(
                predictions=decoded_preds_for_bleu,
                references=decoded_refs_for_bleu
            )
            bleu_score = bleu_result["score"] if bleu_result and "score" in bleu_result else 0.0 # Safely access score
            print(f"BLEU score: {bleu_score}")
        except Exception as e:
            print(f"Error computing BLEU: {e}")
            bleu_score = 0.0 # Use float for consistency

        # Compute ROUGE scores
        print("Computing ROUGE...")
        try:
            rouge_result = rouge_metric.compute(
                predictions=decoded_preds,
                references=decoded_labels,
                use_stemmer=True
            )

            result = {"bleu": bleu_score}
            print(f"ROUGE raw result: {rouge_result}") # Print the full rouge result for inspection

            for rouge_type in ["rouge1", "rouge2", "rougeL"]:
                 if rouge_result and rouge_type in rouge_result:
                    # ROUGE often returns floats directly now, or objects with attributes
                    rouge_value = rouge_result[rouge_type]
                    # Check for AggregateResult object (older versions?) or direct float
                    if hasattr(rouge_value, "mid") and hasattr(rouge_value.mid, "fmeasure"): # Older structure?
                        result[rouge_type] = float(rouge_value.mid.fmeasure)
                    elif hasattr(rouge_value, "fmeasure"): # Another possible structure
                        result[rouge_type] = float(rouge_value.fmeasure)
                    else: # Assume it's a direct float/int
                         try:
                             result[rouge_type] = float(rouge_value)
                         except (ValueError, TypeError) as convert_err:
                             print(f"Could not convert ROUGE value for {rouge_type}: {rouge_value}, Error: {convert_err}")
                             result[rouge_type] = 0.0

                    print(f"{rouge_type}: {result[rouge_type]}")
                 else:
                     print(f"ROUGE type '{rouge_type}' not found in results.")
                     result[rouge_type] = 0.0

            print(f"Computed metrics: {result}")
            return result

        except Exception as e:
            print(f"Error computing ROUGE: {e}")
            # Ensure default values are floats
            return {
                "bleu": bleu_score, "rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0
            }

    except Exception as e:
        import traceback
        print(f"General error in compute_metrics: {e}")
        print(traceback.format_exc()) # Print full traceback
        # Ensure default values are floats
        return {"bleu": 0.0, "rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

# Training arguments with reduced complexity
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,        # or explicitly max_grad_norm=1.0
    max_grad_norm=1.0,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    logging_steps=10,  # More frequent logging
    logging_dir=f"{OUTPUT_DIR}/logs",
    generation_max_length=MAX_TARGET_LENGTH,
    generation_num_beams=2,  # Reduced beam search complexity
    # Set report_to to None to disable wandb integration
    report_to=None,
    run_name=f"flan-t5-house-{timestamp}"  # Set distinct run_name to avoid warning
)

# Fixed trainer class with correct method signature
class SafeTrainer(Seq2SeqTrainer):
    def training_step(self, model, inputs, optimizer_idx=None):
        try:
            # Print input shapes for debugging
            if self.state.global_step == 0:
                print("Debug input batch shapes:")
                for k, v in inputs.items():
                    if isinstance(v, torch.Tensor):
                        print(f"{k}: {v.shape}, dtype: {v.dtype}")

            # Call the parent training step
            return super().training_step(model, inputs)
        except Exception as e:
            print(f"Error in training step: {e}")
            print(f"Input batch keys: {list(inputs.keys())}")

            # Return a zero loss tensor to continue training
            return torch.tensor(0.0, requires_grad=True, device=model.device)

# Initialize trainer - fix the deprecation warning by removing tokenizer parameter
print("Initializing trainer...")
trainer = SafeTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Save model configuration
with open(f"{OUTPUT_DIR}/training_config.json", "w") as f:
    config = {
        "model_name": MODEL_NAME,
        "batch_size": BATCH_SIZE,
        "max_length": MAX_LENGTH,
        "max_target_length": MAX_TARGET_LENGTH,
        "num_epochs": NUM_EPOCHS,
        "learning_rate": LEARNING_RATE,
        "warmup_steps": WARMUP_STEPS,
        "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
        "dataset_path": DATASET_PATH,
        "train_size": len(dataset_dict["train"]),
        "validation_size": len(dataset_dict["validation"]),
        "test_size": len(dataset_dict["test"]),
        "timestamp": timestamp
    }
    json.dump(config, f, indent=2)

# Start training with better error handling
print(f"Starting training with {NUM_EPOCHS} epochs...")
try:
    # Force sync GPU before training
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    train_result = trainer.train()
    print(f"Training completed. Training loss: {train_result.training_loss}")

    # Print training metrics
    train_metrics = train_result.metrics
    print(f"Training metrics: {json.dumps(train_metrics, indent=2)}")

except Exception as e:
    import traceback
    print(f"Training error occurred: {e}")
    print(traceback.format_exc())
    print("Attempting to save partial training results...")

# Try to save the final model
try:
    print(f"Saving model to {OUTPUT_DIR}")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Model saved to {OUTPUT_DIR}")
except Exception as e:
    print(f"Error saving model: {e}")

# Evaluation on test set with better error handling
try:
    print("Evaluating on test set...")
    # Clear CUDA cache before evaluation
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
    print("Final test set evaluation results:")
    print(json.dumps(results, indent=2))

    # Save test results
    with open(f"{OUTPUT_DIR}/test_results.json", "w") as f:
        json.dump(results, f, indent=2)
except Exception as e:
    print(f"Error during evaluation: {e}")
    import traceback
    print(traceback.format_exc())

# Updated function to generate prediction with better error handling
def generate_prediction(input_text):
    try:
        print(f"Generating prediction for: {input_text[:100]}...")
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)

        # Move to the right device
        device = model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Clear GPU cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Generate with safer parameters
        outputs = model.generate(
            **inputs,
            max_length=MAX_TARGET_LENGTH,
            num_beams=2,  # Reduced complexity
            early_stopping=True
        )

        # Decode the output
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        print(f"Generated output: {decoded_output[:100]}...")
        return decoded_output
    except Exception as e:
        print(f"Error in generation: {e}")
        import traceback
        print(traceback.format_exc())
        return f"Error generating prediction: {str(e)}"

# Test with sample inputs
try:
    if len(dataset_dict["test"]) > 0:
        print("\nRunning generation test with sample inputs...")
        # Test with up to 3 samples
        num_samples = min(3, len(dataset_dict["test"]))

        test_results = []
        for i in range(num_samples):
            try:
                sample_input = dataset_dict["test"][i]["source"]
                reference = dataset_dict["test"][i]["target"]

                # Generate prediction
                prediction = generate_prediction(sample_input)

                test_results.append({
                    "input": sample_input,
                    "prediction": prediction,
                    "reference": reference
                })

                print(f"\nSample {i+1}:")
                print(f"Input: {sample_input[:100]}...")
                print(f"Prediction: {prediction[:100]}...")
                print(f"Reference: {reference[:100]}...")
            except Exception as e:
                print(f"Error processing sample {i}: {e}")

        # Save sample predictions
        with open(f"{OUTPUT_DIR}/sample_predictions.json", "w") as f:
            json.dump(test_results, f, indent=2)
except Exception as e:
    print(f"Error during sample testing: {e}")

# Create a safer inference script
inference_script = f"""#!/usr/bin/env python
import sys
import torch
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model and tokenizer
MODEL_PATH = "{OUTPUT_DIR}"
MAX_LENGTH = {MAX_LENGTH}
MAX_TARGET_LENGTH = {MAX_TARGET_LENGTH}

def load_model_and_tokenizer():
    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

        print("Loading model...")
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

        # Move to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        print(f"Model loaded successfully (device: {{device}})")
        return tokenizer, model, device
    except Exception as e:
        print(f"Error loading model: {{e}}")
        sys.exit(1)

def simplify_json(json_obj):
    # Simplify floats in position data to reduce complexity
    if isinstance(json_obj, dict):
        for key, value in json_obj.items():
            if key == "position" and isinstance(value, dict):
                # Round position values
                for coord in value:
                    if isinstance(value[coord], float):
                        value[coord] = round(value[coord], 2)
            elif isinstance(value, (dict, list)):
                simplify_json(value)
    elif isinstance(json_obj, list):
        for item in json_obj:
            if isinstance(item, (dict, list)):
                simplify_json(item)
    return json_obj

def generate_description(input_json, tokenizer, model, device):
    try:
        # For house JSON input, parse and simplify
        if isinstance(input_json, str):
            # Check if the input is a path to a JSON file
            if input_json.endswith('.json'):
                with open(input_json, 'r') as f:
                    data = json.load(f)
            else:
                # Try to parse as direct JSON
                data = json.loads(input_json)
        else:
            data = input_json

        # Simplify the JSON to reduce complexity
        data = simplify_json(data)
        input_json_str = json.dumps(data)

        print("Tokenizing input...")
        inputs = tokenizer(input_json_str, return_tensors="pt", padding="max_length",
                          max_length=MAX_LENGTH, truncation=True)
        inputs = {{k: v.to(device) for k, v in inputs.items()}}

        print("Generating description...")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=MAX_TARGET_LENGTH,
                num_beams=2,
                early_stopping=True
            )

        description = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        return description
    except Exception as e:
        return f"Error generating description: {{e}}"

if __name__ == "__main__":
    # Load the model and tokenizer
    tokenizer, model, device = load_model_and_tokenizer()

    if len(sys.argv) > 1:
        # Use command line argument as input
        input_text = sys.argv[1]
    else:
        # Otherwise prompt for input
        input_text = input("Enter house JSON or path to JSON file: ")

    print("Processing input...")
    result = generate_description(input_text, tokenizer, model, device)
    print("\\nGenerated description:")
    print(result)
"""

# Save inference script
try:
    with open(f"{OUTPUT_DIR}/inference.py", "w") as f:
        f.write(inference_script)

    # Make it executable
    os.chmod(f"{OUTPUT_DIR}/inference.py", 0o755)
    print(f"Created inference script at {OUTPUT_DIR}/inference.py")
except Exception as e:
    print(f"Error creating inference script: {e}")

print(f"\nTraining completed! Model and resources saved to {OUTPUT_DIR}")
print("To use the model for inference, run:")
print(f"python {OUTPUT_DIR}/inference.py 'your house JSON data'")

Loading model and tokenizer: google/flan-t5-base
Enabling gradient checkpointing...
Loading dataset from: /teamspace/studios/this_studio/dataset/train.jsonl
Reading data from /teamspace/studios/this_studio/dataset/train.jsonl...


Successfully loaded 17350 examples
Dataset size: 17350 examples
Sample example source: A small home with 4 rooms on single floor, measuring 14x14 units.

A kitchen makes up room 1 containing a counter and a single dishwasher.
Room 2, a living room, is furnished with a end table, one lam...
Sample example target: {"id": "house_1", "numRooms": 4, "floors": 1, "dimensions": {"x": 14, "y": 14}, "rooms": [{"roomType": "kitchen", "name": "kitchen", "floorLevel": 0, "objects": [{"objectType": "counter", "assetId": "...
Splitting dataset...
Train size: 13880
Validation size: 1735
Test size: 1735
Tokenizing datasets...


Map:   0%|          | 0/13880 [00:00<?, ? examples/s]

Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1735 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Sample tokenized example structure:
Keys: ['input_ids', 'attention_mask', 'labels']
Input_ids shape: 256
Labels shape: 512
Valid label tokens: 512 out of 512
Max token ID: 31987, Vocab size: 32100
Initializing trainer...
Starting training with 5 epochs...
Debug input batch shapes:
input_ids: torch.Size([4, 256]), dtype: torch.int64
attention_mask: torch.Size([4, 256]), dtype: torch.int64
labels: torch.Size([4, 512]), dtype: torch.int64
decoder_input_ids: torch.Size([4, 512]), dtype: torch.int64


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Debug input batch shapes:
input_ids: torch.Size([4, 256]), dtype: torch.int64
attention_mask: torch.Size([4, 256]), dtype: torch.int64
labels: torch.Size([4, 512]), dtype: torch.int64
decoder_input_ids: torch.Size([4, 512]), dtype: torch.int64
Debug input batch shapes:
input_ids: torch.Size([4, 256]), dtype: torch.int64
attention_mask: torch.Size([4, 256]), dtype: torch.int64
labels: torch.Size([4, 512]), dtype: torch.int64
decoder_input_ids: torch.Size([4, 512]), dtype: torch.int64
Debug input batch shapes:
input_ids: torch.Size([4, 256]), dtype: torch.int64
attention_mask: torch.Size([4, 256]), dtype: torch.int64
labels: torch.Size([4, 512]), dtype: torch.int64
decoder_input_ids: torch.Size([4, 512]), dtype: torch.int64


Epoch,Training Loss,Validation Loss,Bleu,Rouge1,Rouge2,Rougel
1,0.2722,0.254159,81.9512,0.713382,0.520219,0.631682
2,0.2581,0.244886,82.624378,0.724419,0.531773,0.637353
3,0.2513,0.24015,82.5595,0.717977,0.52645,0.640531



Computing evaluation metrics...
Predictions - type: <class 'numpy.ndarray'>, shape: (1735, 512), dtype: int64
Labels - type: <class 'numpy.ndarray'>, shape: (1735, 512), dtype: int64
Predictions (as np array) - shape: (1735, 512), dtype: int64
Labels (as np array) - shape: (1735, 512), dtype: int64
Raw Predictions - min: 0, max: 31987
Raw Labels - min: 1, max: 31987
Tokenizer pad_token_id: 0 (type: <class 'int'>)
Modified Labels - min: 1, max: 31987, dtype: int64
Cleaned Predictions - min: 0, max: 31987, dtype: int64
Decoding predictions...
Decoding labels...
Decoded examples (first 3):

Example 0:
Prediction: "id": "house_2448", "numRooms": 7, "floors": 1, "dimensions": "x": 17, "y": 17, "rooms": ["roomType"...
Reference: "id": "house_4700", "numRooms": 7, "floors": 1, "dimensions": "x": 17, "y": 17, "rooms": ["roomType"...

Example 1:
Prediction: "id": "house_2448", "numRooms": 8, "floors": 1, "dimensions": "x": 21, "y": 21, "rooms": ["roomType"...
Reference: "id": "house_2495", "nu

KeyboardInterrupt: 