# GPU Test Notebook - Verify Training Environment
# Test GPU availability and basic training functionality

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time
import numpy as np

print("🔍 GPU Environment Test")
print("=" * 50)

🔍 GPU Environment Test


## Test 1: Basic GPU Detection


In [2]:
print("\n1️⃣ Basic CUDA Detection")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

if torch.cuda.is_available():
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"  GPU {i}: {props.name}")
        print(f"    Total memory: {props.total_memory / 1e9:.1f} GB")
        print(f"    Multiprocessors: {props.multi_processor_count}")
        print(f"    CUDA capability: {props.major}.{props.minor}")
else:
    print("❌ CUDA not available - training will be slow!")


1️⃣ Basic CUDA Detection
PyTorch version: 2.7.0+cu128
CUDA available: True
CUDA version: 12.8
Number of GPUs: 1
  GPU 0: NVIDIA GeForce RTX 5090
    Total memory: 34.2 GB
    Multiprocessors: 170
    CUDA capability: 12.0


## Test 2: Memory Test


In [3]:
print("\n2️⃣ GPU Memory Test")

if torch.cuda.is_available():
    device = torch.device("cuda")
    
    # Clear cache
    torch.cuda.empty_cache()
    
    # Check initial memory
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"Initial - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
    
    # Test large tensor allocation
    try:
        print("Testing large tensor allocation...")
        test_tensor = torch.randn(1000, 1000, 1000, device=device)  # ~4GB
        
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        print(f"After allocation - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
        
        # Clean up
        del test_tensor
        torch.cuda.empty_cache()
        print("✅ Large tensor allocation successful")
        
    except RuntimeError as e:
        print(f"❌ Memory allocation failed: {e}")
        print("This might indicate insufficient GPU memory")
        
else:
    print("⏭️ Skipping GPU memory test (CUDA not available)")


2️⃣ GPU Memory Test
Initial - Allocated: 0.00 GB, Reserved: 0.00 GB
Testing large tensor allocation...
After allocation - Allocated: 4.00 GB, Reserved: 4.00 GB
✅ Large tensor allocation successful


## Test 3: Simple Neural Network Training

In [5]:
# Create a simple neural network
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(1000, 5000)
        self.fc2 = nn.Linear(5000, 5000)
        self.fc3 = nn.Linear(5000, 250)
        self.fc4 = nn.Linear(250, 25)
        self.fc5 = nn.Linear(25, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        return x

# Generate dummy data
X = torch.randn(1000, 1000)
y = torch.randn(1000, 1)

# Create dataset and dataloader
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model
model = SimpleNet()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Test CPU training first
print("Testing CPU training...")
device = torch.device("cpu")
model = model.to(device)

start_time = time.time()
model.train()
total_loss = 0

for batch_idx, (data, target) in enumerate(dataloader):
    if batch_idx >= 5:  # Only test 5 batches
        break
        
    data, target = data.to(device), target.to(device)
    
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    
    total_loss += loss.item()

cpu_time = time.time() - start_time
cpu_loss = total_loss / 5
print(f"CPU - Time: {cpu_time:.2f}s, Avg Loss: {cpu_loss:.4f}")

# Test GPU training (if available)
# Test GPU training (if available)
if torch.cuda.is_available():
    print("Testing GPU training...")
    device = torch.device("cuda")
    model = model.to(device)
    
    # Need to recreate optimizer after moving model to GPU
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    start_time = time.time()
    model.train()
    total_loss = 0
    
    for batch_idx, (data, target) in enumerate(dataloader):
        if batch_idx >= 5:  # Only test 5 batches
            break
            
        data, target = data.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    gpu_time = time.time() - start_time
    gpu_loss = total_loss / 5
    print(f"GPU - Time: {gpu_time:.2f}s, Avg Loss: {gpu_loss:.4f}")
    
    if cpu_time > 0:
        speedup = cpu_time / gpu_time
        print(f"🚀 GPU speedup: {speedup:.1f}x")
        
else:
    print("⏭️ Skipping GPU training test (CUDA not available)")



Testing CPU training...
CPU - Time: 0.32s, Avg Loss: 1.5625
Testing GPU training...
GPU - Time: 0.04s, Avg Loss: 1.1197
🚀 GPU speedup: 7.6x


## Test 4: Transformers Library GPU Test


In [6]:
print("\n4️⃣ Transformers Library Test")

try:
    from transformers import AutoTokenizer, AutoModel
    import torch.nn.functional as F
    
    print("Testing small transformer model on GPU...")
    
    # Use a small model for testing
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Test input
    text = "This is a test sentence for GPU processing."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    
    if torch.cuda.is_available():
        device = torch.device("cuda")
        model = model.to(device)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        print(f"Model device: {next(model.parameters()).device}")
        print(f"Input device: {inputs['input_ids'].device}")
        
        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs)
        
        print(f"Output shape: {outputs.last_hidden_state.shape}")
        print(f"Output device: {outputs.last_hidden_state.device}")
        print("✅ Transformers GPU test successful")
        
    else:
        print("⏭️ Skipping transformers GPU test (CUDA not available)")
        
except ImportError:
    print("❌ Transformers library not installed")
    print("Install with: pip install transformers")
except Exception as e:
    print(f"❌ Transformers test failed: {e}")


4️⃣ Transformers Library Test


  from .autonotebook import tqdm as notebook_tqdm


Testing small transformer model on GPU...
Model device: cuda:0
Input device: cuda:0
Output shape: torch.Size([1, 12, 768])
Output device: cuda:0
✅ Transformers GPU test successful


## Test 5: Mixed Precision Test (for training efficiency)


In [7]:
print("\n5️⃣ Mixed Precision Test")

if torch.cuda.is_available():
    try:
        from torch.cuda.amp import autocast, GradScaler
        
        # Create model and data
        model = SimpleNet().cuda()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scaler = GradScaler()
        
        # Test data
        x = torch.randn(32, 1000).cuda()
        y = torch.randn(32, 1).cuda()
        
        # Mixed precision forward pass
        with autocast():
            output = model(x)
            loss = F.mse_loss(output, y)
        
        # Backward pass with scaling
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        print("✅ Mixed precision (AMP) test successful")
        print("This will speed up training on modern GPUs")
        
    except Exception as e:
        print(f"❌ Mixed precision test failed: {e}")
        print("Mixed precision may not be supported on this GPU")
else:
    print("⏭️ Skipping mixed precision test (CUDA not available)")


5️⃣ Mixed Precision Test
✅ Mixed precision (AMP) test successful
This will speed up training on modern GPUs


  scaler = GradScaler()
  with autocast():


## Test 6: Memory Management Test


In [8]:
print("\n6️⃣ Memory Management Test")

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    
    def get_memory_info():
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        return allocated, reserved
    
    print("Testing memory management patterns...")
    
    # Initial state
    alloc, res = get_memory_info()
    print(f"Initial: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    # Create some tensors
    tensors = []
    for i in range(5):
        tensor = torch.randn(100, 100, 100).cuda()  # ~40MB each
        tensors.append(tensor)
        
    alloc, res = get_memory_info()
    print(f"After creating tensors: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    # Delete tensors
    del tensors
    alloc, res = get_memory_info()
    print(f"After deleting tensors: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    # Clear cache
    torch.cuda.empty_cache()
    alloc, res = get_memory_info()
    print(f"After cache clear: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    print("✅ Memory management test complete")
    
else:
    print("⏭️ Skipping memory management test (CUDA not available)")


6️⃣ Memory Management Test
Testing memory management patterns...
Initial: 0.78 GB allocated, 0.79 GB reserved
After creating tensors: 0.80 GB allocated, 0.82 GB reserved
After deleting tensors: 0.78 GB allocated, 0.82 GB reserved
After cache clear: 0.78 GB allocated, 0.82 GB reserved
✅ Memory management test complete


## Summary

In [9]:
print("\n" + "="*50)
print("📊 GPU Test Summary")
print("="*50)

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name()
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    
    print(f"✅ GPU Ready: {gpu_name}")
    print(f"✅ Total VRAM: {total_memory:.1f} GB")
    print(f"✅ PyTorch CUDA: {torch.version.cuda}")
    
    # Recommendations based on GPU
    if "RTX 5090" in gpu_name or "RTX 4090" in gpu_name:
        print(f"\n🚀 RTX 5090/4090 Detected - Excellent for LoRA training!")
        print(f"   • Batch size: 4-8")
        print(f"   • Mixed precision: Recommended")
        print(f"   • 4-bit quantization: Optional")
        
    elif "L40S" in gpu_name or "A100" in gpu_name or "H100" in gpu_name:
        print(f"\n🚀 Professional GPU Detected - Perfect for large-scale training!")
        print(f"   • Batch size: 8-16")  
        print(f"   • Mixed precision: Highly recommended")
        print(f"   • Full precision training: Possible")
        
    else:
        print(f"\n⚠️  Unknown GPU - Training should work but may be slower")
        print(f"   • Start with small batch sizes")
        print(f"   • Use 4-bit quantization if memory is limited")
    
    print(f"\n🎯 Ready for LoRA training!")
    
else:
    print("❌ No GPU detected")
    print("   • Training will be very slow on CPU")
    print("   • Consider using Google Colab or cloud GPUs")
    print("   • Or check CUDA installation")

print(f"\n🔧 To install missing dependencies:")
print(f"pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
print(f"pip install transformers accelerate bitsandbytes")


📊 GPU Test Summary
✅ GPU Ready: NVIDIA GeForce RTX 5090
✅ Total VRAM: 34.2 GB
✅ PyTorch CUDA: 12.8

🚀 RTX 5090/4090 Detected - Excellent for LoRA training!
   • Batch size: 4-8
   • Mixed precision: Recommended
   • 4-bit quantization: Optional

🎯 Ready for LoRA training!

🔧 To install missing dependencies:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install transformers accelerate bitsandbytes


In [None]:
# GPU Test Notebook - Verify Training Environment
# Test GPU availability and basic training functionality

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time
import numpy as np

print("🔍 GPU Environment Test")
print("=" * 50)

## Test 1: Basic GPU Detection

print("\n1️⃣ Basic CUDA Detection")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

if torch.cuda.is_available():
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"  GPU {i}: {props.name}")
        print(f"    Total memory: {props.total_memory / 1e9:.1f} GB")
        print(f"    Multiprocessors: {props.multi_processor_count}")
        print(f"    CUDA capability: {props.major}.{props.minor}")
else:
    print("❌ CUDA not available - training will be slow!")

## Test 2: Memory Test

print("\n2️⃣ GPU Memory Test")

if torch.cuda.is_available():
    device = torch.device("cuda")
    
    # Clear cache
    torch.cuda.empty_cache()
    
    # Check initial memory
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"Initial - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
    
    # Test large tensor allocation
    try:
        print("Testing large tensor allocation...")
        test_tensor = torch.randn(1000, 1000, 1000, device=device)  # ~4GB
        
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        print(f"After allocation - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
        
        # Clean up
        del test_tensor
        torch.cuda.empty_cache()
        print("✅ Large tensor allocation successful")
        
    except RuntimeError as e:
        print(f"❌ Memory allocation failed: {e}")
        print("This might indicate insufficient GPU memory")
        
else:
    print("⏭️ Skipping GPU memory test (CUDA not available)")

## Test 3: Simple Neural Network Training

print("\n3️⃣ Simple Training Test")

# Create a simple neural network
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(1000, 5000)
        self.fc2 = nn.Linear(5000, 5000)
        self.fc3 = nn.Linear(5000, 250)
        self.fc4 = nn.Linear(250, 25)
        self.fc5 = nn.Linear(25, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        return x

# Generate dummy data
X = torch.randn(1000, 1000)
y = torch.randn(1000, 1)

# Create dataset and dataloader
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model
model = SimpleNet()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Test CPU training first
print("Testing CPU training...")
device = torch.device("cpu")
model = model.to(device)

start_time = time.time()
model.train()
total_loss = 0

for batch_idx, (data, target) in enumerate(dataloader):
    if batch_idx >= 5:  # Only test 5 batches
        break
        
    data, target = data.to(device), target.to(device)
    
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    
    total_loss += loss.item()

cpu_time = time.time() - start_time
cpu_loss = total_loss / 5
print(f"CPU - Time: {cpu_time:.2f}s, Avg Loss: {cpu_loss:.4f}")

# Test GPU training (if available)
if torch.cuda.is_available():
    print("Testing GPU training...")
    device = torch.device("cuda")
    model = model.to(device)
    
    # Need to recreate optimizer after moving model to GPU
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    start_time = time.time()
    model.train()
    total_loss = 0
    
    for batch_idx, (data, target) in enumerate(dataloader):
        if batch_idx >= 5:  # Only test 5 batches
            break
            
        data, target = data.to(device), target.to(device)
        
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    gpu_time = time.time() - start_time
    gpu_loss = total_loss / 5
    print(f"GPU - Time: {gpu_time:.2f}s, Avg Loss: {gpu_loss:.4f}")
    
    if cpu_time > 0:
        speedup = cpu_time / gpu_time
        print(f"🚀 GPU speedup: {speedup:.1f}x")
        
else:
    print("⏭️ Skipping GPU training test (CUDA not available)")

## Test 4: Transformers Library GPU Test

print("\n4️⃣ Transformers Library Test")

try:
    from transformers import AutoTokenizer, AutoModel
    import torch.nn.functional as F
    
    print("Testing small transformer model on GPU...")
    
    # Use a small model for testing
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Test input
    text = "This is a test sentence for GPU processing."
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    
    if torch.cuda.is_available():
        device = torch.device("cuda")
        model = model.to(device)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        print(f"Model device: {next(model.parameters()).device}")
        print(f"Input device: {inputs['input_ids'].device}")
        
        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs)
        
        print(f"Output shape: {outputs.last_hidden_state.shape}")
        print(f"Output device: {outputs.last_hidden_state.device}")
        print("✅ Transformers GPU test successful")
        
    else:
        print("⏭️ Skipping transformers GPU test (CUDA not available)")
        
except ImportError:
    print("❌ Transformers library not installed")
    print("Install with: pip install transformers")
except Exception as e:
    print(f"❌ Transformers test failed: {e}")

## Test 5: Mixed Precision Test (for training efficiency)

print("\n5️⃣ Mixed Precision Test")

if torch.cuda.is_available():
    try:
        from torch.cuda.amp import autocast, GradScaler
        
        # Create model and data
        model = SimpleNet().cuda()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scaler = GradScaler()
        
        # Test data
        x = torch.randn(32, 1000).cuda()
        y = torch.randn(32, 1).cuda()
        
        # Mixed precision forward pass
        with autocast():
            output = model(x)
            loss = F.mse_loss(output, y)
        
        # Backward pass with scaling
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        print("✅ Mixed precision (AMP) test successful")
        print("This will speed up training on modern GPUs")
        
    except Exception as e:
        print(f"❌ Mixed precision test failed: {e}")
        print("Mixed precision may not be supported on this GPU")
else:
    print("⏭️ Skipping mixed precision test (CUDA not available)")

## Test 6: Memory Management Test

print("\n6️⃣ Memory Management Test")

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    
    def get_memory_info():
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        return allocated, reserved
    
    print("Testing memory management patterns...")
    
    # Initial state
    alloc, res = get_memory_info()
    print(f"Initial: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    # Create some tensors
    tensors = []
    for i in range(5):
        tensor = torch.randn(100, 100, 100).cuda()  # ~40MB each
        tensors.append(tensor)
        
    alloc, res = get_memory_info()
    print(f"After creating tensors: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    # Delete tensors
    del tensors
    alloc, res = get_memory_info()
    print(f"After deleting tensors: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    # Clear cache
    torch.cuda.empty_cache()
    alloc, res = get_memory_info()
    print(f"After cache clear: {alloc:.2f} GB allocated, {res:.2f} GB reserved")
    
    print("✅ Memory management test complete")
    
else:
    print("⏭️ Skipping memory management test (CUDA not available)")

## Summary

print("\n" + "="*50)
print("📊 GPU Test Summary")
print("="*50)

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name()
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    
    print(f"✅ GPU Ready: {gpu_name}")
    print(f"✅ Total VRAM: {total_memory:.1f} GB")
    print(f"✅ PyTorch CUDA: {torch.version.cuda}")
    
    # Recommendations based on GPU
    if "RTX 5090" in gpu_name or "RTX 4090" in gpu_name:
        print(f"\n🚀 RTX 5090/4090 Detected - Excellent for LoRA training!")
        print(f"   • Batch size: 4-8")
        print(f"   • Mixed precision: Recommended")
        print(f"   • 4-bit quantization: Optional")
        
    elif "L40S" in gpu_name or "A100" in gpu_name or "H100" in gpu_name:
        print(f"\n🚀 Professional GPU Detected - Perfect for large-scale training!")
        print(f"   • Batch size: 8-16")  
        print(f"   • Mixed precision: Highly recommended")
        print(f"   • Full precision training: Possible")
        
    else:
        print(f"\n⚠️  Unknown GPU - Training should work but may be slower")
        print(f"   • Start with small batch sizes")
        print(f"   • Use 4-bit quantization if memory is limited")
    
    print(f"\n🎯 Ready for LoRA training!")
    
else:
    print("❌ No GPU detected")
    print("   • Training will be very slow on CPU")
    print("   • Consider using Google Colab or cloud GPUs")
    print("   • Or check CUDA installation")

print(f"\n🔧 To install missing dependencies:")
print(f"pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
print(f"pip install transformers accelerate bitsandbytes")

🔍 GPU Environment Test

1️⃣ Basic CUDA Detection
PyTorch version: 2.7.0+cu128
CUDA available: True
CUDA version: 12.8
Number of GPUs: 1
  GPU 0: NVIDIA GeForce RTX 5090
    Total memory: 34.2 GB
    Multiprocessors: 170
    CUDA capability: 12.0

2️⃣ GPU Memory Test
Initial - Allocated: 0.00 GB, Reserved: 0.00 GB
Testing large tensor allocation...
After allocation - Allocated: 4.00 GB, Reserved: 4.00 GB
✅ Large tensor allocation successful

3️⃣ Simple Training Test
Testing CPU training...


  return F.mse_loss(input, target, reduction=self.reduction)


CPU - Time: 0.30s, Avg Loss: 2.1292
Testing GPU training...
GPU - Time: 0.12s, Avg Loss: 1.0425
🚀 GPU speedup: 2.5x

4️⃣ Transformers Library Test


  from .autonotebook import tqdm as notebook_tqdm


Testing small transformer model on GPU...
Model device: cuda:0
Input device: cuda:0
Output shape: torch.Size([1, 12, 768])
Output device: cuda:0
✅ Transformers GPU test successful

5️⃣ Mixed Precision Test
✅ Mixed precision (AMP) test successful
This will speed up training on modern GPUs

6️⃣ Memory Management Test
Testing memory management patterns...
Initial: 0.52 GB allocated, 0.55 GB reserved
After creating tensors: 0.54 GB allocated, 0.55 GB reserved
After deleting tensors: 0.53 GB allocated, 0.55 GB reserved
After cache clear: 0.53 GB allocated, 0.55 GB reserved
✅ Memory management test complete

📊 GPU Test Summary
✅ GPU Ready: NVIDIA GeForce RTX 5090
✅ Total VRAM: 34.2 GB
✅ PyTorch CUDA: 12.8

🚀 RTX 5090/4090 Detected - Excellent for LoRA training!
   • Batch size: 4-8
   • Mixed precision: Recommended
   • 4-bit quantization: Optional

🎯 Ready for LoRA training!

🔧 To install missing dependencies:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org

  scaler = GradScaler()
  with autocast():
  loss = F.mse_loss(output, y)
