# Slurm Environment Test Notebook

This notebook approximates the Slurm job environment so that both LoRA and full fine-tuning can be smoke-tested before submitting to Rangpur.

## What This Tests:
- ✅ `torchrun` with `--standalone --nproc_per_node=1`
- ✅ CUDA multiprocessing with `spawn` method
- ✅ Model loading order (datasets first, then model)
- ✅ Both LoRA and Full FT training strategies
- ✅ Automatic evaluation after training
- ✅ All the fixes we applied

## Usage:
1. Run the setup cells (sections 1–4) in order
2. Choose which test to run (LoRA or Full FT)
3. Monitor for any errors
4. If successful, submit to Rangpur with confidence!

## Important Note:
This notebook handles both repository structures:
- **Slurm**: Files directly in root (`/home/Student/s4800977/comp3710/a3/src/`, etc.)
- **GitHub**: Files in subdirectory (`recognition/layrad-flant5-lora-nchung/src/`, etc.)


## 1. Environment Setup (Mirrors Slurm)


In [None]:
# Import required libraries
import os
import sys
import subprocess
import multiprocessing
import torch
from pathlib import Path

# Set up environment variables (mirrors Slurm).
# The Hugging Face cache location is defined once so the environment variables
# and the directory that gets created cannot drift apart.
HF_CACHE_DIR = '/content/hf_cache'

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use first GPU
os.environ['HF_HOME'] = HF_CACHE_DIR  # HuggingFace cache
os.environ['TRANSFORMERS_CACHE'] = HF_CACHE_DIR

# Create cache directory.
# parents=True: without it mkdir raises FileNotFoundError whenever the parent
# (/content) does not already exist, e.g. when running outside Colab.
Path(HF_CACHE_DIR).mkdir(parents=True, exist_ok=True)

# Report the runtime so problems can be matched against the Slurm logs
print("🔧 Environment Setup Complete")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
print(f"Python: {sys.executable}")
print(f"PyTorch: {torch.__version__}")
print(f"Multiprocessing start method: {multiprocessing.get_start_method()}")


## 2. Clone Repository & Navigate to Correct Directory


In [None]:
# Clone the repository and navigate to correct directory.
#
# After this cell the working directory is the directory that holds
# requirements.txt and src/train.py, whether the repository keeps them in its
# root (Slurm layout) or in a subdirectory (GitHub layout).
import subprocess
import os

# Set project root (mirrors Slurm)
PROJECT_ROOT = "/content/comp3710/a3"
REPO_URL = "https://github.com/nchung/comp3710-a3.git"  # Replace with your actual repo

# Paths (relative to the project directory) that identify the project
key_files = ["requirements.txt", "src/train.py", "configs/", "scripts/"]


def _report_key_files(paths):
    """Print a found/missing line for each path (relative to the cwd).

    Returns the number of paths that exist.
    """
    found = 0
    for path in paths:
        if os.path.exists(path):
            print(f"✅ Found: {path}")
            found += 1
        else:
            print(f"❌ Missing: {path}")
    return found


# Remove existing directory if it exists
if os.path.exists(PROJECT_ROOT):
    # Step out first: when this cell is re-run the kernel's cwd is inside
    # PROJECT_ROOT, and deleting it would leave git without a valid cwd.
    os.chdir("/")
    subprocess.run(["rm", "-rf", PROJECT_ROOT], check=True)

# Clone repository
print(f"📥 Cloning repository to {PROJECT_ROOT}...")
subprocess.run(["git", "clone", REPO_URL, PROJECT_ROOT], check=True)

# Change to project directory
os.chdir(PROJECT_ROOT)
print(f"✅ Repository cloned and changed to: {os.getcwd()}")

# List contents to verify
print("\n📁 Repository contents:")
subprocess.run(["ls", "-la"], check=True)

# Check for key files in root directory (Slurm structure)
print("\n🔍 Checking for key files in root:")
root_files_found = _report_key_files(key_files)

# If files not found in root, look for subdirectory structure
if root_files_found < 2:  # If we don't have most files in root
    print("\n🔍 Files not in root, checking subdirectories...")
    subprocess.run(["find", ".", "-name", "train.py", "-type", "f"], check=True)
    subprocess.run(["find", ".", "-name", "requirements.txt", "-type", "f"], check=True)

    # Look for the actual project directory
    for root, dirs, files in os.walk("."):
        dirs[:] = [d for d in dirs if d != ".git"]  # never descend into git metadata
        # train.py lives in <project>/src/, not beside requirements.txt, so it
        # must be looked up relative to the candidate directory rather than in
        # `files` (which only lists the candidate's own entries).
        if "requirements.txt" in files and os.path.isfile(os.path.join(root, "src", "train.py")):
            actual_project_dir = root
            print(f"\n✅ Found actual project directory: {actual_project_dir}")
            print(f"📁 Contents of {actual_project_dir}:")
            subprocess.run(["ls", "-la", actual_project_dir], check=True)

            # Change to the actual project directory
            os.chdir(actual_project_dir)
            print(f"✅ Changed to actual project directory: {os.getcwd()}")
            break
    else:
        print("\n⚠️ No directory containing both requirements.txt and src/train.py was found")

# Show final directory structure.
# -maxdepth is a global option and goes before tests such as -type; GNU find
# prints a warning when it comes after them.
print("\n📂 Final directory structure:")
subprocess.run(["find", ".", "-maxdepth", "2", "-type", "d"], check=True)

# Verify we're in the right place
print(f"\n🎯 Current working directory: {os.getcwd()}")
print("🔍 Final check for key files:")
_report_key_files(key_files)


## 3. Install Dependencies (Robust Version)


In [None]:
# Install dependencies (robust version that handles missing requirements.txt).
#
# Strategy: install from requirements.txt when present; if that fails or the
# file is missing, fall back to installing the core packages one at a time so
# a single failing package does not block the rest.
import sys

print("📦 Installing dependencies...")

# Run pip through the kernel's own interpreter: a bare `pip` found on PATH may
# belong to a different Python environment than the one running this notebook.
pip_install = [sys.executable, "-m", "pip", "install"]

core_packages = [
    "transformers>=4.30.0",
    "datasets>=2.12.0",
    "peft>=0.4.0",
    "evaluate>=0.4.0",
    "rouge-score>=0.1.2",
    "accelerate>=0.20.0",
]
# Only added when requirements.txt exists but fails to install
torch_packages = ["torch>=2.0.0", "torchvision", "torchaudio"]


def _install_each(packages):
    """Install packages one by one, reporting (not raising on) each failure."""
    for package in packages:
        try:
            subprocess.run(pip_install + [package], check=True)
            print(f"✅ Installed {package}")
        except subprocess.CalledProcessError:
            print(f"⚠️ Failed to install {package}")


# Check if requirements.txt exists and what's in it
if os.path.exists("requirements.txt"):
    print("📄 Found requirements.txt:")
    with open("requirements.txt", "r", encoding="utf-8") as f:
        content = f.read()
        print(content)

    try:
        # Try to install from requirements.txt
        subprocess.run(pip_install + ["-r", "requirements.txt"], check=True)
        print("✅ Requirements.txt installed successfully")
    except subprocess.CalledProcessError as e:
        print(f"⚠️ Requirements.txt failed: {e}")
        print("📦 Installing core packages manually...")
        _install_each(core_packages + torch_packages)
else:
    print("❌ requirements.txt not found, installing core packages...")
    _install_each(core_packages)

print("✅ Dependencies installation complete")


## 4. Test Configuration Files


In [None]:
# Sanity-check the training configs: each file must exist, parse as YAML, and
# is summarised by model name, training strategy and output directory.
import yaml

configs_to_test = [
    "configs/train_flant5_base_lora.yaml",
    "configs/train_t5_small_full.yaml"
]

# (label, lookup) pairs for the summary lines, evaluated and printed in order
summary_fields = (
    ("Model", lambda cfg: cfg.get('model', {}).get('name', 'Not specified')),
    ("Strategy", lambda cfg: cfg.get('training', {}).get('strategy', 'Not specified')),
    ("Output dir", lambda cfg: cfg.get('output_dir', 'Not specified')),
)

for config_path in configs_to_test:
    print(f"\n🔍 Testing {config_path}...")

    if os.path.exists(config_path):
        try:
            with open(config_path, 'r') as handle:
                config = yaml.safe_load(handle)

            print("✅ Config loaded successfully")
            for label, lookup in summary_fields:
                print(f"   {label}: {lookup(config)}")

        except Exception as err:
            # Covers unreadable files, invalid YAML and unexpected structure
            print(f"❌ Error loading config: {err}")
    else:
        print(f"❌ Config file not found: {config_path}")

print("\n✅ Configuration testing complete")


## 5. Test LoRA Training (Mirrors Slurm Script)


In [None]:
# Test LoRA training with torchrun (same arguments as the Slurm command).
#
# Launches src/train.py through torch.distributed.run as a child process,
# reports its exit code, then checks that the expected checkpoint directory
# was created.
import sys

print("🚀 Testing LoRA Training with torchrun...")
print("This mirrors: conda run -n torch torchrun --standalone --nproc_per_node=1 src/train.py configs/train_flant5_base_lora.yaml")

# Set multiprocessing start method to spawn (our fix).
# NOTE: this only affects the notebook kernel process. The training run below
# is a separate interpreter, so src/train.py has to select `spawn` itself.
multiprocessing.set_start_method('spawn', force=True)
print(f"✅ Multiprocessing start method set to: {multiprocessing.get_start_method()}")

# Use the kernel's interpreter instead of whatever `python` is first on PATH,
# so training runs in the environment the dependencies were installed into.
cmd = [
    sys.executable, "-m", "torch.distributed.run",  # This is torchrun
    "--standalone",
    "--nproc_per_node=1",
    "src/train.py",
    "configs/train_flant5_base_lora.yaml"
]

print(f"\n🔧 Running command: {' '.join(cmd)}")
print("\n📊 Training output:")
print("=" * 80)

try:
    # Run the training command; output streams straight to the cell
    result = subprocess.run(cmd, cwd=os.getcwd())
except (OSError, subprocess.SubprocessError) as e:
    # The process could not be launched at all
    print(f"\n❌ Error running LoRA training: {e}")
else:
    print("\n" + "=" * 80)
    if result.returncode == 0:
        print("✅ LoRA Training completed successfully!")
    else:
        print(f"❌ LoRA Training failed with exit code: {result.returncode}")

print("\n🔍 Checking for output files...")
if os.path.exists("checkpoints/flan-t5-base-lora-biolaysumm"):
    print("✅ LoRA checkpoint directory created")
    subprocess.run(["ls", "-la", "checkpoints/flan-t5-base-lora-biolaysumm"], check=True)
else:
    print("❌ LoRA checkpoint directory not found")


# Slurm Environment Test Notebook

This notebook approximates the Slurm job environment so that both LoRA and full fine-tuning can be smoke-tested before submitting to Rangpur.

## What This Tests:
- ✅ `torchrun` with `--standalone --nproc_per_node=1`
- ✅ CUDA multiprocessing with `spawn` method
- ✅ Model loading order (datasets first, then model)
- ✅ Both LoRA and Full FT training strategies
- ✅ Automatic evaluation after training
- ✅ All the fixes we applied

## Usage:
1. Run the setup cells (sections 1–4) in order
2. Choose which test to run (LoRA or Full FT)
3. Monitor for any errors
4. If successful, submit to Rangpur with confidence!


## 1. Environment Setup (Mirrors Slurm)


In [None]:
# Import required libraries
import os
import sys
import subprocess
import multiprocessing
import torch
from pathlib import Path

# Set up environment variables (mirrors Slurm).
# The Hugging Face cache location is defined once so the environment variables
# and the directory that gets created cannot drift apart.
HF_CACHE_DIR = '/content/hf_cache'

os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use first GPU
os.environ['HF_HOME'] = HF_CACHE_DIR  # HuggingFace cache
os.environ['TRANSFORMERS_CACHE'] = HF_CACHE_DIR

# Create cache directory.
# parents=True: without it mkdir raises FileNotFoundError whenever the parent
# (/content) does not already exist, e.g. when running outside Colab.
Path(HF_CACHE_DIR).mkdir(parents=True, exist_ok=True)

# Report the runtime so problems can be matched against the Slurm logs
print("🔧 Environment Setup Complete")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
print(f"Python: {sys.executable}")
print(f"PyTorch: {torch.__version__}")
print(f"Multiprocessing start method: {multiprocessing.get_start_method()}")


## 2. Clone Repository (Mirrors Slurm Script)


In [None]:
# Clone the repository (mirrors Slurm script behavior).
# Always starts from a fresh clone and leaves the kernel's working directory
# at PROJECT_ROOT.
import subprocess
import os

# Set project root (mirrors Slurm)
PROJECT_ROOT = "/content/comp3710/a3"
REPO_URL = "https://github.com/nchung/comp3710-a3.git"  # Replace with your actual repo

# Remove existing directory if it exists
if os.path.exists(PROJECT_ROOT):
    # Step out first: when this cell is re-run the kernel's cwd is PROJECT_ROOT
    # itself, and deleting it would leave git without a valid cwd.
    os.chdir("/")
    subprocess.run(["rm", "-rf", PROJECT_ROOT], check=True)

# Clone repository
print(f"📥 Cloning repository to {PROJECT_ROOT}...")
subprocess.run(["git", "clone", REPO_URL, PROJECT_ROOT], check=True)

# Change to project directory
os.chdir(PROJECT_ROOT)
print(f"✅ Repository cloned and changed to: {os.getcwd()}")

# List contents to verify
print("\n📁 Repository contents:")
subprocess.run(["ls", "-la"], check=True)


## 3. Install Dependencies (Mirrors Slurm)


In [None]:
# Install dependencies (mirrors Slurm script).
# Any pip failure raises CalledProcessError and stops the cell.
import sys

print("📦 Installing dependencies...")

# Run pip through the kernel's own interpreter: a bare `pip` found on PATH may
# belong to a different Python environment than the one running this notebook.
pip_install = [sys.executable, "-m", "pip", "install"]

# Install from requirements.txt
subprocess.run(pip_install + ["-r", "requirements.txt"], check=True)

# Install additional packages that might be needed.
# NOTE(review): torch is already imported by the setup cell; if this step
# changes the installed torch version the kernel probably needs a restart
# before the new version is used — confirm.
subprocess.run(
    pip_install + ["torch", "torchvision", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cu118"],
    check=True,
)

print("✅ Dependencies installed")


## 4. Test Configuration Files


In [None]:
# Sanity-check the training configs: each file must exist, parse as YAML, and
# is summarised by model name, training strategy and output directory.
import yaml

configs_to_test = [
    "configs/train_flant5_base_lora.yaml",
    "configs/train_t5_small_full.yaml"
]

# (label, lookup) pairs for the summary lines, evaluated and printed in order
summary_fields = (
    ("Model", lambda cfg: cfg.get('model', {}).get('name', 'Not specified')),
    ("Strategy", lambda cfg: cfg.get('training', {}).get('strategy', 'Not specified')),
    ("Output dir", lambda cfg: cfg.get('output_dir', 'Not specified')),
)

for config_path in configs_to_test:
    print(f"\n🔍 Testing {config_path}...")

    if os.path.exists(config_path):
        try:
            with open(config_path, 'r') as handle:
                config = yaml.safe_load(handle)

            print("✅ Config loaded successfully")
            for label, lookup in summary_fields:
                print(f"   {label}: {lookup(config)}")

        except Exception as err:
            # Covers unreadable files, invalid YAML and unexpected structure
            print(f"❌ Error loading config: {err}")
    else:
        print(f"❌ Config file not found: {config_path}")

print("\n✅ Configuration testing complete")


## 5. Test LoRA Training (Mirrors Slurm Script)


In [None]:
# Test LoRA training with torchrun (same arguments as the Slurm command).
#
# Launches src/train.py through torch.distributed.run as a child process in
# PROJECT_ROOT, reports its exit code, then checks that the expected
# checkpoint directory was created there.
import sys

print("🚀 Testing LoRA Training with torchrun...")
print("This mirrors: conda run -n torch torchrun --standalone --nproc_per_node=1 src/train.py configs/train_flant5_base_lora.yaml")

# Set multiprocessing start method to spawn (our fix).
# NOTE: this only affects the notebook kernel process. The training run below
# is a separate interpreter, so src/train.py has to select `spawn` itself.
multiprocessing.set_start_method('spawn', force=True)
print(f"✅ Multiprocessing start method set to: {multiprocessing.get_start_method()}")

# Use the kernel's interpreter instead of whatever `python` is first on PATH,
# so training runs in the environment the dependencies were installed into.
cmd = [
    sys.executable, "-m", "torch.distributed.run",  # This is torchrun
    "--standalone",
    "--nproc_per_node=1",
    "src/train.py",
    "configs/train_flant5_base_lora.yaml"
]

print(f"\n🔧 Running command: {' '.join(cmd)}")
print("\n📊 Training output:")
print("=" * 80)

try:
    # Run the training command; output streams straight to the cell
    result = subprocess.run(cmd, cwd=PROJECT_ROOT)
except (OSError, subprocess.SubprocessError) as e:
    # The process could not be launched at all
    print(f"\n❌ Error running LoRA training: {e}")
else:
    print("\n" + "=" * 80)
    if result.returncode == 0:
        print("✅ LoRA Training completed successfully!")
    else:
        print(f"❌ LoRA Training failed with exit code: {result.returncode}")

# Training ran with cwd=PROJECT_ROOT, so look for its output there rather than
# relative to wherever the kernel's working directory currently is.
lora_checkpoint_dir = os.path.join(PROJECT_ROOT, "checkpoints/flan-t5-base-lora-biolaysumm")

print("\n🔍 Checking for output files...")
if os.path.exists(lora_checkpoint_dir):
    print("✅ LoRA checkpoint directory created")
    subprocess.run(["ls", "-la", lora_checkpoint_dir], check=True)
else:
    print("❌ LoRA checkpoint directory not found")


## 6. Test Full Fine-tuning Training (Mirrors Slurm Script)


In [None]:
# Test Full Fine-tuning with torchrun (same arguments as the Slurm command).
#
# Launches src/train.py through torch.distributed.run as a child process in
# PROJECT_ROOT, reports its exit code, then checks that the expected
# checkpoint directory was created there.
import sys

print("🚀 Testing Full Fine-tuning Training with torchrun...")
print("This mirrors: conda run -n torch torchrun --standalone --nproc_per_node=1 src/train.py configs/train_t5_small_full.yaml")

# Set multiprocessing start method to spawn (our fix).
# NOTE: this only affects the notebook kernel process. The training run below
# is a separate interpreter, so src/train.py has to select `spawn` itself.
multiprocessing.set_start_method('spawn', force=True)
print(f"✅ Multiprocessing start method set to: {multiprocessing.get_start_method()}")

# Use the kernel's interpreter instead of whatever `python` is first on PATH,
# so training runs in the environment the dependencies were installed into.
cmd = [
    sys.executable, "-m", "torch.distributed.run",  # This is torchrun
    "--standalone",
    "--nproc_per_node=1",
    "src/train.py",
    "configs/train_t5_small_full.yaml"
]

print(f"\n🔧 Running command: {' '.join(cmd)}")
print("\n📊 Training output:")
print("=" * 80)

try:
    # Run the training command; output streams straight to the cell
    result = subprocess.run(cmd, cwd=PROJECT_ROOT)
except (OSError, subprocess.SubprocessError) as e:
    # The process could not be launched at all
    print(f"\n❌ Error running Full Fine-tuning training: {e}")
else:
    print("\n" + "=" * 80)
    if result.returncode == 0:
        print("✅ Full Fine-tuning Training completed successfully!")
    else:
        print(f"❌ Full Fine-tuning Training failed with exit code: {result.returncode}")

# Training ran with cwd=PROJECT_ROOT, so look for its output there rather than
# relative to wherever the kernel's working directory currently is.
full_ft_checkpoint_dir = os.path.join(PROJECT_ROOT, "checkpoints/t5-small-full-biolaysumm")

print("\n🔍 Checking for output files...")
if os.path.exists(full_ft_checkpoint_dir):
    print("✅ Full FT checkpoint directory created")
    subprocess.run(["ls", "-la", full_ft_checkpoint_dir], check=True)
else:
    print("❌ Full FT checkpoint directory not found")


## 7. Test Evaluation Script (Mirrors Slurm)


In [None]:
# Test evaluation script (mirrors Slurm eval_runner calls).
#
# For each training strategy, runs src/eval_runner.py with that strategy's
# config, but only when the corresponding checkpoint directory exists.
import sys

print("🔍 Testing Evaluation Script...")

# (label used in messages, checkpoint dir relative to PROJECT_ROOT, config)
evaluation_runs = [
    ("LoRA", "checkpoints/flan-t5-base-lora-biolaysumm", "configs/train_flant5_base_lora.yaml"),
    ("Full FT", "checkpoints/t5-small-full-biolaysumm", "configs/train_t5_small_full.yaml"),
]

for label, checkpoint_rel, config_path in evaluation_runs:
    # The evaluation runs with cwd=PROJECT_ROOT, so resolve the checkpoint
    # against the same directory instead of the kernel's current cwd.
    if not os.path.exists(os.path.join(PROJECT_ROOT, checkpoint_rel)):
        print(f"⚠️ Skipping {label} evaluation - no checkpoint found")
        continue

    print(f"\n📊 Testing {label} Evaluation...")
    # Kernel's interpreter, not whatever `python` happens to be first on PATH
    cmd = [sys.executable, "src/eval_runner.py", config_path]

    try:
        result = subprocess.run(cmd, cwd=PROJECT_ROOT)
    except (OSError, subprocess.SubprocessError) as e:
        # The process could not be launched at all
        print(f"❌ Error running {label} evaluation: {e}")
    else:
        if result.returncode == 0:
            print(f"✅ {label} Evaluation completed successfully!")
        else:
            print(f"❌ {label} Evaluation failed with exit code: {result.returncode}")


## 8. Results Summary


In [None]:
# Final report: list every run directory under ./checkpoints together with
# the contents of its reports/ folder, then print the follow-up checklist.
print("📋 SLURM ENVIRONMENT TEST SUMMARY")
print("=" * 50)

# Inspect what the training / evaluation cells left on disk
checkpoints_dir = Path("checkpoints")
if not checkpoints_dir.exists():
    print("❌ No checkpoints directory found")
else:
    print("\n📁 Checkpoint directories created:")
    for run_dir in checkpoints_dir.iterdir():
        if not run_dir.is_dir():
            continue  # only directories count as checkpoints
        print(f"   ✅ {run_dir.name}")

        # Evaluation output, if any, sits in <run>/reports
        reports_dir = run_dir / "reports"
        if reports_dir.exists():
            report_files = list(reports_dir.glob('*'))
            print(f"      📊 Reports: {report_files}")
        else:
            print("      ⚠️ No reports directory")

next_steps = (
    "\n🎯 Next Steps:",
    "1. If all tests passed, your Slurm scripts should work on Rangpur",
    "2. Submit both training jobs to Rangpur",
    "3. Monitor the logs for any issues",
    "4. Check the results in the checkpoint directories",
)
for step in next_steps:
    print(step)

print("\n✅ Slurm environment test complete!")
