# Tufts HPC setup and discovery

Goals:
- Keep HOME clean; work under `/cluster/tufts/datalab/zwu09` (and class path)
- Inspect storage and cluster resources
- Create a Python env + Jupyter kernel
- Launch a tunneled Jupyter server on a compute node

## Current Session Status
- **Job ID**: 15686311
- **Node**: s1cmp005.pax.tufts.edu
- **Resources**: 2x A100 80GB GPUs, 40 CPUs, 40GB RAM
- **Jupyter Server**: Running on port 8891 with token "allen"



In [1]:
# Test kernel connection and check current session resources
import os
import sys
import torch

print("=== Kernel Connection Test ===")
print(f"Python version: {sys.version}")
print(f"Working directory: {os.getcwd()}")
print(f"User: {os.getenv('USER', 'unknown')}")
print(f"Node: {os.getenv('HOSTNAME', 'unknown')}")
print(f"Job ID: {os.getenv('SLURM_JOB_ID', 'unknown')}")
print()

print("=== GPU Resources ===")
if torch.cuda.is_available():
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"GPU count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")
else:
    print("CUDA not available")
print()

print("=== CPU and Memory ===")
print(f"CPU count: {os.cpu_count()}")
print(f"Memory: {os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024**3):.1f} GB")


=== Kernel Connection Test ===
Python version: 3.12.5 | packaged by conda-forge | (main, Aug  8 2024, 18:36:51) [GCC 12.4.0]
Working directory: /cluster/tufts/datalab/zwu09
User: zwu09
Node: login-prod-01.pax.tufts.edu
Job ID: 15686311

=== GPU Resources ===
CUDA available: True
GPU count: 2
GPU 0: NVIDIA A100 80GB PCIe
  Memory: 79.2 GB
GPU 1: NVIDIA A100 80GB PCIe
  Memory: 79.2 GB

=== CPU and Memory ===
CPU count: 128
Memory: 1007.3 GB


In [None]:
# Storage and cluster discovery - keeping HOME clean
import subprocess

print("=== Storage Information ===")
print("Datalab storage:")
!df -h /cluster/tufts/datalab
print("\nClass storage:")
!df -h /cluster/tufts/em212class
print("\nHome directory usage (should be minimal):")
!du -h --max-depth=1 ~ | sort -hr | head -n 10
print()

print("=== Current Job Status ===")
!squeue -u $USER
print()

print("=== Cluster Partitions ===")
!sinfo -o "%P %a %l %D %c %m %G" | sed -n '1,20p'


In [None]:
# Test GPU functionality with both A100s
import torch
import time

print("=== GPU Functionality Test ===")

if torch.cuda.is_available():
    print(f"Testing {torch.cuda.device_count()} GPUs...")
    
    for i in range(torch.cuda.device_count()):
        print(f"\n--- GPU {i} Test ---")
        torch.cuda.set_device(i)
        
        # Create a test tensor
        device = torch.device(f'cuda:{i}')
        x = torch.randn(1000, 1000, device=device)
        y = torch.randn(1000, 1000, device=device)
        
        # Test computation
        start_time = time.time()
        z = torch.matmul(x, y)
        torch.cuda.synchronize()
        end_time = time.time()
        
        print(f"Matrix multiplication on GPU {i}: {end_time - start_time:.4f} seconds")
        print(f"Result shape: {z.shape}")
        print(f"GPU memory allocated: {torch.cuda.memory_allocated(i) / 1024**2:.1f} MB")
        print(f"GPU memory cached: {torch.cuda.memory_reserved(i) / 1024**2:.1f} MB")
        
        # Clear memory
        del x, y, z
        torch.cuda.empty_cache()
        
    print("\n=== Multi-GPU Test ===")
    if torch.cuda.device_count() > 1:
        # Test data transfer between GPUs
        x = torch.randn(100, 100, device='cuda:0')
        y = x.to('cuda:1')
        print(f"Data transfer from GPU 0 to GPU 1: {x.shape} -> {y.shape}")
        print("Multi-GPU setup working correctly!")
    else:
        print("Only one GPU available")
        
else:
    print("CUDA not available - check your setup")


In [None]:
# System Check with HPC Commands
import subprocess
import time

def run_cmd(cmd, desc):
    print(f"[{time.strftime('%H:%M:%S')}] {desc}")
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10)
        print(result.stdout)
        if result.stderr:
            print(f"STDERR: {result.stderr}")
    except Exception as e:
        print(f"Error: {e}")
    print()

# Check system using proper HPC commands
run_cmd("df -h /cluster/tufts/datalab", "Disk space")
run_cmd("squeue -u $USER", "My jobs") 
run_cmd("nvidia-smi", "GPU status")
run_cmd("pip list | grep -E '(torch|diffusers|transformers)'", "Key packages")


In [None]:
# Simple Diffusion Model Test (with auto-install)
import torch
import time
import os
import subprocess
import sys

print("=== Simple Diffusion Model Test ===")

# Set up cache directories
os.environ['HF_HOME'] = '/cluster/tufts/datalab/zwu09/caches/huggingface'
os.environ['TRANSFORMERS_CACHE'] = '/cluster/tufts/datalab/zwu09/caches/huggingface'
os.environ['TORCH_HOME'] = '/cluster/tufts/datalab/zwu09/caches/torch'

cache_dirs = [
    '/cluster/tufts/datalab/zwu09/caches/huggingface',
    '/cluster/tufts/datalab/zwu09/caches/torch'
]
for cache_dir in cache_dirs:
    os.makedirs(cache_dir, exist_ok=True)

print(f"Cache directories set up: {cache_dirs}")

# Check and install required packages
required_packages = ['diffusers', 'transformers', 'accelerate', 'pillow']
missing_packages = []

for package in required_packages:
    try:
        __import__(package)
        print(f"✅ {package} available")
    except ImportError:
        missing_packages.append(package)
        print(f"❌ {package} not available")

if missing_packages:
    print(f"\nInstalling missing packages: {missing_packages}")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing_packages)
        print("✅ Packages installed successfully")
    except Exception as e:
        print(f"❌ Failed to install packages: {e}")
        print("Continuing with basic GPU tests...")

# Try to run a simple diffusion test
try:
    from diffusers import StableDiffusionPipeline
    import matplotlib.pyplot as plt
    from PIL import Image
    
    print("\n=== Running Diffusion Model Test ===")
    print("Loading Stable Diffusion model...")
    
    # Use a smaller model for testing
    model_id = "runwayml/stable-diffusion-v1-5"
    
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        use_safetensors=True,
        cache_dir='/cluster/tufts/datalab/zwu09/caches/huggingface'
    )
    
    pipe = pipe.to("cuda:0")
    pipe.enable_attention_slicing()
    
    print("✅ Model loaded successfully!")
    
    # Generate a simple image
    prompt = "a simple red circle on white background"
    print(f"Generating image with prompt: '{prompt}'")
    
    start_time = time.time()
    image = pipe(
        prompt,
        num_inference_steps=10,  # Very fast for testing
        guidance_scale=7.5,
        height=256,
        width=256
    ).images[0]
    
    generation_time = time.time() - start_time
    print(f"✅ Image generated in {generation_time:.2f} seconds")
    
    # Save image
    output_path = "/cluster/tufts/datalab/zwu09/simple_diffusion_test.png"
    image.save(output_path)
    print(f"Image saved to: {output_path}")
    
    # Display image
    plt.figure(figsize=(6, 6))
    plt.imshow(image)
    plt.axis('off')
    plt.title(f"Generated in {generation_time:.2f}s on A100")
    plt.show()
    
    print("✅ Diffusion model test completed successfully!")
    
except Exception as e:
    print(f"❌ Diffusion test failed: {e}")
    print("Running fallback GPU performance test...")
    
    # Fallback: Advanced GPU test
    print("\n=== Fallback: Advanced GPU Test ===")
    device = torch.device("cuda:0")
    
    # Test large matrix operations
    sizes = [1000, 2000, 4000]
    for size in sizes:
        try:
            print(f"Testing {size}x{size} matrices...")
            x = torch.randn(size, size, device=device, dtype=torch.float16)
            y = torch.randn(size, size, device=device, dtype=torch.float16)
            
            start = time.time()
            z = torch.matmul(x, y)
            torch.cuda.synchronize()
            end = time.time()
            
            print(f"  Matrix multiplication: {end-start:.4f} seconds")
            print(f"  Memory used: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
            
            del x, y, z
            torch.cuda.empty_cache()
            
        except RuntimeError as e:
            print(f"  ❌ Failed at size {size}: {e}")
            break
    
    print("✅ Fallback GPU test completed!")

# Multi-GPU test
print("\n=== Multi-GPU Test ===")
if torch.cuda.device_count() > 1:
    print(f"Testing {torch.cuda.device_count()} GPUs...")
    
    device0 = torch.device("cuda:0")
    device1 = torch.device("cuda:1")
    
    # Test data transfer
    x0 = torch.randn(1000, 1000, device=device0, dtype=torch.float16)
    start = time.time()
    x0_to_1 = x0.to(device1)
    torch.cuda.synchronize()
    transfer_time = time.time() - start
    print(f"GPU 0 → GPU 1 transfer: {transfer_time:.4f} seconds")
    
    # Test parallel computation
    x1 = torch.randn(1000, 1000, device=device1, dtype=torch.float16)
    start = time.time()
    result0 = torch.matmul(x0, x0)
    result1 = torch.matmul(x1, x1)
    torch.cuda.synchronize()
    parallel_time = time.time() - start
    print(f"Parallel computation: {parallel_time:.4f} seconds")
    
    print("✅ Multi-GPU operations working correctly!")
else:
    print("Only one GPU detected")

# Clean up
torch.cuda.empty_cache()
print("\n✅ All tests completed! Your A100 GPUs are ready for high-performance computing!")


In [None]:
# Create env + kernel under datalab (one-time)
!python3 -m venv /cluster/tufts/datalab/zwu09/envs/hoc
!/cluster/tufts/datalab/zwu09/envs/hoc/bin/pip install --upgrade pip ipykernel jupyterlab
!/cluster/tufts/datalab/zwu09/envs/hoc/bin/python -m ipykernel install --user --name hoc --display-name "Tufts HPC (hoc)"


## Current Session Setup (COMPLETED ✅)

### Resources Allocated:
- **2x A100 80GB GPUs** (Job ID: 15686311)
- **40 CPUs, 40GB RAM**
- **Node**: s1cmp005.pax.tufts.edu

### Jupyter Server Running:
```bash
# Already running on compute node:
jupyter lab --no-browser --ip 127.0.0.1 --port 8891 --ServerApp.token=allen \
  --NotebookApp.notebook_dir=/cluster/tufts/datalab/zwu09
```

### SSH Tunnel Established:
```bash
# Already connected from laptop:
ssh -J zwu09@login.pax.tufts.edu -L 8891:127.0.0.1:8891 zwu09@s1cmp005.pax.tufts.edu
```

### Cursor Connection:
- **Jupyter server URL**: `http://localhost:8891/?token=allen`
- **Kernel**: Tufts HPC (hoc) or allenML2


In [None]:
# Optional: set caches/temp under datalab for this session
export HF_HOME=/cluster/tufts/datalab/zwu09/caches/huggingface
export TRANSFORMERS_CACHE=/cluster/tufts/datalab/zwu09/caches/huggingface
export PIP_CACHE_DIR=/cluster/tufts/datalab/zwu09/caches/pip
export TORCH_HOME=/cluster/tufts/datalab/zwu09/caches/torch
export TMPDIR=/cluster/tufts/datalab/zwu09/tmp
mkdir -p "$HF_HOME" "$TRANSFORMERS_CACHE" "$PIP_CACHE_DIR" "$TORCH_HOME" "$TMPDIR"

echo "HF_HOME=$HF_HOME"
echo "TRANSFORMERS_CACHE=$TRANSFORMERS_CACHE"
echo "PIP_CACHE_DIR=$PIP_CACHE_DIR"
echo "TORCH_HOME=$TORCH_HOME"
echo "TMPDIR=$TMPDIR"


In [None]:
10+10