In [2]:
import numpy
from cupy.cuda.runtime import getDeviceCount
from mpi4py import MPI
import numpy as np

from cuquantum import Network

import cuquantum as cq

# Einsum-style expression for the tensor network, and the extents of its operands
# (one shape tuple per comma-separated term on the left-hand side of `expr`).
expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba'
shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)]

device = 'cuda'  # NOTE(review): not referenced by the code visible in this cell.

# Create the operands as NumPy arrays drawn from a fixed seed. Every process that runs
# this cell must hold identical operand data, because the partial results of the slices
# are summed across processes further down; unseeded draws would differ per process.
SEED = 0
rng = numpy.random.default_rng(SEED)
operands = [rng.random(shape) for shape in shapes]

# MPI bookkeeping. Every process executes this same cell; `root` is the rank that
# prints the summary lines and receives the summed result.
root = 0
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Assign the device for each process: rank modulo the number of devices reported by
# the CUDA runtime. This has to be known before the network is constructed so that
# it can be handed to the Network through its options.
device_id = rank % getDeviceCount()

# Create the network on this process's device.
with Network(expr, *operands, options={'device_id': device_id}) as network:

    time0 = MPI.Wtime()

    if rank == root: print(f"Contracting {len(operands)} tensors.")

    # Compute the path on all ranks with 4 samples for hyperoptimization. Force slicing
    # (at least one slice per process, and never fewer than 16) to enable parallel contraction.
    path, info = network.contract_path(optimize={'samples': 4, 'slicing': {'min_slices': max(16, size)}})

    # Select the best path from all ranks: MINLOC on (cost, rank) yields the lowest cost
    # together with the rank that found it.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
    if rank == root: print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")

    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)

    # Set path and slices, so that every rank contracts along the same path.
    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})

    time1 = MPI.Wtime()
    duration = time1 - time0
    print(f"Optimising contraction path at {rank} took {duration} sec.")

    # Calculate this process's share of the slices: `chunk` slices each, with the first
    # `extra` ranks taking one additional slice so the remainder is spread evenly.
    num_slices = info.num_slices
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
    slices = range(slice_begin, slice_end)

    print(f"Process {rank} is processing slice range: {slices}.")

    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)

    # Sum the partial contribution from each process on root. After this call `result`
    # holds the full contraction on root and None on every other rank.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
    if rank == root: print(f"Result: {result}")

    time2 = MPI.Wtime()
    duration = time2 - time1
    print(f"Contraction at {rank} took {duration} sec.")

ModuleNotFoundError: No module named 'mpi4py'

In [None]:
import numpy
from cupy.cuda.runtime import getDeviceCount
from mpi4py import MPI
import numpy as np

from cuquantum import Network

import cuquantum as cq

# Einsum-style expression for the tensor network, and the extents of its operands
# (one shape tuple per comma-separated term on the left-hand side of `expr`).
expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba'
shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)]

device = 'cuda'  # NOTE(review): not referenced by the code visible in this cell.

# Create the operands as NumPy arrays drawn from a fixed seed, so that re-running the
# cell (or running it on several processes) always yields the same operand data.
SEED = 0
rng = numpy.random.default_rng(SEED)
operands = [rng.random(shape) for shape in shapes]

# MPI bookkeeping, defined here so the cell does not depend on names left behind by
# another cell. `root` is the rank that prints the summary lines and receives the result.
root = 0
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Assign the device for each process: rank modulo the number of devices reported by
# the CUDA runtime.
device_id = rank % getDeviceCount()

# Create the network on this process's device.
with Network(expr, *operands, options={'device_id': device_id}) as network:

    time0 = MPI.Wtime()

    # Compute the path on all ranks with 4 samples for hyperoptimization. Force slicing to
    # enable parallel contraction; ask for at least one slice per process so that no rank
    # is left with an empty slice range (this is 4 whenever at most 4 processes run).
    path, info = network.contract_path(optimize={'samples': 4, 'slicing': {'min_slices': max(4, size)}})

    # Select the best path from all ranks: MINLOC on (cost, rank) yields the lowest cost
    # together with the rank that found it.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
    if rank == root: print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")

    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)

    # Set path and slices, so that every rank contracts along the same path.
    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})

    time1 = MPI.Wtime()
    duration = time1 - time0
    print(f"Optimising contraction path at {rank} took {duration} sec.")

    # Calculate this process's share of the slices: `chunk` slices each, with the first
    # `extra` ranks taking one additional slice so the remainder is spread evenly.
    num_slices = info.num_slices
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
    slices = range(slice_begin, slice_end)

    print(f"Process {rank} is processing slice range: {slices}.")

    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)

    # Sum the partial contribution from each process on root. After this call `result`
    # holds the full contraction on root and None on every other rank.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
    if rank == root: print(f"Result: {result}")

    time2 = MPI.Wtime()
    duration = time2 - time1
    print(f"Contraction at {rank} took {duration} sec.")