In [59]:
# Install the Open MPI packages via apt, then the Python packages that the
# scripts below import (mpi4py, numba).
# NOTE(review): package versions are unpinned — consider pinning them so the
# notebook stays reproducible.
!apt-get update
!apt-get install -y openmpi-bin libopenmpi-dev
!pip install mpi4py numba


0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.1                                                                               Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acq

In [60]:
# Smoke test: confirm mpi4py imports and print the MPI version tuple it reports.
!python3 -c "from mpi4py import MPI; print(MPI.Get_version())"


(3, 1)


In [61]:
# NOTE(review): this cell repeats the installation commands of cell In [59];
# presumably a leftover from re-running — one of the two cells can likely be removed.
!apt-get update
!apt-get install -y openmpi-bin libopenmpi-dev
!pip install mpi4py numba


Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading packag

In [62]:
# Confirm the mpirun launcher is on PATH and show which Open MPI version it is.
!mpirun --version


mpirun (Open MPI) 4.1.2

Report bugs to http://www.open-mpi.org/community/help/


In [63]:
%%writefile create_db.py
import random

def random_dna(length, rng=None):
    """Generate a random DNA sequence of given length.

    Args:
        length: Number of bases to generate.
        rng: Optional random source exposing ``choice`` (for example a
            ``random.Random`` instance). When omitted, the module-level
            ``random`` generator is used, exactly as before.

    Returns:
        A string of ``length`` characters drawn from "ATCG".
    """
    source = random if rng is None else rng
    return ''.join(source.choice("ATCG") for _ in range(length))

def create_dna_database(filename="dna_database.txt", num_entries=2000, seq_length=12, seed=None):
    """
    Create a DNA database file with specified number of entries and sequence length.
    Each line: ID,Name,DNA_Sequence

    Args:
        filename: Path of the text file to (over)write.
        num_entries: Number of records; IDs run from 1 to ``num_entries``.
        seq_length: Length of every generated sequence.
        seed: Optional seed. When given, sequences come from a private
            ``random.Random(seed)`` so the same seed always produces the same
            file; when ``None`` the global ``random`` state is used, which
            matches the previous behaviour.
    """
    # A private generator keeps a seeded run from disturbing (or being
    # disturbed by) other users of the global random state.
    rng = None if seed is None else random.Random(seed)
    # Pin the encoding so the file is written the same way on every platform.
    with open(filename, "w", encoding="utf-8") as f:
        for i in range(1, num_entries + 1):
            id_ = i
            name = f"Person_{i}"
            sequence = random_dna(seq_length, rng)
            f.write(f"{id_},{name},{sequence}\n")
    print(f"✅ DNA database created: {filename} with {num_entries} entries of {seq_length}-letter sequences")

# Script entry point: generate the 2000-entry database of 12-letter sequences.
if __name__ == "__main__":
    create_dna_database(seq_length=12, num_entries=2000)


Overwriting create_db.py


In [64]:
# Run the generator script written by the previous cell; with its defaults it
# writes dna_database.txt into the current working directory.
!python create_db.py


✅ DNA database created: dna_database.txt with 2000 entries of 12-letter sequences


In [65]:
%%writefile mpi_cuda_top5.py
from mpi4py import MPI
import numpy as np
from numba import cuda
import sys

# ================= MPI SETUP =================
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# ================= DNA MASK =================
# Only rank 0 looks at the command line: the first argument, upper-cased, is the
# search mask, and "ATCG" is the fallback when no argument was supplied.
# Every other rank starts with a placeholder that the broadcast overwrites.
if rank != 0:
    DNA_MASK = None
else:
    DNA_MASK = sys.argv[1].upper() if len(sys.argv) > 1 else "ATCG"

# Hand rank 0's mask to all ranks so they search for the same pattern.
DNA_MASK = comm.bcast(DNA_MASK, root=0)
MASK_LEN = len(DNA_MASK)

# ================= CUDA KERNEL =================
@cuda.jit
def dna_kernel(dna_array, mask, results):
    """Count how often ``mask`` occurs in each row of ``dna_array``.

    Each thread takes the row whose index equals its global 1-D thread index
    and writes that row's match count to ``results``. Overlapping occurrences
    are counted, because the window start advances one position at a time.

    Args:
        dna_array: 2-D array with one encoded sequence per row.
        mask: 1-D array holding the encoded pattern to search for.
        results: 1-D output array with one slot per row of ``dna_array``.
    """
    i = cuda.grid(1)
    # Threads past the last row (from rounding the grid up) do nothing.
    if i < dna_array.shape[0]:
        count = 0
        # A window starting at j covers j .. j + len(mask) - 1, so the last
        # valid start is len(row) - len(mask). The "+ 1" makes range() include
        # it; without it a match at the very end of a sequence was never seen.
        for j in range(dna_array.shape[1] - mask.shape[0] + 1):
            match = True
            for k in range(mask.shape[0]):
                if dna_array[i, j + k] != mask[k]:
                    match = False
                    break
            if match:
                count += 1
        results[i] = count

# ================= READ DATABASE =================
# Every rank reads the whole file and then keeps only its own slice.
# Blank lines (for example a trailing empty line) are dropped up front so they
# neither break the three-field unpacking below nor skew the partition.
with open("dna_database.txt") as f:
    lines = [line for line in f if line.strip()]

# Balanced contiguous partition: rank r owns lines[r*n//size : (r+1)*n//size].
# Slices stay in file order across ranks, and the remainder is spread over the
# ranks instead of being piled onto the last one.
start = rank * len(lines) // size
end = (rank + 1) * len(lines) // size

ids = []
names = []
dna_strings = []

for line in lines[start:end]:
    # Each record is "ID,Name,DNA_Sequence".
    record_id, name, dna = line.strip().split(",")
    ids.append(record_id)
    names.append(name)
    dna_strings.append(dna)

if dna_strings:
    # One row of byte codes per sequence.
    # NOTE(review): assumes all sequences share one length — confirm for
    # databases not produced by create_db.py.
    dna_np = np.array([list(seq.encode()) for seq in dna_strings], dtype=np.uint8)
else:
    # An empty slice would otherwise become a 1-D array; keep it 2-D so code
    # that reads shape[1] still works when this rank has no rows.
    dna_np = np.empty((0, 0), dtype=np.uint8)
mask_np = np.array(list(DNA_MASK.encode()), dtype=np.uint8)
results = np.zeros(len(dna_np), dtype=np.int32)

# ================= CUDA EXECUTION =================
# The kernel maps one thread to one row. Use 128 threads per block and round
# the block count up so a partly filled last block covers the remaining rows.
threads = 128
blocks = (len(dna_np) + threads - 1) // threads
# A rank with no rows computes blocks == 0, and a zero-sized grid is not a
# valid CUDA launch configuration. Skip the launch in that case; `results` is
# already empty, so there is nothing to compute.
if blocks > 0:
    dna_kernel[blocks, threads](dna_np, mask_np, results)

# ================= GATHER RESULTS =================
# Pair every local record with its match count (converted to a plain Python
# int) and collect the per-rank lists on rank 0.
local_data = [
    (record_id, person, sequence, int(hits))
    for record_id, person, sequence, hits in zip(ids, names, dna_strings, results)
]
all_data = comm.gather(local_data, root=0)

# ================= TOP 5 MATCHES =================
if rank == 0:
    # Flatten the gathered per-rank lists into one list of records.
    merged = []
    for per_rank in all_data:
        merged.extend(per_rank)

    # Keep only sequences with at least one match, order them by match count
    # (highest first; the stable sort leaves ties in their gathered order),
    # and take at most five.
    hits = [record for record in merged if record[3] > 0]
    hits.sort(key=lambda record: record[3], reverse=True)
    top5 = hits[:5]

    print("===== TOP 5 MATCHES =====")
    for position, entry in enumerate(top5, start=1):
        id_, name, seq, count = entry
        print(f"{position}. ID: {id_} | Name: {name} | Matches: {count}")
        print(f"Sequence: {seq}\n")



Overwriting mpi_cuda_top5.py


In [66]:
# Launch the search with 2 MPI processes; "ACTG" is passed as the script's first
# argument, which rank 0 uses as the DNA mask.
# --allow-run-as-root and --oversubscribe are Open MPI launcher options
# (presumably needed because the notebook runs as root on a small VM — confirm).
!mpirun --allow-run-as-root --oversubscribe -np 2 python mpi_cuda_top5.py ACTG


===== TOP 5 MATCHES =====
1. ID: 8 | Name: Person_8 | Matches: 1
Sequence: TACTGCCTCATC

2. ID: 73 | Name: Person_73 | Matches: 1
Sequence: TCAACTGGCTCC

3. ID: 111 | Name: Person_111 | Matches: 1
Sequence: CACTGGTATGGT

4. ID: 141 | Name: Person_141 | Matches: 1
Sequence: AGAAGACTGATA

5. ID: 187 | Name: Person_187 | Matches: 1
Sequence: ACTGTCCTTGCG

