In [2]:
import numpy as np
import pandas as pd
import json
import time
from ardal import Ardal
import _ardal
import random


def randcoords(npy_matrix, ncoords=10):
    """Draw random ``[row, col]`` index pairs addressing cells of a 2-D matrix.

    Args:
        npy_matrix: Non-empty 2-D array-like. Only ``len(npy_matrix)`` and
            ``len(npy_matrix[0])`` are read.
        ncoords: Number of coordinate pairs to draw (with replacement).

    Returns:
        An integer NumPy array of shape ``(ncoords, 2)``; each row is a
        ``[row, col]`` pair that is a valid index into ``npy_matrix``.

    Raises:
        ValueError: If the matrix has no rows or no columns.
    """
    n_rows = len(npy_matrix)
    n_cols = len(npy_matrix[0]) if n_rows else 0
    if n_rows == 0 or n_cols == 0:
        raise ValueError("Cannot draw coordinates from an empty matrix.")

    coords = []
    for _ in range(ncoords):
        # randrange excludes its upper bound, so every index from 0 up to and
        # including the last row/column can be drawn.
        col = random.randrange(n_cols)
        row = random.randrange(n_rows)
        coords.append([row, col])

    # reshape keeps the (n, 2) shape even when ncoords is 0.
    return np.array(coords, dtype=int).reshape(-1, 2)


def getCoords(json_headers, allele_ids=None, guids=None):
    """Translate GUID / allele-ID labels into ``[row, col]`` matrix coordinates.

    Args:
        json_headers: Mapping with a ``"guids"`` list (row labels) and an
            ``"alleles"`` list (column labels).
        allele_ids: Iterable of allele labels; ``None`` selects every allele
            in the headers.
        guids: Iterable of GUID labels; ``None`` selects every GUID in the
            headers.

    Returns:
        A list of ``[row, col]`` lists, one per (allele, guid) combination,
        with alleles in the outer loop and GUIDs in the inner loop. A label
        that occurs more than once in the headers maps to its first position.

    Raises:
        ValueError: If a requested allele ID or GUID is not in the headers.
    """
    # Identity checks: `== None` is evaluated element-wise on NumPy arrays and
    # makes the `if` ambiguous when an array of labels is passed.
    if allele_ids is None and guids is None:
        print("Please provide a list of guids/allele ids")

    if guids is None:
        guids = json_headers["guids"]

    if allele_ids is None:
        allele_ids = json_headers["alleles"]

    # One pass over each header list replaces a linear scan per lookup.
    # setdefault keeps the first occurrence, matching list.index().
    guid_rows = {}
    for row, label in enumerate(json_headers["guids"]):
        guid_rows.setdefault(label, row)

    allele_cols = {}
    for col, label in enumerate(json_headers["alleles"]):
        allele_cols.setdefault(label, col)

    coords = []
    for aid in allele_ids:
        if aid not in allele_cols:
            raise ValueError(f"Allele ID '{aid}' not found.")
        col = allele_cols[aid]

        for guid in guids:
            if guid not in guid_rows:
                raise ValueError(f"Sample GUID '{guid}' not found.")

            coords.append([guid_rows[guid], col])

    return coords


def encodeCoord(json_headers, coord):
    """Map a ``[guid, allele]`` label pair to its ``[row, col]`` positions in the header lists."""
    row = json_headers["guids"].index(coord[0])
    col = json_headers["alleles"].index(coord[1])
    return [row, col]

def decodeCoord(json_headers, coord):
    """Map a ``[row, col]`` index pair back to its ``[guid, allele]`` labels."""
    guid_label = json_headers["guids"][coord[0]]
    allele_label = json_headers["alleles"][coord[1]]
    return [guid_label, allele_label]

def encodeGuid(json_headers, guid):
    """Return the position of ``guid`` in the ``"guids"`` header list (first match)."""
    guid_labels = json_headers["guids"]
    return guid_labels.index(guid)

def decodeGuid(json_headers, row_coord):
    """Return the GUID label stored at position ``row_coord`` of the ``"guids"`` header list."""
    guid_labels = json_headers["guids"]
    return guid_labels[row_coord]

def encodeAllele(json_headers, allele):
    """Return the position of ``allele`` in the ``"alleles"`` header list (first match)."""
    allele_labels = json_headers["alleles"]
    return allele_labels.index(allele)

def decodeAllele(json_headers, col_coord):
    """Return the allele label stored at position ``col_coord`` of the ``"alleles"`` header list."""
    allele_labels = json_headers["alleles"]
    return allele_labels[col_coord]

In [3]:
def generateMatrix(sample_n=1000, allele_n=50000):
    """Build a dense random binary matrix together with synthetic labels.

    Args:
        sample_n: Number of rows (samples).
        allele_n: Number of columns (alleles).

    Returns:
        A tuple ``(npy_matrix, headers_json)``:
            - npy_matrix: C-contiguous ``uint8`` array of shape
              ``(sample_n, allele_n)`` filled with random 0s and 1s.
            - headers_json: Dict with ``"guids"`` (``sample_<i>``) and
              ``"alleles"`` (``allele_<i>``) label lists.
    """
    # Each cell is drawn independently from {0, 1}.
    random_bits = np.random.randint(0, 2, size=(sample_n, allele_n), dtype='uint8')
    npy_matrix = np.ascontiguousarray(random_bits)

    # Row and column labels are plain running numbers.
    row_labels = ["sample_{}".format(i) for i in np.arange(sample_n)]
    column_labels = ["allele_{}".format(i) for i in np.arange(allele_n)]

    return npy_matrix, {"guids": row_labels, "alleles": column_labels}

import numpy as np


def generateSparseMatrix(sample_n=1000, allele_n=50000, sparsity=0.05):
    """Create a binary matrix in which every row holds the same number of 1s.

    Args:
        sample_n: Number of rows (samples).
        allele_n: Number of columns (alleles).
        sparsity: Proportion of 1s per row (e.g., 0.05 for 5%); the per-row
            count is ``int(sparsity * allele_n)``, i.e. truncated.

    Returns:
        A tuple containing:
            - npy_matrix: ``uint8`` NumPy array of shape ``(sample_n, allele_n)``.
            - headers_json: Dict with ``"guids"`` and ``"alleles"`` label lists.
    """
    npy_matrix = np.zeros((sample_n, allele_n), dtype='uint8')

    # The count of 1s depends only on the arguments, not on the row.
    ones_per_row = int(sparsity * allele_n)

    for row in range(sample_n):
        # Distinct column positions for this row's 1s.
        hot_columns = np.random.choice(allele_n, size=ones_per_row, replace=False)
        npy_matrix[row, hot_columns] = 1

    headers_json = {
        "guids": ["sample_{}".format(i) for i in np.arange(sample_n)],
        "alleles": ["allele_{}".format(i) for i in np.arange(allele_n)],
    }

    return npy_matrix, headers_json


import numpy as np

def generateClusteredSparseMatrix(sample_n=1000, allele_n=50000, sparsity=0.05, n_clusters=10, cluster_spread=0.1):
    """Generates a sparse binary matrix whose rows are noisy copies of cluster centroids.

    Each cluster has a random centroid row; every sample in the cluster is the
    centroid with ``int(cluster_spread * allele_n)`` randomly chosen bits flipped.

    Args:
        sample_n: Number of rows (samples).
        allele_n: Number of columns (alleles).
        sparsity: Probability of a 1 at each centroid position. Bit flips then
            move the final density away from this value.
        n_clusters: Number of clusters to generate (must be at least 1).
        cluster_spread: Fraction of positions flipped per sample (higher values
            lead to more spread within a cluster).

    Returns:
        A tuple containing:
            - npy_matrix: ``uint8`` NumPy array of shape ``(sample_n, allele_n)``.
            - headers_json: Dictionary containing row and column labels.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1.
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1.")

    npy_matrix = np.zeros((sample_n, allele_n), dtype='uint8')

    # Generate one sparse centroid per cluster.
    centroids = [np.random.binomial(1, sparsity, size=allele_n) for _ in range(n_clusters)]

    # The number of flipped bits is the same for every sample.
    n_flips = int(cluster_spread * allele_n)

    for i in range(n_clusters):
        # Proportional boundaries cover every row exactly once, even when
        # sample_n is not a multiple of n_clusters. When it is a multiple,
        # they equal i * (sample_n // n_clusters).
        start_index = (i * sample_n) // n_clusters
        end_index = ((i + 1) * sample_n) // n_clusters

        for j in range(start_index, end_index):
            # Start from the centroid and flip a random subset of positions.
            new_sample = centroids[i].copy()
            flip_indices = np.random.choice(allele_n, size=n_flips, replace=False)
            new_sample[flip_indices] = 1 - new_sample[flip_indices]
            npy_matrix[j] = new_sample

    guids = [f"sample_{i}" for i in np.arange(sample_n)]
    alleles = [f"allele_{i}" for i in np.arange(allele_n)]
    headers_json = {"guids": guids, "alleles": alleles}

    return npy_matrix, headers_json

In [None]:
# Convert the usher_barcodes.csv table (first column used as the row index)
# into a uint8 matrix file plus a JSON header file in the working directory.
df = pd.read_csv("/home/amorris/BioInf/Ardal_WD/databases/usher_barcodes.csv", index_col=0)
matrix_array = df.values.astype(np.uint8)

np.save("usher_matrix.npy", matrix_array)

# Row labels go under "guids" and column labels under "alleles": these are the
# keys every header consumer in this notebook reads (getCoords, encodeGuid, ...).
headers = {
    "guids": df.index.tolist(),
    "alleles": df.columns.tolist(),
}

with open("usher_headers.json", "w") as f:
    json.dump(headers, f, indent=4)

In [None]:
# Load the Cparv header/matrix pair and look up a handful of (guid, allele) cells.
file_path = ["./data/Cparv_headers.json", "./data/Cparv_matrix.npy"]
# bg_ard = Ardal(file_path)

npy_matrix = np.ascontiguousarray(np.load(file_path[1]))
with open(file_path[0], 'r') as fin:
    json_headers = json.load(fin)

print(len(json_headers["alleles"]))

allele_ids = ["allele2", "allele7", "allele12"]
guids = ["guid1", "guid5", "guid6", "guid10"]
# One [row, col] pair per (allele, guid) combination; alleles form the outer loop.
coords = getCoords(json_headers, allele_ids=allele_ids, guids=guids)

# [guid, allele] label pairs in the same order as coords.
aid_guid_pairs = [decodeCoord(json_headers, coord) for coord in coords]

# NOTE(review): the imports visible in this notebook bind `Ardal` and `_ardal`,
# but no name `ardal`, so this line raises NameError on a fresh kernel.
# Presumably the call lives on `_ardal` or an `Ardal` instance; confirm.
bit_array = ardal.allele_set_membership(npy_matrix, coords)

# Keep the label pairs whose entry in bit_array is truthy.
result = [aid_guid_pairs[i] for i, b in enumerate(bit_array) if b]

In [None]:
# Synthetic dense test data: 100000 samples x 5000 alleles of random 0/1 values.
# Rebinds npy_matrix and json_headers for the cells that follow.
npy_matrix, json_headers = generateMatrix(sample_n=100000, allele_n=5000)

In [None]:
# Pick five GUID labels and five allele labels at random from the headers.
# np.random.choice samples with replacement by default, so repeats are possible.
guids = list(np.random.choice(json_headers["guids"], 5))
allele_ids = list(np.random.choice(json_headers["alleles"], 5))

# Encode every (allele, guid) combination as [row, col], then decode back to
# [guid, allele] labels in the same order.
coords = getCoords(json_headers, allele_ids=allele_ids, guids=guids)
aid_guid_pairs = [decodeCoord(json_headers, coord) for coord in coords]

print(coords)

In [None]:
# Load the small test matrix and draw 10 random [row, col] pairs from it.
npy_matrix = np.load("./data/test_csv_matrix.npy")
# test_matrix = np.array([[0, 1, 1, 0],[1, 0, 0, 1], [0, 1, 0, 1], [1, 0, 0, 1]])
# coords = np.array([[0, 0], [0, 1], [0, 2], [0, 3], [2, 1]], dtype=int)
coords = randcoords(npy_matrix, 10)

In [None]:
def isolate_and_save(allele_matrix, output_prefix):
    """Write a labelled table as a uint8 ``.npy`` matrix plus a JSON header file.

    Args:
        allele_matrix: DataFrame-like object; its ``.values``, ``.index`` and
            ``.columns`` are read.
        output_prefix: Path prefix. The outputs are ``<prefix>_matrix.npy``
            and ``<prefix>_headers.json``.

    Returns:
        None. Any exception raised while converting or writing is caught and
        reported with ``print``; it is not re-raised.
    """
    try:
        matrix_path = f"{output_prefix}_matrix.npy"
        headers_path = f"{output_prefix}_headers.json"

        # Cell values are cast to unsigned 8-bit and stored in NumPy's binary format.
        np.save(matrix_path, allele_matrix.values.astype(np.uint8))

        # Row labels are stored under "guids", column labels under "alleles".
        row_labels = allele_matrix.index.tolist()
        column_labels = allele_matrix.columns.tolist()
        with open(headers_path, "w") as handle:
            json.dump({"guids": row_labels, "alleles": column_labels}, handle, indent=4)

        print(f"Matrix saved as '{matrix_path}', headers as '{headers_path}'")

    except Exception as err:
        print(f"An error occurred: {err}")

# Read the Cparv CSV (first row = column labels, first column = row labels) and
# write Cparv_matrix.npy / Cparv_headers.json into the working directory.
df = pd.read_csv("./data/Cparv_matrix.csv", header=0, index_col=0)
isolate_and_save(df, "Cparv")

In [None]:
import random

# Load the test_csv header/matrix pair.
file_path = ["./data/test_csv_headers.json", "./data/test_csv_matrix.npy"]
# bg_ard = Ardal(file_path)

npy_matrix = np.load(file_path[1])
with open(file_path[0], 'r') as fin:
    json_headers = json.load(fin)

# guids is omitted, so getCoords pairs each listed allele with every GUID in the headers.
allele_ids = ["allele2", "allele7", "allele12"]
coords = getCoords(json_headers, allele_ids=allele_ids)

In [None]:
# Baseline: time a pure-Python/NumPy lookup of every [row, col] pair in coords.
print(npy_matrix)
print(coords)

# Only the lookup itself is timed; printing happens outside the timed region.
s = time.time()
looked_up = np.array([npy_matrix[row, col] for row, col in coords])
e = time.time()

print(looked_up)
# Elapsed seconds is end minus start (the reverse order yields a negative number).
print(e-s)

In [None]:
# npy_matrix, json_headers = generateMatrix(sample_n=10000, allele_n=500)
# Load the BG_pan matrix as a C-contiguous array and print its total cell count
# (columns in the first row times number of rows).
npy_matrix = np.ascontiguousarray(np.load("./data/BG_pan_matrix.npy"))
print(len(npy_matrix[0])*len(npy_matrix))

# Draw 10 random [row, col] pairs from the matrix.
coords = randcoords(npy_matrix, 10)
print(coords)

In [None]:
import sys
import numpy as np
import json
import _ardal

# Load the usher matrix (made C-contiguous) and its headers from ../data.
npy_matrix = np.ascontiguousarray(np.load("../data/usher_matrix.npy"))
with open("../data/usher_headers.json", 'r') as fin:
    json_headers = json.load(fin)

# Size of the matrix buffer in bytes (element count times bytes per element).
print(npy_matrix.size * npy_matrix.itemsize)

# s = time.time()
# Wrap the array in the extension's AlleleMatrix type.
ardmat = _ardal.AlleleMatrix(npy_matrix)
# result = ardmat.hamming()
# result = ardal_mat.access(coords)
# result = ardal.accessAlleleMatrix(npy_matrix, coords)
# e = time.time()
# print(npy_matrix.dtype, result)
# print(e-s)

## uint8 provides a massive speedup compared to int64 or int8
# packed = npy_matrix.astype('int8')
# print(packed.size * packed.itemsize)

# s = time.time()
# result = ardal.accessAlleleMatrix(packed, coords)
# e = time.time()
# print(packed.dtype, result)
# print(e-s)

# Elsewhere in this notebook neighbourhoodSIMD is called with an encoded GUID row
# index and a value n, and its result is iterated as pairs whose first element is
# a row index. Presumably 41 is a row and 2 a distance threshold; confirm.
neigh = ardmat.neighbourhoodSIMD(41, 2)
neigh

In [None]:
from scipy.spatial.distance import pdist, squareform

# NOTE(review): `result` is not assigned by any uncommented line in the preceding
# cell, so this cell relies on kernel state left over from another run.
# Presumably it is the condensed pairwise output of ardmat.hamming(); confirm.
# squareform expands a condensed distance vector into a square matrix.
ard_d_out = np.array(squareform(result))
# Label both axes with the GUIDs and write the matrix to CSV.
ard_d_df = pd.DataFrame(ard_d_out, columns=json_headers["guids"], index=json_headers["guids"])
ard_d_df.to_csv("usher_hamming.csv")

In [6]:
import _ardal
import time
import numpy as np
import json
from scipy.spatial.distance import pdist, squareform
import requests


# npy_matrix = np.ascontiguousarray(np.load("./data/usher_matrix.npy"))
# with open("./data/usher_headers.json", 'r') as fin:
#     json_headers = json.load(fin)
# npy_matrix, json_headers = generateClusteredSparseMatrix(sample_n=200000, allele_n=5000, sparsity=0.01, cluster_spread=0.0001, n_clusters=100)
# npy_matrix = np.ascontiguousarray(\
#     [[1, 1, 1, 1, 1], 
#      [0, 1, 1, 1, 1], 
#      [0, 0, 1, 1, 1], 
#      [0, 0, 0, 1, 1], 
#      [0, 0, 0, 0, 1], 
#      [0, 0, 0, 0, 0], 
#      [0, 1, 0, 1, 0], 
#      [1, 0, 1, 0, 1]])


# npy_matrix is whatever an earlier cell left in the kernel; every loader in
# this cell is commented out.
# Print the row count and the number of unordered row pairs, n*(n-1)/2.
_n = len(npy_matrix)
print(_n, _n*(_n-1)/2)

ardmat = _ardal.AlleleMatrix(npy_matrix)
# WARNING: the recorded run of this cell (200000 rows) ended in a MemoryError for
# a 74.5 GiB int32 array with one entry per row pair, apparently raised by this
# hamming() call. Use a smaller matrix before re-running.
ardmat.hamming()

200000 19999900000.0


MemoryError: Unable to allocate 74.5 GiB for an array with shape (19999900000,) and data type int32

In [None]:
from ardal import Ardal

# Build an Ardal object from the Cparv matrix/header pair.
# NOTE(review): the order here is [matrix, headers], while the commented-out
# `Ardal(file_path)` calls elsewhere in this notebook use [headers, matrix].
# Confirm which order Ardal expects. The paths are absolute and machine-specific.
data = ["/home/amorris/BioInf/Ardal/data/Cparv_matrix.npy", "/home/amorris/BioInf/Ardal/data/Cparv_headers.json"]
ard = Ardal(data)

In [None]:
# Neighbourhood query for one sample label with the SIMD path switched off.
# Presumably 100 is the distance threshold; confirm against the Ardal API.
ard.neighbourhood("SRR6147472_UKP3", 100, simd=False)

In [None]:
# Time a SIMD neighbourhood query for one GUID.
# Relies on `ardmat` and `json_headers` from earlier cells.
n = 10
guid = "XCH.1"
# Row index of the GUID in the headers.
e_guid = encodeGuid(json_headers, guid)

s = time.time()
simd_result = ardmat.neighbourhoodSIMD(e_guid, n)
e = time.time()
# Each result item unpacks into (row index, value); decode the row index back to
# its GUID label and sort the [guid, value] lists by GUID.
simd_dr = sorted([[decodeGuid(json_headers, r), c] for r, c in simd_result], key=lambda x: x[0])
# Elapsed seconds and number of neighbours returned.
print(e-s, len(simd_dr))

# s = time.time()
# result = ardmat.neighbourhood(e_guid, n)
# e = time.time()
# print(e-s, [decodeGuid(json_headers, r) for r in result])

# s = time.time()
# cw_result = requests.get(f"http://localhost:5000/neighbours/{guid}/{n+10}").json()
# e = time.time()
# print(e-s, sorted([[i, int(j)] for i, j in cw_result if int(j) <= n], key=lambda x: x[0]))

# ard_d_df = pd.DataFrame(np.array(squareform(result)), columns=json_headers["guids"], index=json_headers["guids"])
# ard_d_df.to_csv("./clustered_sparse_sim.csv")

In [None]:
import requests

# Time the same neighbour query against the HTTP service on localhost:5000 and
# compare its GUID set with the SIMD result. Relies on `guid`, `n` and `simd_dr`
# from the previous cell.
s = time.time()
cw_result = requests.get(f"http://localhost:5000/neighbours/{guid}/{n-1}").json()
e = time.time()

print(e-s)

# GUIDs whose reported value is at most n-1. Sorting before building a set has
# no effect, so the set is built directly.
cw_dr = {i for i, c in cw_result if int(c) <= n-1}

# simd_dr holds [guid, value] lists. Lists are unhashable, so passing them to the
# set operations raises TypeError, and they would never equal a bare GUID string.
# Reduce them to GUID labels first.
# NOTE(review): the SIMD query used n while the service is filtered at n-1;
# confirm the two thresholds are meant to be equivalent.
simd_guids = {g for g, _ in simd_dr}

intersection = cw_dr.intersection(simd_guids)
union = cw_dr.union(simd_guids)
symmetric_difference = cw_dr.symmetric_difference(simd_guids)

# print(f"Intersection: {intersection}")
# print(f"Union: {union}")
# print(f"Symmetric Difference: {symmetric_difference}")

# print(simd_dr)
print([(i, c) for i, c in cw_result if int(c)<=n-1])

In [None]:
# for i, c in cw_result:
#     ard_d = ard_d_df[guid][i]
#     if int(ard_d) != int(c):
#         print(i, c, ard_d)

# Single entry of the labelled square matrix: column `guid`, row 'A.1'.
# Relies on `ard_d_df` and `guid` from earlier cells.
ard_d_df[guid]['A.1']

In [None]:
# Time a non-SIMD neighbourhood query on the current npy_matrix.
# The extension module is imported as `_ardal` in this notebook; no name `ardal`
# is bound, so the constructor is reached through `_ardal` as in the other cells.
ardmat = _ardal.AlleleMatrix(npy_matrix)

s = time.time()
# result = ardmat.hamming()
# Presumably 6969 is a row index and 175 a distance threshold; confirm.
result = ardmat.neighbourhood(6969, 175)

e = time.time()

# Elapsed seconds and number of items returned.
print(e-s, len(result))