In [14]:
import os
from pathlib import Path
import numpy as np
import pickle as pk
import h5py
import gc


######################################################################################
######################################################################################

def save_to_hdf5(data_input: dict, hdf5_path: Path, gzip: bool = True) -> Path:
    data = data_input
    str_dt = h5py.string_dtype(encoding="utf-8")

    with h5py.File(hdf5_path, "w") as h5f:
        metadata_group = h5f.create_group("metadata")

        loci_array = np.array(data["loci"], dtype=str_dt)
        metadata_group.create_dataset("loci", data=loci_array)

        pheno_names_array = np.array(data["phenotype_names"], dtype=str_dt)
        metadata_group.create_dataset("phenotype_names", data=pheno_names_array)

        strains_group = h5f.create_group("strains")

        for idx, strain_id in enumerate(data["strain_names"]):
            strain_grp = strains_group.create_group(strain_id)

            pheno = np.array(data["phenotypes"][idx], dtype=np.float64)
            strain_grp.create_dataset("phenotype", data=pheno)

            genotype = np.array(data["genotypes"][idx], dtype=np.int8)
            strain_grp.create_dataset(
                "genotype",
                data=genotype,
                chunks=True,
                compression="gzip" if gzip else None,
            )

        print(f"{hdf5_path} generated from {data_input}.")

    return hdf5_path
out_dict={}


In [15]:

######################################################################################
######################################################################################
# Read phenotype file
phen_file = open('test_sim_WF_1kbt_100kups_5mb_p.txt', 'r')
phens = phen_file.read().split('\n')
phens = [x.split() for x in phens if x]  # Skip empty lines

# Extract phenotype information
out_dict = {}
out_dict['phenotype_names'] = phens[0][1:]  # Extract header of pheno names from first row
out_dict['strain_names'] = [x[0] for x in phens[1:] if x]  # Strain names from first column
out_dict['phenotypes'] = []

# Convert phenotypes to float, handling NA values
for x in phens[1:]:
    if not x:  # Skip empty lines
        continue
    row_phenos = []
    for y in x[1:]:
        if y == 'NA':
            row_phenos.append(0)  # Or use None/np.nan if preferred
        else:
            row_phenos.append(float(y))
    out_dict['phenotypes'].append(row_phenos)

phen_file.close()
del phens
gc.collect()

0

In [3]:
len(out_dict['phenotypes'][1])

25

In [16]:

# Read genotype file - CSV format with headers
genotype_file = open('test_sim_WF_1kbt_100kups_5mb_g.txt', 'r')

#Read and process header line for locus names
header = genotype_file.readline().strip()
out_dict['loci'] = header.split(',')

# Create a temporary directory for chunk files
temp_dir = "temp_genotype_chunks"
os.makedirs(temp_dir, exist_ok=True)

# Process genotypes in chunks
new_coding_dict = {'0': [1, 0], '1': [0, 1]}
chunk_size = 10000  # Adjust based on your memory constraints
chunk_files = []
current_chunk = []
chunk_counter = 0

for line in genotype_file:
    line = line.strip()
    if not line:
        continue

    parts = line.split(',')
    if len(parts) <= 1:
        continue

    # Process genotypes for this individual
    ind_genotypes = []
    for geno in parts[1:]:
        if geno.strip() in new_coding_dict:
            ind_genotypes.append(new_coding_dict[geno.strip()])
        else:
            ind_genotypes.append([0, 0])

    current_chunk.append(ind_genotypes)

    # If chunk is full, write to disk and clear memory
    if len(current_chunk) >= chunk_size:
        chunk_filename = os.path.join(temp_dir, f"genotype_chunk_{chunk_counter}.pk")
        with open(chunk_filename, 'wb') as f:
            pk.dump(current_chunk, f)

        chunk_files.append(chunk_filename)
        chunk_counter += 1
        current_chunk = []
        gc.collect()  # Force garbage collection

# Save last chunk if not empty
if current_chunk:
    chunk_filename = os.path.join(temp_dir, f"genotype_chunk_{chunk_counter}.pk")
    with open(chunk_filename, 'wb') as f:
        pk.dump(current_chunk, f)
    chunk_files.append(chunk_filename)

# Close genotype file to free resources
genotype_file.close()

In [24]:
import sys
import numpy as np
from operator import itemgetter

def list_memory_objects(limit=20):
    """
    List objects in memory along with their size.

    Args:
        limit: Number of largest objects to display
    """
    # Get all objects in the global namespace
    objects = globals().items()

    # Calculate size of each object
    object_sizes = []
    for name, obj in objects:
        # Skip modules, functions, and other non-data objects
        if name.startswith('__') or callable(obj) or name == 'list_memory_objects':
            continue

        try:
            size = sys.getsizeof(obj)
            # For numpy arrays, get actual memory usage
            if isinstance(obj, np.ndarray):
                size = obj.nbytes
            # For lists, add size of elements
            elif isinstance(obj, list) and obj:
                size += sum(sys.getsizeof(i) for i in obj)
            # For dictionaries, add size of keys and values
            elif isinstance(obj, dict) and obj:
                size += sum(sys.getsizeof(k) + sys.getsizeof(v) for k, v in obj.items())

            object_sizes.append((name, type(obj).__name__, size, size / (1024 * 1024)))
        except:
            object_sizes.append((name, type(obj).__name__, 0, 0))

    # Sort by size (largest first)
    object_sizes.sort(key=itemgetter(2), reverse=True)

    # Print results
    print(f"{'Name':<20} {'Type':<15} {'Size (bytes)':<15} {'Size (MB)':<10}")
    print("-" * 60)
    for name, type_name, size, size_mb in object_sizes[:limit]:
        print(f"{name:<20} {type_name:<15} {size:<15} {size_mb:<10.2f}")

    return object_sizes

# Example usage
list_memory_objects()

Name                 Type            Size (bytes)    Size (MB) 
------------------------------------------------------------
_                    list            8527459484      8132.42   
_23                  list            8527459484      8132.42   
chunk_data           list            8461130104      8069.16   
ind_genotypes        list            8000984         7.63      
parts                list            5002505         4.77      
out_dict             dict            2405175         2.29      
_20                  dict            2405175         2.29      
_oh                  dict            2313336         2.21      
Out                  dict            2313336         2.21      
header               str             788935          0.75      
line                 str             200049          0.19      
_ih                  list            14499           0.01      
In                   list            14499           0.01      
_i10                 str             2157  

[('_', 'list', 8527459484, 8132.419094085693),
 ('_23', 'list', 8527459484, 8132.419094085693),
 ('chunk_data', 'list', 8461130104, 8069.162467956543),
 ('ind_genotypes', 'list', 8000984, 7.630332946777344),
 ('parts', 'list', 5002505, 4.770760536193848),
 ('out_dict', 'dict', 2405175, 2.2937536239624023),
 ('_20', 'dict', 2405175, 2.2937536239624023),
 ('_oh', 'dict', 2313336, 2.2061691284179688),
 ('Out', 'dict', 2313336, 2.2061691284179688),
 ('header', 'str', 788935, 0.7523870468139648),
 ('line', 'str', 200049, 0.1907815933227539),
 ('_ih', 'list', 14499, 0.013827323913574219),
 ('In', 'list', 14499, 0.013827323913574219),
 ('_i10', 'str', 2157, 0.0020570755004882812),
 ('_i16', 'str', 1768, 0.00168609619140625),
 ('x', 'list', 1766, 0.0016841888427734375),
 ('_i24', 'str', 1720, 0.00164031982421875),
 ('_i1', 'str', 1519, 0.0014486312866210938),
 ('_i2', 'str', 1519, 0.0014486312866210938),
 ('_i14', 'str', 1519, 0.0014486312866210938),
 ('_i15', 'str', 1046, 0.000997543334960937

In [None]:
# Now combine all chunks for final output
out_dict['genotypes'] = []
for chunk_file in chunk_files:
    with open(chunk_file, 'rb') as f:
        chunk_data = pk.load(f)
        out_dict['genotypes'].extend(chunk_data)

    # Delete chunk file after reading to free disk space
    os.remove(chunk_file)

# Remove temporary directory
os.rmdir(temp_dir)

In [None]:
save_to_hdf5(out_dict, 'test_sim_WF_1kbt_100kups_5mb_full.h5')


In [12]:
import h5py
from pathlib import Path

def inspect_hdf5_file(hdf5_path: Path):
    """
    Safely inspect an HDF5 file to check for structural issues
    """
    try:
        with h5py.File(hdf5_path, "r") as h5f:
            # Print the top-level keys
            print("Top-level groups/datasets:", list(h5f.keys()))

            # Examine each top-level key without fully loading the data
            for key in h5f.keys():
                try:
                    item = h5f[key]
                    if isinstance(item, h5py.Group):
                        print(f"Group '{key}' contains: {list(item.keys())}")
                    elif isinstance(item, h5py.Dataset):
                        print(f"Dataset '{key}' has shape: {item.shape} and dtype: {item.dtype}")
                except Exception as e:
                    print(f"Error accessing '{key}': {str(e)}")

            return True
    except Exception as e:
        print(f"Error opening file: {str(e)}")
        return False

In [None]:
in_data = out_dict

out_dict_test = {}
out_dict_train = {}

categories_to_stratefy = ['phenotypes', 'genotypes', 'strain_names']
categories_to_copy = [x for x in in_data.keys() if x not in categories_to_stratefy]

train_length = round(len(in_data['strain_names'])*0.85)

#train set
for x in categories_to_copy:
 out_dict_train[x] = in_data[x]

for x in categories_to_stratefy:
 out_dict_train[x] = in_data[x][:train_length]

#pk.dump(out_dict_train, open('gpatlas/' + file_prefix + '_train.pk','wb'))
save_to_hdf5(out_dict_train, 'test_sim_WF_1kbt_100kups_5mb_train.h5')

del(out_dict_train)

#test set
for x in categories_to_copy:
 out_dict_test[x] = in_data[x]

for x in categories_to_stratefy:
 out_dict_test[x] = in_data[x][train_length:]

#pk.dump(out_dict_test, open('gpatlas/' + file_prefix + '_test.pk','wb'))
save_to_hdf5(out_dict_test, 'test_sim_WF_1kbt_100kups_5mb_test.h5')

del(out_dict_test)