In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

# Read the tab-separated regulome edge table.
edges = pd.read_csv("../../Data/tables/T2.regulome.edges.txt", sep="\t")
# Read the tab-separated node table and expose its 'chr' column under the
# additional name 'chrm'.
nodes = pd.read_csv("../../Data/tables/T3.regulome.nodes.txt", sep="\t")
nodes['chrm'] = nodes['chr']

# Load the precomputed downstream nodes from file
def load_precomputed_downstream_nodes(file):
    """Return the object unpickled from the file at path `file`.

    The file is opened in binary mode and is closed again when the function
    returns. Unpickling can execute arbitrary code, so this must only be
    pointed at pickle files from a trusted source.
    """
    with open(file, mode='rb') as pickle_handle:
        return pickle.load(pickle_handle)

# Restore the pickled downstream-node mapping from disk
downstream_map = load_precomputed_downstream_nodes(
    '../../Data/Precomputed_Downstream_Nodes.pkl'
)

# Node names in node-table row order; this is the column order of every embedding
all_nodes = nodes['name'].tolist()
N = len(all_nodes)
# Reverse lookup: node name -> position in `all_nodes`
# (if a name occurred twice, the later position would win)
node_to_index = dict(zip(all_nodes, range(N)))

# Function to process a batch of files and generate multiclass embeddings
def process_batch(batch_files, batch_number, tf_match_results_dir, output_dir):
    """Build and save the multiclass TF embeddings for one batch of files.

    For every file in `batch_files`, column index 8 of the headerless,
    tab-separated file is read as a list of node names. Each distinct name
    that is a key of the module-level `downstream_map` contributes to that
    individual's vector of length `N` (one slot per entry of `all_nodes`):

    * 0 -- default, node untouched;
    * 1 -- node is listed in `downstream_map[name]` for at least one matched name;
    * 2 -- node is itself a matched name (direct match).

    A direct match always wins over a downstream mark, regardless of the order
    in which names appear in the file. Names that are not keys of
    `downstream_map` are ignored entirely.

    The batch is written to
    `<output_dir>/tf_embeddings_batch_<batch_number>.csv`, one row per
    individual and one column per node.

    Args:
        batch_files: File names (not paths) inside `tf_match_results_dir`.
        batch_number: Number used in the progress-bar label and output name.
        tf_match_results_dir: Directory containing the files in `batch_files`.
        output_dir: Existing directory that receives the batch CSV.

    Returns:
        None. The result is written to disk and a status line is printed.
    """
    tf_embeddings = {}  # individual ID -> embedding vector for this batch

    with tqdm(total=len(batch_files), desc=f"Processing Batch {batch_number}") as pbar:
        for file in batch_files:
            # Third '_'-separated token of the file name.
            # NOTE(review): for names like 'tf_matched_SSC08007.bed' this keeps
            # the '.bed' suffix in the ID -- confirm downstream consumers expect it.
            individual_id = file.split('_')[2]
            individual_vector = np.zeros(N, dtype=int)  # Default value 0

            # Load the matching results for this individual
            file_path = os.path.join(tf_match_results_dir, file)
            df = pd.read_csv(file_path, sep='\t', header=None, names=['TF'], usecols=[8])

            # Collect every index first and write afterwards. Writing per TF
            # (downstream = 1, then TF = 2) let a later TF's downstream list
            # overwrite an earlier direct match with 1.
            downstream_indices = set()
            direct_indices = []
            for tf_node in df['TF'].unique():
                if tf_node not in downstream_map:
                    continue
                downstream_indices.update(
                    node_to_index[node]
                    for node in downstream_map[tf_node]
                    if node in node_to_index
                )
                tf_index = node_to_index.get(tf_node)
                if tf_index is not None:
                    direct_indices.append(tf_index)

            individual_vector[list(downstream_indices)] = 1  # Downstream, set to 1
            individual_vector[direct_indices] = 2  # Direct match, written last so it wins

            # Store the embedding for this individual
            tf_embeddings[individual_id] = individual_vector

            # Update the progress bar
            pbar.update(1)

    # Convert the batch embeddings to a DataFrame and save the results
    tf_embeddings_df = pd.DataFrame.from_dict(tf_embeddings, orient='index', columns=all_nodes)
    batch_output_file = os.path.join(output_dir, f'tf_embeddings_batch_{batch_number}.csv')
    tf_embeddings_df.to_csv(batch_output_file)

    print(f"Batch {batch_number} saved to {batch_output_file}")

# Load and process the TF_Match_Results folder
tf_match_results_dir = "../../Data/TF_Match_Results"
# os.listdir() returns entries in arbitrary order; sort them so that the
# contents of each batch file are reproducible across runs and machines.
tf_files = sorted(file for file in os.listdir(tf_match_results_dir) if file.endswith(".bed"))

# Process the files in batches of 128
batch_size = 128
# Ceiling division: the final batch may hold fewer than `batch_size` files
num_batches = (len(tf_files) + batch_size - 1) // batch_size

# Create output directory if it doesn't exist
output_dir = '../../Data/TF_Embeddings_Multiclass'
os.makedirs(output_dir, exist_ok=True)

for batch_num in range(num_batches):
    start_index = batch_num * batch_size
    end_index = min(start_index + batch_size, len(tf_files))
    batch_files = tf_files[start_index:end_index]

    # Process the current batch of files and save after each batch
    # (batch numbers passed to process_batch are 1-based)
    process_batch(batch_files, batch_num + 1, tf_match_results_dir, output_dir)

print("All batches have been processed and saved.")

Processing Batch 1: 100%|█████████████████████| 128/128 [00:09<00:00, 13.79it/s]


Batch 1 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_1.csv


Processing Batch 2: 100%|█████████████████████| 128/128 [00:09<00:00, 13.33it/s]


Batch 2 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_2.csv


Processing Batch 3: 100%|█████████████████████| 128/128 [00:09<00:00, 13.42it/s]


Batch 3 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_3.csv


Processing Batch 4: 100%|█████████████████████| 128/128 [00:09<00:00, 13.43it/s]


Batch 4 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_4.csv


Processing Batch 5: 100%|█████████████████████| 128/128 [00:09<00:00, 13.13it/s]


Batch 5 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_5.csv


Processing Batch 6: 100%|█████████████████████| 128/128 [00:09<00:00, 13.34it/s]


Batch 6 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_6.csv


Processing Batch 7: 100%|█████████████████████| 128/128 [00:09<00:00, 13.39it/s]


Batch 7 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_7.csv


Processing Batch 8: 100%|█████████████████████| 128/128 [00:09<00:00, 13.24it/s]


Batch 8 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_8.csv


Processing Batch 9: 100%|█████████████████████| 128/128 [00:09<00:00, 13.37it/s]


Batch 9 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_9.csv


Processing Batch 10: 100%|████████████████████| 128/128 [00:09<00:00, 13.14it/s]


Batch 10 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_10.csv


Processing Batch 11: 100%|████████████████████| 128/128 [00:09<00:00, 13.30it/s]


Batch 11 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_11.csv


Processing Batch 12: 100%|████████████████████| 128/128 [00:09<00:00, 13.38it/s]


Batch 12 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_12.csv


Processing Batch 13: 100%|████████████████████| 128/128 [00:10<00:00, 12.54it/s]


Batch 13 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_13.csv


Processing Batch 14: 100%|████████████████████| 128/128 [00:10<00:00, 12.33it/s]


Batch 14 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_14.csv


Processing Batch 15: 100%|████████████████████| 128/128 [00:10<00:00, 12.22it/s]


Batch 15 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_15.csv


Processing Batch 16: 100%|████████████████████| 128/128 [00:10<00:00, 12.35it/s]


Batch 16 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_16.csv


Processing Batch 17: 100%|████████████████████| 128/128 [00:10<00:00, 12.33it/s]


Batch 17 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_17.csv


Processing Batch 18: 100%|████████████████████| 128/128 [00:10<00:00, 12.33it/s]


Batch 18 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_18.csv


Processing Batch 19: 100%|████████████████████| 128/128 [00:10<00:00, 12.68it/s]


Batch 19 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_19.csv


Processing Batch 20: 100%|██████████████████████| 26/26 [00:02<00:00, 12.33it/s]


Batch 20 saved to ../../Data/TF_Embeddings_Multiclass/tf_embeddings_batch_20.csv
All batches have been processed and saved.


In [10]:
# Echo the current value of `file_path` as the cell output
file_path

'../../Data/TF_Match_Results/enhancer_matched_SSC08007.bed'

In [11]:
# Single-file setup: pick one match-results file and prepare its empty vector
file = 'tf_matched_SSC08007.bed'
file_path = os.path.join(tf_match_results_dir, file)
individual_id = file.split('_')[2]  # third '_'-separated token of the file name
individual_vector = np.zeros(shape=N, dtype=int)  # one zero-initialised slot per node

# Keep only column index 8 of the headerless tab-separated file, named 'TF'
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    usecols=[8],
    names=['TF'],
)

In [14]:
# Encode the matches: 1 = downstream of a matched TF, 2 = directly matched TF.
# Indices are collected first and the direct matches are written last, so a
# directly matched TF is never downgraded to 1 by another TF's downstream list
# (writing per TF made the result depend on the order of names in the file).
downstream_indices = set()
direct_indices = []
for tf_node in df['TF'].unique():
    if tf_node not in downstream_map:
        continue  # names without a downstream entry are ignored
    downstream_indices.update(
        node_to_index[node]
        for node in downstream_map[tf_node]
        if node in node_to_index
    )
    tf_index = node_to_index.get(tf_node)
    if tf_index is not None:
        direct_indices.append(tf_index)

individual_vector[list(downstream_indices)] = 1  # Downstream, set to 1
individual_vector[direct_indices] = 2  # Direct match, set to 2



In [17]:
# Count the entries of individual_vector equal to 2 (direct matches).
# np.count_nonzero counts in native code instead of iterating the array
# element by element with the builtin sum(); int() gives a plain Python int.
int(np.count_nonzero(individual_vector == 2))

424

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

# Load the precomputed downstream nodes from file
def load_precomputed_downstream_nodes(file):
    """Unpickle and return the object stored in the file at path `file`.

    The file is read in binary mode and closed when the function returns.
    Only load pickle files from a trusted source: unpickling can execute
    arbitrary code.
    """
    with open(file, mode='rb') as pickle_handle:
        return pickle.load(pickle_handle)

# Restore the pickled downstream-node mapping from disk
downstream_map = load_precomputed_downstream_nodes(
    '../../Data/Precomputed_Downstream_Nodes.pkl'
)

# Node names in node-table row order (`nodes` must already be loaded);
# this is the column order of every embedding
all_nodes = nodes['name'].tolist()
N = len(all_nodes)
# Reverse lookup: node name -> position in `all_nodes`
node_to_index = dict(zip(all_nodes, range(N)))

# Function to process a batch of files and generate multiclass embeddings for enhancer matches
def process_enhancer_batch(batch_files, batch_number, enhancer_match_results_dir, output_dir, verbose=True):
    """Build and save the multiclass enhancer embeddings for one batch of files.

    For every file in `batch_files`, column index 8 of the headerless,
    tab-separated file is read as a list of node names. Each distinct name
    that is a key of the module-level `downstream_map` contributes to that
    individual's vector of length `N` (one slot per entry of `all_nodes`):

    * 0 -- default, node untouched;
    * 1 -- node is listed in `downstream_map[name]` for at least one matched name;
    * 2 -- node is itself a matched name (direct match); never downgraded to 1.

    Names that are not keys of `downstream_map` are ignored entirely, even if
    they appear in `node_to_index`.

    The batch is written to
    `<output_dir>/enhancer_embeddings_batch_<batch_number>.csv`, one row per
    individual and one column per node.

    Args:
        batch_files: File names (not paths) inside `enhancer_match_results_dir`.
        batch_number: Number used in the progress-bar label and output name.
        enhancer_match_results_dir: Directory containing the files in `batch_files`.
        output_dir: Existing directory that receives the batch CSV.
        verbose: When true (the default, matching the previous behaviour), emit
            one diagnostic line per direct match and per downstream expansion.

    Returns:
        None. The result is written to disk and a status line is printed.
    """
    enhancer_embeddings = {}  # individual ID -> embedding vector for this batch

    with tqdm(total=len(batch_files), desc=f"Processing Enhancer Batch {batch_number}") as pbar:
        for file in batch_files:
            # Third '_'-separated token of the file name.
            # NOTE(review): for names like 'enhancer_matched_SSC08007.bed' this
            # keeps the '.bed' suffix in the ID -- confirm consumers expect it.
            individual_id = file.split('_')[2]
            individual_vector = np.zeros(N, dtype=int)  # Default value is 0

            # Load the matching results for this individual
            file_path = os.path.join(enhancer_match_results_dir, file)
            df = pd.read_csv(file_path, sep='\t', header=None, names=['Enhancer'], usecols=[8])

            # Collect indices per node, then write them in two vectorised steps
            all_downstream_indices = set()
            direct_indices = []
            for enhancer_node in df['Enhancer'].unique():
                if enhancer_node not in downstream_map:
                    continue

                # Direct match (mutation): will be set to 2
                enhancer_index = node_to_index.get(enhancer_node)
                if enhancer_index is not None:
                    direct_indices.append(enhancer_index)
                    if verbose:
                        # tqdm.write keeps the message from breaking the progress bar
                        tqdm.write(f"Direct match found for {enhancer_node}, setting index {enhancer_index} to 2")

                # Downstream nodes: will be set to 1 unless they are direct matches
                downstream_indices = [node_to_index[node] for node in downstream_map[enhancer_node] if node in node_to_index]
                all_downstream_indices.update(downstream_indices)
                if verbose:
                    tqdm.write(f"Downstream nodes for {enhancer_node}: {downstream_map[enhancer_node]} -> Setting indices {downstream_indices} to 1 (if not already 2)")

            # Writing 1 first and 2 last yields the same vector as guarding each
            # downstream write with "not already 2".
            individual_vector[list(all_downstream_indices)] = 1
            individual_vector[direct_indices] = 2

            # Store the embedding for this individual
            enhancer_embeddings[individual_id] = individual_vector

            # Update the progress bar
            pbar.update(1)

    # Convert the batch embeddings to a DataFrame and save the results
    enhancer_embeddings_df = pd.DataFrame.from_dict(enhancer_embeddings, orient='index', columns=all_nodes)
    batch_output_file = os.path.join(output_dir, f'enhancer_embeddings_batch_{batch_number}.csv')
    enhancer_embeddings_df.to_csv(batch_output_file)

    print(f"Batch {batch_number} saved to {batch_output_file}")

# Load and process the Enhancer_Match_Results folder
enhancer_match_results_dir = "../../Data/Enhancer_Match_Results"
# os.listdir() returns entries in arbitrary order; sort them so that the
# contents of each batch file are reproducible across runs and machines.
enhancer_files = sorted(file for file in os.listdir(enhancer_match_results_dir) if file.endswith(".bed"))

# Process the files in batches of 128
batch_size = 128
# Ceiling division: the final batch may hold fewer than `batch_size` files
num_batches = (len(enhancer_files) + batch_size - 1) // batch_size

# Create output directory if it doesn't exist
output_dir = '../../Data/Enhancer_Embeddings_Multiclass'
os.makedirs(output_dir, exist_ok=True)

# Loop over the batches and process them
for batch_num in range(num_batches):
    start_index = batch_num * batch_size
    end_index = min(start_index + batch_size, len(enhancer_files))
    batch_files = enhancer_files[start_index:end_index]

    # Process the current batch of files and save after each batch
    # (batch numbers passed to process_enhancer_batch are 1-based)
    process_enhancer_batch(batch_files, batch_num + 1, enhancer_match_results_dir, output_dir)

print("All enhancer batches have been processed and saved.")


Processing Enhancer Batch 1: 100%|███████████| 128/128 [00:00<00:00, 135.10it/s]


Batch 1 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_1.csv


Processing Enhancer Batch 2: 100%|███████████| 128/128 [00:00<00:00, 186.14it/s]


Batch 2 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_2.csv


Processing Enhancer Batch 3: 100%|███████████| 128/128 [00:00<00:00, 144.32it/s]


Batch 3 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_3.csv


Processing Enhancer Batch 4: 100%|███████████| 128/128 [00:00<00:00, 178.59it/s]


Batch 4 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_4.csv


Processing Enhancer Batch 5: 100%|███████████| 128/128 [00:00<00:00, 200.62it/s]


Batch 5 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_5.csv


Processing Enhancer Batch 6: 100%|███████████| 128/128 [00:00<00:00, 156.49it/s]


Batch 6 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_6.csv


Processing Enhancer Batch 7: 100%|███████████| 128/128 [00:00<00:00, 164.18it/s]


Batch 7 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_7.csv


Processing Enhancer Batch 8: 100%|███████████| 128/128 [00:00<00:00, 167.35it/s]


Batch 8 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_8.csv


Processing Enhancer Batch 9: 100%|███████████| 128/128 [00:00<00:00, 137.34it/s]


Batch 9 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_9.csv


Processing Enhancer Batch 10: 100%|██████████| 128/128 [00:00<00:00, 149.28it/s]


Batch 10 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_10.csv


Processing Enhancer Batch 11: 100%|██████████| 128/128 [00:00<00:00, 130.47it/s]


Batch 11 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_11.csv


Processing Enhancer Batch 12: 100%|██████████| 128/128 [00:01<00:00, 124.29it/s]


Batch 12 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_12.csv


Processing Enhancer Batch 13: 100%|██████████| 128/128 [00:00<00:00, 151.71it/s]


Batch 13 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_13.csv


Processing Enhancer Batch 14: 100%|██████████| 128/128 [00:00<00:00, 165.16it/s]


Batch 14 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_14.csv


Processing Enhancer Batch 15: 100%|██████████| 128/128 [00:00<00:00, 141.19it/s]


Batch 15 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_15.csv


Processing Enhancer Batch 16: 100%|██████████| 128/128 [00:00<00:00, 173.01it/s]


Batch 16 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_16.csv


Processing Enhancer Batch 17: 100%|██████████| 128/128 [00:00<00:00, 179.42it/s]


Batch 17 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_17.csv


Processing Enhancer Batch 18: 100%|██████████| 128/128 [00:00<00:00, 175.84it/s]


Batch 18 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_18.csv


Processing Enhancer Batch 19: 100%|██████████| 128/128 [00:00<00:00, 168.09it/s]


Batch 19 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_19.csv


Processing Enhancer Batch 20: 100%|████████████| 26/26 [00:00<00:00, 196.07it/s]


Batch 20 saved to ../../Data/Enhancer_Embeddings_Multiclass/enhancer_embeddings_batch_20.csv
All enhancer batches have been processed and saved.
