In [1]:
# Import libraries

import pyBigWig
import scipy.io as sio
import numpy as np
import math 
import matplotlib.pyplot as plt
import os
import pandas as pd
from heapq import nlargest
import copy
import matplotlib.gridspec as gridspec
import pandas as pd
import pickle
import seaborn as sns
from config_and_print import methy_directory, filtered_list, chrom_file, resolutions, output_directory

# A bare string from the config is wrapped in a one-element tuple so that
# indexing below yields the whole resolution string, not its first character.
resolutions = (resolutions,) if isinstance(resolutions, str) else resolutions

# Debug output: the (possibly wrapped) resolutions value
print(f"Resolutions from config: {resolutions}")

# Only the first configured resolution is picked up here.
resolution_str = resolutions[0]

# Debug output: the resolution string that was selected
print(f"Extracted resolution string: {resolution_str}")


# Directory holding the hg19 BigWig tracks, and the bin size in base pairs.
# NOTE(review): `resolution` is hard-coded and does not use `resolution_str` —
# confirm that this is intended.
path = '/home/dwk681/workspace/Genomic_Files/BigWigFiles/hg19/'
resolution = 1_000_000

# Path to the annotation file listing cell prefixes and colors. Each data line
# has three whitespace-separated fields (index, cell ID, color), for example:
#1       sc1.ACTTGA      red
#2       sc1.GCCAAT      red
#3       sc1.TAGCTT      red
#4       sc10.TAGCTT     blue
#
filename = '/home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/name.order.HCG_methy.with_color.txt'

# Maps cell ID (second field) -> color (third field)
cell_color_dict = {}

# Read the annotation file line by line
with open(filename, 'r') as file:
    for line_number, line in enumerate(file, start=1):
        # Split on any whitespace; surrounding whitespace is ignored
        parts = line.split()

        # Blank lines (e.g. a stray empty line at the end of the file) carry
        # no record, so skip them instead of failing on parts[1].
        if not parts:
            continue

        # A non-blank line without all three fields is malformed; fail with a
        # message that points at the offending line rather than a bare IndexError.
        if len(parts) < 3:
            raise ValueError(
                f"{filename}: line {line_number} has {len(parts)} field(s); "
                "expected index, cell ID and color"
            )

        # Extract cell ID and color
        cell_id = parts[1]
        color = parts[2]
        # A later line with the same cell ID overwrites the earlier entry
        cell_color_dict[cell_id] = color

# Path to the tensor sample order file.
# This file lists one cell prefix per line, in the form
#sc11.ACTTGA
#sc11.CGATGT
#sc11.GCCAAT
#sc11.TAGCTT
#sc12.ACTTGA
#
tensor_order_filename = '/home/dwk681/workspace/hypermatrix_test/hypermatrix/projects/single_cell_files/filtered_bam_list.txt'

# One entry per sample, in file order: 1 if the sample's color is 'red', else 0
# (samples missing from cell_color_dict also get 0).
color_vector = []

# Read the sample order file
with open(tensor_order_filename, 'r') as file:
    for line in file:
        sample_id = line.strip()  # Remove surrounding whitespace and the newline

        # A blank line is not a sample; counting it would add a spurious 0 and
        # shift the vector out of step with the sample order.
        if not sample_id:
            continue

        # .get() returns None for unknown samples, which compares unequal to 'red'
        color_vector.append(1 if cell_color_dict.get(sample_id) == 'red' else 0)

# Print the number of samples as a sanity check
print(len(color_vector))

# Translation table from annotation color to the label it stands for
color_mapping = dict(red='imr90', blue='gm12878')

# Re-key every cell's color through the table above. A color that is not in
# color_mapping raises KeyError.
updated_cell_color_dict = {}
for sample_name, sample_color in cell_color_dict.items():
    updated_cell_color_dict[sample_name] = color_mapping[sample_color]


In [3]:
# A mappability experiment is an in silico experiment that reads through a reference genome and calculates
# how unique each section of the genome is. The more unique a section is, the higher the mappability score it achieves.

bigwig_file = path + "mappability_hg19_wgEncodeCrgMapabilityAlign36mer.bigWig"

# Chromosomes to analyze: chr1 .. chr22
chromosomes = ['chr' + str(i) for i in range(1, 23)]

# Bins whose average mappability is below this value are flagged for removal
threshold = 0.1

# Maps chromosome name -> list of bin indices to remove
bins_to_remove = {}

# Open the BigWig file; the try/finally guarantees the handle is closed even
# if reading one of the chromosomes fails part-way through.
bw = pyBigWig.open(bigwig_file)
try:
    for chrom in chromosomes:
        chrom_size = bw.chroms(chrom)

        # Chromosomes absent from the file are reported and skipped, so they
        # get no entry in bins_to_remove.
        if chrom_size is None:
            print(f"Chromosome {chrom} not found in the BigWig file.")
            continue

        # Number of bins at the chosen resolution; the last bin may be shorter
        num_bins = math.ceil(chrom_size / resolution)

        # Bin indices flagged for this chromosome
        remove_indices = []

        for i in range(num_bins):
            # Bin boundaries, clamped so the last bin stops at the chromosome end
            start = i * resolution
            end = min((i + 1) * resolution, chrom_size)

            # Per-base values for the bin; NaN entries are replaced by 0 and
            # therefore pull the average down.
            values = np.nan_to_num(bw.values(chrom, start, end))

            # Average mappability score for the bin
            avg_mappability = np.mean(values)

            # Flag the bin when its average is below the threshold
            if avg_mappability < threshold:
                remove_indices.append(i)

        # Store the flagged bin indices for this chromosome
        bins_to_remove[chrom] = remove_indices
finally:
    # Close the BigWig file
    bw.close()

# Convert the lists in bins_to_remove to integer numpy arrays. The dtype is
# given explicitly because np.array([]) defaults to float64, which would make
# chromosomes with no flagged bins save a float array instead of an index array.
for chrom in bins_to_remove:
    bins_to_remove[chrom] = np.array(bins_to_remove[chrom], dtype=int)

# Save the dictionary as an .npz file, one array per chromosome name.
# NOTE(review): the output directory is named 250Kb while the file name embeds
# `resolution` — confirm this location is intended.
np.savez(f'/home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/250Kb/tensor_250Kb_dir/bins_to_remove_res{resolution}.npz', **bins_to_remove)


129
