In [1]:
############################################
# imports
############################################

import pyBigWig
import scipy.io as sio
import numpy as np
import math 
import matplotlib.pyplot as plt
import os
import pandas as pd
from heapq import nlargest
import copy
import matplotlib.gridspec as gridspec
import pandas as pd
import pickle
import seaborn as sns
from config_and_print import methy_directory, filtered_list, chrom_file, resolutions, output_directory, mappability_threshold

# The config may hand over a single bare string instead of a sequence of
# strings. Wrap it so that indexing below returns the whole entry rather
# than its first character.
resolutions = (resolutions,) if isinstance(resolutions, str) else resolutions
print(f"Resolutions from config: {resolutions}")

# Only the first configured entry is used from here on.
resolution_str = resolutions[0]
print(f"Extracted resolution string: {resolution_str}")

def parse_resolution(resolution_str):
    """Split a ``'value:label'`` resolution string into its two parts.

    Args:
        resolution_str: Text such as ``'1000000:1Mb'``. The part before the
            colon is converted to an integer; the part after it is returned
            unchanged as the label.

    Returns:
        A ``(resolution, resolution_label)`` tuple: an ``int`` and a ``str``.

    Raises:
        ValueError: If the string does not contain exactly one ``':'``, or
            if the value part cannot be converted to an integer.
    """
    # Require exactly one separator. Without this check, a string with two
    # or more colons failed during tuple unpacking with an unrelated
    # "too many values to unpack" message instead of the format error.
    if resolution_str.count(':') != 1:
        raise ValueError(f"Invalid resolution format: '{resolution_str}'. Expected format 'value:label', e.g., '1000000:1Mb'.")

    resolution_value, resolution_label = resolution_str.split(':')
    try:
        resolution = int(resolution_value)
    except ValueError as err:
        raise ValueError(f"Resolution value should be an integer: '{resolution_value}' in '{resolution_str}'") from err
    return resolution, resolution_label

# Integer bin size and its text label, parsed from the first configured
# resolution entry (format 'value:label').
resolution, resolution_label = parse_resolution(resolution_str)

########################################################################
# create the cell type dictionary
# [TO DO] This needs to be replaced with SNPS code
########################################################################
# Path to the file with prefixes and colors in the following form
#1       sc1.ACTTGA      red
#2       sc1.GCCAAT      red
#3       sc1.TAGCTT      red
#4       sc10.TAGCTT     blue
#
filename = '../../bin/name.order.HCG_methy.with_color.txt'

# Map each cell ID (second column) to its color (third column).
cell_color_dict = {}
with open(filename, 'r') as file:
    for line_number, line in enumerate(file, start=1):
        parts = line.split()
        if not parts:
            # Blank lines carry no record; indexing them used to raise
            # a bare IndexError.
            continue
        if len(parts) < 3:
            raise ValueError(
                f"{filename}:{line_number}: expected at least 3 "
                f"whitespace-separated columns, got {len(parts)}: {line!r}"
            )
        cell_color_dict[parts[1]] = parts[2]

# Path to the tensor sample order file
# This file contains the prefixes in the form
#sc11.ACTTGA
#sc11.CGATGT
#sc11.GCCAAT
#
tensor_order_filename = f'{output_directory}/filtered_bam_list.txt'

# One entry per sample, in file order: 1 when the sample's color is 'red',
# 0 otherwise (including samples that are absent from cell_color_dict).
with open(tensor_order_filename, 'r') as file:
    color_vector = [
        1 if cell_color_dict.get(sample_id) == 'red' else 0
        for sample_id in (line.strip() for line in file)
        if sample_id  # a blank line is not a sample and must not add an entry
    ]

# Output the length of the color vector to check
print(len(color_vector))

# Translate color names into cell-type names.
color_mapping = {
    'red': 'imr90',
    'blue': 'gm12878',
}

# Same keys as cell_color_dict, with each color replaced by its cell type.
# A color missing from color_mapping raises KeyError here.
updated_cell_color_dict = {key: color_mapping[value] for key, value in cell_color_dict.items()}

#################################################################################
#create dark bins file if not already created
#################################################################################

# The result is cached on disk: the BigWig scan below only runs when the
# .npz file for this resolution label does not exist yet.
bins_file_path = f'{output_directory}/bins_to_remove_res{resolution_label}.npz'
if os.path.exists(bins_file_path):
    print(f"{bins_file_path} already exists. Skipping computation.")
else:
    bigwig_file = "../../bin/softwarefiles/dark_regions_hg19.bigWig"

    # Chromosomes to analyze: chr1 .. chr22
    chromosomes = ['chr' + str(i) for i in range(1, 23)]

    # Bins whose average mappability is below this value are removed.
    # float() accepts numbers as well as numeric text such as '0.6', so the
    # comparison below works even if the config supplies the value as a string.
    threshold = float(mappability_threshold)

    # chromosome name -> integer array of bin indices to remove
    bins_to_remove = {}

    bw = pyBigWig.open(bigwig_file)
    try:
        for chrom in chromosomes:
            chrom_size = bw.chroms(chrom)

            if chrom_size is None:
                print(f"Chromosome {chrom} not found in the BigWig file.")
                continue

            # The last bin may be shorter than `resolution`.
            num_bins = math.ceil(chrom_size / resolution)

            remove_indices = []
            for i in range(num_bins):
                start = i * resolution
                # Clamp the end so the (possibly incomplete) last bin stays
                # inside the chromosome.
                end = min((i + 1) * resolution, chrom_size)

                # NaN values are counted as 0 when averaging.
                values = np.nan_to_num(bw.values(chrom, start, end))

                if np.mean(values) < threshold:
                    remove_indices.append(i)

            # Force an integer dtype: np.array([]) would otherwise default
            # to float64 for chromosomes with no bins to remove.
            bins_to_remove[chrom] = np.array(remove_indices, dtype=int)
    finally:
        # Release the file handle even if the scan fails part-way.
        bw.close()

    # Save one array per chromosome in a single .npz file
    np.savez(bins_file_path, **bins_to_remove)
    print(f"Bins to remove file created and saved to {bins_file_path}")

    
# Load the bulk A/B compartment eigenvector files into a dictionary.
# Keys are the file names without extension; values are single-column
# DataFrames whose column is named 'eigenvalue'.
bulk_data = {}
path_to_eigenvectors = '/home/dwk681/workspace/multi_omics_hic_clustering/processing_scripts/eigenvector/'

for chrom_number in range(1, 23):
    # GM12878 is read before IMR90 for every chromosome, which fixes the
    # insertion order of the dictionary.
    for cell_line in ('GM12878', 'IMR90'):
        eigenvector_path = (
            path_to_eigenvectors
            + f'res{resolution}_ch{chrom_number}_KR_eigenvector_{cell_line}.txt'
        )
        file_stem = os.path.splitext(os.path.basename(eigenvector_path))[0]
        bulk_data[file_stem] = pd.read_csv(
            eigenvector_path, header=None, names=['eigenvalue']
        )

bam_directory='/home/dwk681/workspace/cluster_cells_from_GSE189158_NOMe_HiC/filesFromCluster/bam'
methy_directory='/home/dwk681/workspace/cluster_cells_from_GSE189158_NOMe_HiC/filesFromCluster/bam/methylation/filter_low_qual'
software_directory='../../bin/softwarefiles'
chrom_file='../../bin/softwarefiles/hg19.autosome.chrom.sizes'
fragments_file='../../bin/softwarefiles/hg19_DpnII.txt'
output_directory='../../projects/single_cell_files'
hg19_fa_url='ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz'
filtered_list='../../projects/single_cell_files/filtered_bam_list.txt'
schicluster_env='schicluster2'
bisulfite_env='bisulfitehic27'
min_high_quality_reads='250000'
resolutions='1000000:1Mb'
impute='True'
cluster_compartments='False'
cumulant='False'
iterations='400'
chromosomes=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
dark_regions_hg19_url='https://www.encodeproject.org/files/ENCFF000EHJ/@@download/ENCFF000EHJ.bigWig'
mappability_threshold='0.6'
data_types=o

FileNotFoundError: [Errno 2] No such file or directory: '/home/dwk681/workspace/multi_omics_hic_clustering/processing_scripts/eigenvector/res1000000_ch1_KR_eigenvector_GM12878.txt'

129
