In [1]:
import numpy as np
import os
import pandas as pd
import sys

### Specify input/output directories

In [2]:
# Specify LOCAL PATHS
data_path = "./Chromatin_modules/test_data"
output_path = "./PHM_test"

# Create the output directory (including missing parents).
# `exist_ok=True` makes this a no-op when the directory is already there,
# so the cell can be re-run safely.
os.makedirs(output_path, exist_ok=True)

# Name of the dataset to analyze; it is used as the key of the per-dataset
# dictionaries and as a prefix of the files written to `output_path`
dataset = "test_data"

In [3]:
# Map the dataset name to the table of sample IDs read from the
# tab-separated, header-less sample list that ships with the genotype data
samples_file = os.path.join(data_path, "LCL_genotypes_chr22_samples.txt")
vcf_samples_dict = {dataset: pd.read_csv(samples_file, sep="\t", header=None)}

### Create necessary input files for PHM

1. BED file with peak coordinates
2. Binary file of normalized counts

\+ Store common samples between ChIP-seq and genotype data to ensure proper ordering of samples across data modalities

In [4]:
# Read in count matrices and tag every row with the mark it came from
# CHANGE PATHS
count_mtx_k4me1 = pd.read_csv(os.path.join(data_path, "H3K4me1_chr22.bed"), sep="\t")
count_mtx_k4me1["mark"] = "H3K4me1"

count_mtx_k27ac = pd.read_csv(os.path.join(data_path, "H3K27ac_chr22.bed"), sep="\t")
count_mtx_k27ac["mark"] = "H3K27ac"

# Stack both marks, order the peaks by position and normalize the name of the
# chromosome column. The cast to str goes through `astype` (which returns a
# new frame) rather than an in-place `.loc` assignment, so the column dtype
# is guaranteed to change regardless of how the IDs were parsed.
all_peaks_mtx = pd.concat([count_mtx_k4me1, count_mtx_k27ac])
all_peaks_mtx = all_peaks_mtx.sort_values(["#Chr", "start"])
all_peaks_mtx = all_peaks_mtx.rename(columns={"#Chr": "#chr"})
all_peaks_mtx = all_peaks_mtx.astype({"#chr": str})

# Running peak index over all chromosomes (re-numbered per chromosome below)
all_peaks_mtx["Peak"] = np.arange(1, all_peaks_mtx.shape[0] + 1)

all_peaks_mtx = all_peaks_mtx.astype({"start": int, "end": int})
# Drop every column that contains a missing value, e.g. columns that are
# present in only one of the two concatenated matrices
all_peaks_mtx = all_peaks_mtx.dropna(axis="columns")


# Chromosomes to process: everything except X, Y and M (with or without "chr")
chromosomes = list(
    set(all_peaks_mtx.loc[:, "#chr"]) - set(["chrX", "chrY", "chrM", "X", "Y", "M"])
)
if not chromosomes:
    # Without this guard the prefix detection below fails with a bare IndexError
    sys.exit("No chromosomes left after excluding X, Y and M!")
if not all(isinstance(el, str) for el in chromosomes):
    sys.exit("Chromosome IDs have different types! Standardize the format")

# String prepended to chromosome IDs so that all dictionary keys and output
# names look like "chr<ID>"; empty when the IDs already carry "chr"
if "chr" in str(chromosomes[0]):
    suffix = ""
else:
    suffix = "chr"

# Get common samples between ChIP-seq and genotype data
final_samples = sorted(
    set(all_peaks_mtx.columns) & set(vcf_samples_dict[dataset].iloc[:, 0])
)

# Store only common samples between ChIP-seq and genotype data
pd.DataFrame(final_samples).to_csv(
    os.path.join(output_path, dataset + "_LCL_chr22_samples.txt"),
    sep="\t",
    index=False,
    header=False,
)

first_cols = ["#chr", "start", "end", "pid", "gid", "strand"]
last_cols = ["mark", "Peak"]
ordered_cols = first_cols + final_samples + last_cols
# Peak index followed by the sample columns (not used further in this cell)
columns_for_tsv = ["Peak"] + final_samples

count_matrix = all_peaks_mtx.loc[:, ordered_cols].copy()

# One position-sorted count matrix per retained chromosome, keyed by "chr<ID>"
count_matrices_by_chromosome_dict = {
    suffix + str(chromosome): chromosome_count_matrix.sort_values("start")
    for chromosome, chromosome_count_matrix in count_matrix.groupby("#chr")
    if chromosome in chromosomes
}

# X, Y and M were already excluded when building `chromosomes`, so every
# entry of the dictionary is processed.
for chromosome, chromosome_count_matrix in count_matrices_by_chromosome_dict.items():
    path = os.path.join(output_path, str(chromosome))
    # Create each output directory on its own: a single combined existence
    # check would leave a partially created tree (only one of the two
    # directories present) unrepaired.
    os.makedirs(os.path.join(path, "hm_output"), exist_ok=True)
    os.makedirs(os.path.join(path, "phm_output"), exist_ok=True)

    # Re-number the peaks from 1 within the chromosome
    chromosome_count_matrix["Peak"] = np.arange(
        1, chromosome_count_matrix.shape[0] + 1
    )

    # Create .bed file with peak coordinates
    # The chromosome column always holds strings at this point (see the cast
    # above); strip the literal "chr" and store the ID as an integer.
    chromosome_count_matrix["#chr"] = (
        chromosome_count_matrix["#chr"].str.replace("chr", "", regex=False).astype(int)
    )
    chromosome_count_matrix[["#chr", "start", "end", "mark"]].to_csv(
        os.path.join(path, chromosome + "_peak_coordinates.bed"),
        sep="\t",
        index=False,
        header=False,
    )

    # Create a binary file from normalized counts: the sample columns are
    # dumped as raw records, one record per peak, without any header
    records_array = chromosome_count_matrix[final_samples].to_records(index=False)
    records_array.tofile(
        os.path.join(path, "normalized_counts_" + chromosome + ".bin")
    )

### IMPORTANT:

1. Make sure the samples in the VCF file are ordered in the same way as the samples in the count matrices.
2. Pay attention to the chromosome format in the VCF file and in the count matrices: it must be the same in both, formatted as a string without the "chr" prefix, i.e., chr22 -> 22.
3. Make sure to change paths prior to executing the cell!

In [5]:
%%bash
# Abort on the first failing command, on unset variables and on failures
# inside pipelines, so that a broken step is reported by the cell instead of
# being silently ignored.
set -euo pipefail

dataset="test_data"
data_path="./Chromatin_modules/test_data"
output_path="./PHM_test"

subset_vcf="${output_path}/${dataset}_sample_intersection.vcf"

## Subset samples from the VCF file
## (-S: file with one sample ID per line, written by the previous cell)
bcftools view \
    -S "${output_path}/${dataset}_LCL_chr22_samples.txt" \
    "${data_path}/LCL_genotypes_chr22.vcf.gz" \
    > "${subset_vcf}"

## zip and index VCF
## (-f: overwrite the .gz / index left behind by a previous run of this cell)
bgzip -f "${subset_vcf}"
tabix -f -p vcf "${subset_vcf}.gz"


### Compress and index files per chromosome

In [6]:
%%bash
# Abort on the first failing command, on unset variables and on failures
# inside pipelines.
set -euo pipefail
# Make a glob that matches nothing expand to nothing, so the loops below are
# skipped instead of running once on the literal pattern.
shopt -s nullglob

dataset="test_data"
output_path="./PHM_test"

cd "${output_path}"
# The VCF lives directly in ${output_path}. Because the working directory
# was just changed to that directory, the file is referenced by its bare
# name; prefixing it with a relative ${output_path} would no longer resolve.
input_vcf="${dataset}_sample_intersection.vcf.gz"

for chromosome_dir in */
do
    # Directory name without the trailing slash, e.g. "chr22"
    chromosome="${chromosome_dir%/}"
    chromosome_vcf="${chromosome}/${dataset}_${chromosome}.vcf"

    ## Subset a chromosome from VCF file
    tabix \
        "${input_vcf}" \
        "${chromosome}" \
        > "${chromosome_vcf}"

    ## If chromosome IDs in the VCF file contain "chr" character,
    ## replace it with an empty string, s.t. "chr22" -> "22"
    awk '{gsub(/^chr/,""); print}' "${chromosome_vcf}" \
        > "${chromosome}/${dataset}_${chromosome}_no_chr.vcf"

    ## Compress and index BED, VCF files
    ## (-f: overwrite archives / indexes left behind by a previous run)
    for filename in "${chromosome}"/*.bed
    do
        bgzip -f "${filename}"
        tabix -f -p bed "${filename}.gz"
    done

    for filename in "${chromosome}"/*.vcf
    do
        bgzip -f "${filename}"
        tabix -f -p vcf "${filename}.gz"
    done
done


## # If chromosome IDs in the VCF file *do not* contain "chr",
## # replace the two steps "Subset a chromosome from VCF file" and the
## # following awk command inside the loop with:
## chr_id="${chromosome#chr}"
## tabix \
##     "${input_vcf}" \
##     "${chr_id}" \
##     > "${chromosome}/${dataset}_${chromosome}_no_chr.vcf"

In [7]:
# Completion marker: printed once execution reaches the last cell
print("DONE")

DONE
