# Finding exons targeted by probes

In [None]:
#The file "exons_in_filt_probes" contains 261185 lines
#NOTE: these exons belong to all tx (53817) and genes (18085) present in the filtered probes list (sp+unsp)
#Now, we want to see out of these exons, which ones are targeted by probes

## A. To check for the exons targeted by spliced probes

In [8]:
#SPLICED PROBES (which span exon-exon junction; can target more than one exon)
#Most spliced probes target two exons
#Some spliced probes target more than two exons for example: eac38ce

### A.1 Spliced probes that overlap two exons

In [None]:
#Goal is to find which probe target which two consecutive exons. For this:
# 1. First, for a probe check if probe_id till first instance of "|" matches exon_id till first instance of "|"
# 2. If it matches then check probe coordinates and coordinates of pairs of exons
# 3. For that probe and those two consecutive exons check the following condition:
# exon1_Start <= probe_Start <= exon1_End and exon2_Start <= probe_End <= exon2_End

#If it is true save the probe_id and those two exon_ids in a new output file. Do this for all probes.

In [None]:
#The chunk below creates a list of filtered spliced probes (7477) that target a pair of exons of the same gene; every pair of exons is tested, not only consecutive ones

In [3]:
def extract_gene_id(id_string):
    """Return the leading field of a probe_id or exon_id.

    The leading field is everything before the first '|'. When the string
    contains no '|', the whole string is returned unchanged.
    """
    gene_id, _, _ = id_string.partition('|')
    return gene_id

from itertools import combinations

# Read exon data and group it by gene ID (the part of exon_id before the first '|').
# Each exon line is expected to hold: chromosome, start, end, exon_id (tab-separated).
exon_data_by_gene = {}
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/exons_in_filt_probes', 'r') as exon_file:
    for line in exon_file:
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) would break the unpacking below
            continue
        chromosome, exon_start, exon_end, exon_id = line.split('\t')
        exon_data_by_gene.setdefault(extract_gene_id(exon_id), []).append(
            (exon_id, int(exon_start), int(exon_end))
        )

# Read all probe lines; each is expected to hold: probe_id, start, end (tab-separated)
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filtered_spliced_probes.tsv', 'r') as probe_file:
    probe_lines = probe_file.readlines()

# probe_id -> sorted list of exon_ids hit by the probe
probe_data = {}

for probe_line in probe_lines:
    probe_line = probe_line.strip()
    if not probe_line:
        continue
    probe_id, probe_start, probe_end = probe_line.split('\t')
    probe_start, probe_end = int(probe_start), int(probe_end)

    # Exons sharing the probe's gene ID; empty when the gene is absent from the exon file
    exon_gene_data = exon_data_by_gene.get(extract_gene_id(probe_id), [])

    # A set, so an exon that takes part in several matching pairs is stored once
    exon_ids = set()

    # Test every unordered pair of exons of the gene (not only neighbouring ones):
    # one exon must contain the probe start and the other the probe end.
    for (exon_id1, exon_start1, exon_end1), (exon_id2, exon_start2, exon_end2) in combinations(exon_gene_data, 2):
        if (exon_start1 <= probe_start <= exon_end1 and exon_start2 <= probe_end <= exon_end2) or \
           (exon_start2 <= probe_start <= exon_end2 and exon_start1 <= probe_end <= exon_end1):
            exon_ids.add(exon_id1)
            exon_ids.add(exon_id2)

    # Probes without any matching pair are left out of the result.
    # Sorting makes the column order reproducible; iterating a set of strings
    # can give a different order on every run.
    if exon_ids:
        probe_data[probe_id] = sorted(exon_ids)

# Write one line per probe: probe_id followed by its exon_ids, all tab-separated.
# NOTE: a line can carry more than two exon_ids when several pairs match the probe.
# The with-block closes the file even if a write fails.
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_sp_probes_targeting_2_exons', 'w') as output_file:
    for probe_id, exon_ids in probe_data.items():
        exon_ids_str = '\t'.join(exon_ids)
        output_file.write(f'{probe_id}\t{exon_ids_str}\n')

### A.2 Spliced probes that overlap three exons

In [None]:
#The following chunk compares coordinates of filt spliced probes and three consecutive exons of same gene_id 
#There is also a condition that the start and end coordinates of those 3 exons should not be same
#this is to avoid taking 3 exons with overlapping coordinates
#The output is a list of 87 probes

In [1]:
def extract_gene_id(id_string):
    """Give back the text of a probe_id/exon_id up to (not including) the first '|'.

    A string without any '|' is returned as it is.
    """
    leading_field = id_string.split('|', 1)[0]
    return leading_field

# Read exon data and group it by gene ID (the part of exon_id before the first '|').
# Each exon line is expected to hold: chromosome, start, end, exon_id (tab-separated).
exon_data_by_gene = {}
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/exons_in_filt_probes', 'r') as exon_file:
    for line in exon_file:
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) would break the unpacking below
            continue
        chromosome, exon_start, exon_end, exon_id = line.split('\t')
        exon_data_by_gene.setdefault(extract_gene_id(exon_id), []).append(
            (exon_id, int(exon_start), int(exon_end))
        )

# Read all probe lines; each is expected to hold: probe_id, start, end (tab-separated)
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filtered_spliced_probes.tsv', 'r') as probe_file:
    probe_lines = probe_file.readlines()

# probe_id -> set of (exon_id1, exon_id2, exon_id3) triples
probe_data = {}

for probe_line in probe_lines:
    probe_line = probe_line.strip()
    if not probe_line:
        continue
    probe_id, probe_start, probe_end = probe_line.split('\t')
    probe_start, probe_end = int(probe_start), int(probe_end)

    # Exons sharing the probe's gene ID; empty when the gene is absent from the exon file
    exon_gene_data = exon_data_by_gene.get(extract_gene_id(probe_id), [])

    # Slide a window over three exons that are adjacent in the exon file's order for this gene
    for (exon_id1, exon_start1, exon_end1), \
        (exon_id2, exon_start2, exon_end2), \
        (exon_id3, exon_start3, exon_end3) in zip(exon_gene_data, exon_gene_data[1:], exon_gene_data[2:]):

        # The three exons must differ pairwise in start, in end and in ID,
        # so that copies of the same exon are not counted as three targets.
        starts_differ = len({exon_start1, exon_start2, exon_start3}) == 3
        ends_differ = len({exon_end1, exon_end2, exon_end3}) == 3
        ids_differ = len({exon_id1, exon_id2, exon_id3}) == 3

        # Probe start lies in the first exon and probe end in the third exon.
        starts_in_first = exon_start1 <= probe_start <= exon_end1
        ends_in_third = exon_start3 <= probe_end <= exon_end3

        # NOTE(review): the middle exon is only required to start after the probe
        # start and to have start < end; it is never compared with probe_end.
        # Confirm whether `exon_end2 < probe_end` was intended. Kept as is so the
        # selected probes do not change.
        middle_after_probe_start = probe_start < exon_start2 < exon_end2

        if (starts_differ and ends_differ and ids_differ and
                starts_in_first and middle_after_probe_start and ends_in_third):
            probe_data.setdefault(probe_id, set()).add((exon_id1, exon_id2, exon_id3))

# Write one line per (probe, exon triple): probe_id and the three exon_ids, tab-separated.
# Triples are written in sorted order so the file is reproducible between runs,
# and the with-block closes the file even if a write fails.
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_sp_probes_targeting_3_exons', 'w') as output_file:
    for probe_id, exon_sets in probe_data.items():
        for exon_set in sorted(exon_sets):
            exon_ids_str = '\t'.join(exon_set)
            output_file.write(f'{probe_id}\t{exon_ids_str}\n')

### A.3 Merge spliced probes that overlap two and three exons

In [2]:
# probe_id -> list of exon_ids, one dictionary per input file
probe_data_2exons = {}
probe_data_3exons = {}

# Read the first file with probes targeting 2 exons (probe_id, then exon_ids, tab-separated)
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_sp_probes_targeting_2_exons', 'r') as file_2exons:
    for line in file_2exons:
        line = line.strip()
        if not line:
            # Skip blank lines so no empty probe_id is created
            continue
        probe_id, *exon_ids = line.split('\t')
        probe_data_2exons.setdefault(probe_id, []).extend(exon_ids)

# Read the second file with probes targeting 3 exons.
# This file can hold several lines for the same probe (one per exon triple), so the
# exon_ids are accumulated; assigning them would keep only the last triple of a probe.
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_sp_probes_targeting_3_exons', 'r') as file_3exons:
    for line in file_3exons:
        line = line.strip()
        if not line:
            continue
        probe_id, *exon_ids = line.split('\t')
        probe_data_3exons.setdefault(probe_id, []).extend(exon_ids)

# probe_id -> set of unique exon_ids collected from both files
combined_probe_data = {}

# Merge probe data from the first file
for probe_id, exon_ids in probe_data_2exons.items():
    combined_probe_data.setdefault(probe_id, set()).update(exon_ids)

# Merge probe data from the second file
for probe_id, exon_ids in probe_data_3exons.items():
    combined_probe_data.setdefault(probe_id, set()).update(exon_ids)

# Write one line per probe: probe_id <TAB> exon_ids joined by ';'.
# No spaces are written around the tab: with ' \t ' the probe_id gets a trailing space
# and the first exon_id a leading space once the line is split on tabs again.
# exon_ids are sorted so the output is reproducible between runs.
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_sp_probes_targeting_exons', 'w') as output_file:
    for probe_id, exon_ids in combined_probe_data.items():
        exon_ids_str = ';'.join(sorted(exon_ids))
        output_file.write(f'{probe_id}\t{exon_ids_str}\n')


In [10]:
#The above output creates a list of filtered spliced probes (7477 out of 7502) that target exons
#the following chunk gets a list of the remaining probes (25)
#eg: probe "VTCN1|98dca25" has no target because its tx is not in the "all_exon_coordinates_with_id" file (nor on Ensembl)

In [3]:
#first create a file "temp" by "cut -f 1 filt_sp_probes_targeting_exons > temp" (to extract first column of probes)
#Now compare this list of probes with all the filtered spliced probes and save non-common probes

# Collect the probe_ids listed in "temp" (one per line); strip() drops the newline
# and any whitespace around the ID
with open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/temp", "r") as selected_file:
    selected_probe_ids = {row.strip() for row in selected_file}

# Copy every line of the full probe list whose first tab-separated field is not
# among the selected probe_ids; the lines are written unchanged
with open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filtered_spliced_probes.tsv", "r") as all_probe_file, \
     open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_sp_probes_having_no_target", "w") as result_file:
    result_file.writelines(
        row
        for row in all_probe_file
        if row.strip().split("\t")[0] not in selected_probe_ids
    )

## B. To check for the exons targeted by unspliced probes

In [None]:
#Goal is to find which probe target which exons. For this:
# 1. First, for a probe check if probe_id till first instance of "|" matches exon_id till first instance of "|"
# 2. If it matches then check probe coordinates and coordinates of exons
# 3. For that probe and that exon check the following condition:
# exon_Start <= probe_Start <= exon_End and exon_Start <= probe_End <= exon_End
#If it is true save the probe_id and the matching exon_id(s) in a new output file. Do this for all probes.

In [3]:
def probe_matches_exon(probe, exon):
    """Tell whether a probe lies completely inside an exon of the same gene.

    probe: dict with the keys 'probe_id', 'probeStart' and 'probeEnd'.
    exon:  dict with the keys 'exon_id', 'exonStart' and 'exonEnd'.

    Returns True only when the text before the first '|' is the same in both
    IDs and both probe coordinates fall within [exonStart, exonEnd] (bounds
    included); otherwise False.
    """
    probe_gene = probe['probe_id'].split('|')[0]
    exon_gene = exon['exon_id'].split('|')[0]

    # Different gene prefix: the coordinates do not need to be looked at
    if probe_gene != exon_gene:
        return False

    lower, upper = exon['exonStart'], exon['exonEnd']
    return (lower <= probe['probeStart'] <= upper and
            lower <= probe['probeEnd'] <= upper)

# Read the exon file once. `exons` keeps every exon in file order; `exons_by_gene`
# groups the same exon dicts by gene ID (the part of exon_id before the first '|'),
# so a probe is only compared with the exons of its own gene and not with every exon.
exons = []
exons_by_gene = {}
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/exons_in_filt_probes', 'r') as exon_file:
    for line in exon_file:
        line = line.strip()
        if not line:
            # A blank line has no coordinate fields to parse
            continue
        fields = line.split('\t')
        exon = {
            'chromosome': fields[0],
            'exonStart': int(fields[1]),
            'exonEnd': int(fields[2]),
            'exon_id': fields[3]
        }
        exons.append(exon)
        exons_by_gene.setdefault(exon['exon_id'].split('|')[0], []).append(exon)

# Read the probe file, check for matches, and write the output:
# one line per probe that has at least one match -> probe_id <TAB> exon_ids joined by ';'
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_unsp_probes_targeting_exons', 'w') as output_file:
    with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filtered_unspliced_probes.tsv', 'r') as probe_file:
        for line in probe_file:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            probe = {
                'probe_id': fields[0],
                'probeStart': int(fields[1]),
                'probeEnd': int(fields[2])
            }

            # probe_matches_exon() only accepts exons with the probe's gene prefix, so
            # restricting the candidates to that gene gives the same matches, in the
            # same (file) order, as scanning the complete exon list.
            candidate_exons = exons_by_gene.get(probe['probe_id'].split('|')[0], [])
            matching_exons = [exon for exon in candidate_exons if probe_matches_exon(probe, exon)]

            if matching_exons:
                # Write the probe_id and matching exon_ids to the output file
                output_file.write(f'{probe["probe_id"]}\t{";".join([exon["exon_id"] for exon in matching_exons])}\n')

In [2]:
#in the above output, out of 41890 filt_unsp_probes only 41641 probes are there because of several reasons (documented)
#eg: some probes target transcripts not present in "all_exon_coordinates_with_id" file 
#for eg: ENSG00000015133|CCDC88C|37b6d1b, f3291e6 target "ENST00000331194"

#following chunk is to get  a list of remaining probes (249)
#first create a file "temp" by "cut -f 1 filt_unsp_probes_targeting_exons > temp" (to extract first column of probes)
#Now compare this list of probes with all the filtered unspliced probes and save non-common probes
#grep -Fvf temp filtered_unspliced_probes.tsv | awk -F'\t' '{print $1}' > filt_unsp_probes_having_no_target

## C. To check for the exons targeted by misidentified probes

In [None]:
#Misidentified Probes (which belong to unsp category in 10x list, diff of probe coordinates is more than 50bp)

In [5]:
#The following chunk compares coordinates of filt misid probes and pair of exons of same gene_id 

def extract_gene_id(id_string):
    """Return what comes before the first '|' of a probe_id or exon_id.

    Without a '|' in the string, the complete string is the result.
    """
    pipe_at = id_string.find('|')
    if pipe_at == -1:
        return id_string
    return id_string[:pipe_at]

from itertools import combinations

# Read exon data and group it by gene ID (the part of exon_id before the first '|').
# Each exon line is expected to hold: chromosome, start, end, exon_id (tab-separated).
exon_data_by_gene = {}
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/exons_in_filt_probes', 'r') as exon_file:
    for line in exon_file:
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) would break the unpacking below
            continue
        chromosome, exon_start, exon_end, exon_id = line.split('\t')
        exon_data_by_gene.setdefault(extract_gene_id(exon_id), []).append(
            (exon_id, int(exon_start), int(exon_end))
        )

# Read all probe lines; each is expected to hold: probe_id, start, end (tab-separated)
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filtered_misidentified_probes.tsv', 'r') as probe_file:
    probe_lines = probe_file.readlines()

# probe_id -> sorted list of exon_ids hit by the probe
probe_data = {}

for probe_line in probe_lines:
    probe_line = probe_line.strip()
    if not probe_line:
        continue
    probe_id, probe_start, probe_end = probe_line.split('\t')
    probe_start, probe_end = int(probe_start), int(probe_end)

    # Exons sharing the probe's gene ID; empty when the gene is absent from the exon file
    exon_gene_data = exon_data_by_gene.get(extract_gene_id(probe_id), [])

    # A set, so an exon that takes part in several matching pairs is stored once
    exon_ids = set()

    # Test every unordered pair of exons of the gene (not only neighbouring ones):
    # one exon must contain the probe start and the other the probe end.
    for (exon_id1, exon_start1, exon_end1), (exon_id2, exon_start2, exon_end2) in combinations(exon_gene_data, 2):
        if (exon_start1 <= probe_start <= exon_end1 and exon_start2 <= probe_end <= exon_end2) or \
           (exon_start2 <= probe_start <= exon_end2 and exon_start1 <= probe_end <= exon_end1):
            exon_ids.add(exon_id1)
            exon_ids.add(exon_id2)

    # Probes without any matching pair are left out of the result.
    # Sorting makes the exon order reproducible; iterating a set of strings
    # can give a different order on every run.
    if exon_ids:
        probe_data[probe_id] = sorted(exon_ids)

# Write one line per probe: probe_id <TAB> exon_ids joined by ';'.
# The with-block closes the file even if a write fails.
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_misid_probes_targeting_exons', 'w') as output_file:
    for probe_id, exon_ids in probe_data.items():
        exon_ids_str = ';'.join(exon_ids)
        output_file.write(f'{probe_id}\t{exon_ids_str}\n')

In [None]:
#in the above output, out of 4112 filtered_misidentified_probes only 4101 probes are there because of several reasons (documented)
#follow steps below to get  a list of remaining probes (11)
#first create a file "temp" by "cut -f 1 filt_misid_probes_targeting_exons > temp" (to extract first column of probes)
#Now compare this list of probes with all the filtered misid probes and save non-common probes
#grep -Fvf temp filtered_misidentified_probes.tsv | awk -F'\t' '{print $1}' > filt_misid_probes_having_no_target
#eg: TRIM16L|b3e755f, SIGLEC5|250bb03, VPS72|abf5189 (tx not present in "all_exon_coordinates_with_id" file)

In [2]:
##########****************** DO NOT RUN ********************########
#The following chunk compares coordinates of filt misid probes and two consecutive exons of same gene_id 

# # Define a function to extract the gene IDs from probe_id and exon_id
# def extract_gene_id(id_string):
#     return id_string.split('|')[0]

# # Read exon data into a list
# exon_data = []
# with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/exons_in_filt_probes', 'r') as exon_file:
#     for line in exon_file:
#         exon_data.append(line.strip().split('\t'))

# # Open the probe file for reading
# with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filtered_misidentified_probes.tsv', 'r') as probe_file:
#     probe_lines = probe_file.readlines()

# # Create a dictionary to store probe information with exon_ids
# probe_data = {}

# # Iterate through probe data
# for probe_line in probe_lines:
#     probe_id, probe_start, probe_end = probe_line.strip().split('\t')
#     probe_gene_id = extract_gene_id(probe_id)

#     # Create a list to store exon_ids for the current probe
#     exon_ids = []

#     # Iterate through exon data
#     for i in range(len(exon_data)):
#         chromosome, exon_start, exon_end, exon_id = exon_data[i]
#         exon_gene_id = extract_gene_id(exon_id)

#         # Check if the gene IDs match
#         if probe_gene_id == exon_gene_id:
#             exon_start, exon_end = int(exon_start), int(exon_end)
#             probe_start, probe_end = int(probe_start), int(probe_end)

#             # Check if the probe coordinates match the conditions
#             if i < len(exon_data) - 1:
#                 next_chromosome, next_exon_start, next_exon_end, next_exon_id = exon_data[i + 1]
#                 next_exon_start, next_exon_end = int(next_exon_start), int(next_exon_end)

#                 if exon_start <= probe_start <= exon_end and next_exon_start <= probe_end <= next_exon_end:
#                     exon_ids.extend([exon_id, next_exon_id])

#     # Add the exon_ids to the probe_data dictionary
#     probe_data[probe_id] = exon_ids

# # Create an output file to save the results
# output_file = open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_misid_probes_targeting_exons', 'w')

# # Write the merged probe and exon data to the output file
# for probe_id, exon_ids in probe_data.items():
#     if exon_ids:
#         exon_ids_str = ';'.join(exon_ids)
#         output_file.write(f'{probe_id}\t{exon_ids_str}\n')

# output_file.close()
#in the above output, out of 4112 filtered_misidentified_probes only 4070 probes are there because of several reasons (documented)

## D. All filtered probe target exons

In [None]:
#Now, goal is to prepare a single file having all the filtered probes and their targeted exons

In [6]:
# probe_id -> list of exon_ids collected from all input files (may hold repeats)
probe_exon_dict = {}

# Input files; each line is expected to be: probe_id <TAB> exon_ids joined by ';'
file_paths = ['/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_sp_probes_targeting_exons',
              '/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_unsp_probes_targeting_exons',
              '/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/filt_misid_probes_targeting_exons']

# Loop through the files and merge the contents
for file_path in file_paths:
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.strip().split('\t')
            if len(fields) < 2:
                # Blank or malformed line: there is no exon column to read
                continue

            # Strip every ID: the spliced-probe file may have been written with spaces
            # around the tab, which would otherwise leave 'probe_id ' and ' exon_id'
            # and make equal IDs look different.
            probe_id = fields[0].strip()
            exon_ids = [exon_id.strip() for exon_id in fields[1].split(';') if exon_id.strip()]
            if not exon_ids:
                continue

            probe_exon_dict.setdefault(probe_id, []).extend(exon_ids)

# Write the merged content to an output file: probe_id <TAB> unique exon_ids joined by ';'
output_file = '/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/ALL_filt_probes_targeting_exons'
with open(output_file, 'w') as out_file:
    for probe_id, exon_ids in probe_exon_dict.items():
        # sorted() gives a reproducible order; a bare set can be ordered differently on each run
        unique_exon_ids = ';'.join(sorted(set(exon_ids)))
        out_file.write(f"{probe_id}\t{unique_exon_ids}\n")

# Report where the merged table was written
print(f"Merged content written to {output_file}")

Merged content written to /work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/ALL_filt_probes_targeting_exons


In [None]:
#The above output contains list of all filtered probes and their target exons
#Out of 53504 probes, we found targets for only 53219 probes

In [None]:
#The above output file can be converted from ; separated to \t separated exon_ids
########  sed 's/;/\t/g' ALL_filt_probes_targeting_exons > tabsep_ALL_filt_probes_targeting_exons  #########

In [None]:
#Some lines have multiple exon_ids separated by a ";".
#To count how many unique exon_ids are there in total in "ALL_filt_probes_targeting_exons" file:

#cut -f 2 file.txt | tr ';' '\n' | sort | uniq | wc -l
#OR
#awk -F'\t' '{split($2, exon_ids, ";"); for (i in exon_ids) count[exon_ids[i]]++} END {print "Total unique exon_ids:", length(count)}' file.txt

In [None]:
#OR
# Create a set to store unique exon_ids
# unique_exon_ids = set()

# # Open the file and read line by line
# with open('file.txt', 'r') as file:
#     for line in file:
#         # Split the line by tab and get the exon_ids
#         fields = line.strip().split('\t')
#         exon_ids = fields[1].split(';')

#         # Add the exon_ids to the set
#         unique_exon_ids.update(exon_ids)

# # Count the number of unique exon_ids
# count = len(unique_exon_ids)
# print("Total unique exon_ids:", count)