## Preparing the probes and exon files

### Find out list of individual genes and transcripts present in filtered probes

In [4]:
#First, make a file "gene_tx" with two tab-separated columns: probe_id and the ";"-separated transcripts (both fields copied from FILT_probe_file)
#NOTE: Include only the filtered probes (sp+unsp) in the "gene_tx" file
#awk 'BEGIN {FS=OFS="\t"} {sub(/\|[^|]*$/, "", $1); print}' gene_tx > gene_tx2 (this removes the probe_hash: the last (second) "|" in column 1 and everything after it)

In [None]:
#gene_tx was deleted after above command
#In the "gene_tx2" remove the duplicate lines 

In [2]:
# De-duplicate the two-column rows of gene_tx2 and write them to gene_tx3.
# A dict is used as an ordered set so the output keeps the order in which each
# row is first seen in the input; iterating a plain set of strings can give a
# different line order on every run because string hashing is randomized.
unique_lines = {}

with open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/gene_tx2", "r") as infile:
    for line in infile:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing one at the end of the file) holds no record
            continue
        # Exactly two tab-separated fields are expected; any other non-blank
        # line still raises ValueError here, as before.
        gene_id, transcript_id = line.split("\t")
        unique_lines[(gene_id, transcript_id)] = None

with open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/gene_tx3", "w") as outfile:
    for gene_id, transcript_id in unique_lines:
        outfile.write(f"{gene_id}\t{transcript_id}\n")

#After removing duplicate lines there are 20290 lines; a gene may still appear on several lines when it has different transcripts (tx)

In [None]:
#Now split the transcript ids so that there is one tx per line
#the code writes a separate line for each of the ";"-separated transcripts and removes duplicate lines
#the input file was later deleted from the folder; only the output file was kept

In [4]:
# Expand every "gene_id<TAB>tx1;tx2;..." row of gene_tx3 into one
# "gene_id<TAB>tx" row per transcript and drop repeated rows.
# The input is streamed line by line instead of being loaded whole, and a dict
# is used as an ordered set so the output keeps first-seen order; iterating a
# plain set of strings can give a different line order on every run.
unique_lines = {}

with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/gene_tx3', 'r') as infile:
    for line in infile:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing one at the end of the file) holds no record
            continue
        # Exactly two tab-separated fields are expected; any other non-blank
        # line still raises ValueError here, as before.
        gene_id, transcript_ids = line.split('\t')

        for transcript_id in transcript_ids.split(';'):
            if not transcript_id:
                # An empty token comes from a leading, trailing or doubled ';'
                # and would otherwise be written as a row with no transcript id.
                continue
            unique_lines[f'{gene_id}\t{transcript_id}'] = None

# Write the unique lines to the output file
with open('/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/gene_tx_id_in_filt_probes', 'w') as outfile:
    outfile.writelines(f'{line}\n' for line in unique_lines)

In [None]:
#The above output file contains gene_ids and their corresponding tx_ids from the filt_probes file.
#so 53817 lines (one for each transcript)

In [3]:
#A gene may have many transcripts. But some of those transcripts may not be present in filtered probes list.
#file "all_genes_transcript_ids.tsv" contains all transcripts (100525) of all genes (46639) as in transcripts_info.csv file
#file "tx_in_filt_probes" contains only those transcripts (53817) present in filt probes list

In [None]:
#There are 67 transcripts that are present in the filt_probes list but not in the transcripts_info.csv file
#these are listed in the file "tx_present_in_filt_probes_but_not_in_txinfo_file"
#so for probes of such tx we do not have any matching exons (e.g. ENST00000515828, ENST00000237449)

### pre-processing of "all_exon_coordinates_with_id"

In [None]:
#"all_exon_coordinates_with_id" file contains exons of all the transcripts. But we need to keep only those exons which
#belong to transcripts present in filtered probes. This is done in pre-processing step

In [None]:
#First we need to convert each exon_id to its tx_id by removing the first "." and everything after it

In [None]:
#cut -f 1,2,3,4 all_exon_coordinates_with_id > output
# awk 'BEGIN {FS=OFS="\t"} {sub(/\..*/, "", $4)}1' output > output2  (this truncates the exon_id in column 4 at the first ".", removing the "." and everything after it)

### create new exon_ids

In [None]:
#The "all_exon_coordinates_with_id" file contains many cases where one exon belongs to several transcripts of the same gene
#so there are duplicate exons (by coordinates):
#same coordinates but different exon_id, because they belong to different transcripts
#we need to avoid these duplicates in the hgtable

In [7]:
# Build the transcript_id -> gene_id lookup from the two-column mapping file.
# Rows that do not have exactly two tab-separated fields are ignored; if a
# transcript occurs on several rows, the last row read wins.
transcript_to_gene = {}
with open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/gene_tx_id_in_filt_probes", "r") as mapping_rows:
    for row in mapping_rows:
        columns = row.strip().split("\t")
        if len(columns) != 2:
            continue
        mapped_gene, tx = columns
        transcript_to_gene[tx] = mapped_gene

# Stream the four-column exon table and rewrite its fourth column from a
# transcript id to "<gene_id>|<column 2>-<column 3>" (columns 2 and 3 are
# presumably the exon start and end coordinates).
with open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/output2", "r") as exon_rows, \
        open("/work/FAC/FBM/DBC/cdessim2/default/amaurya/Exons/output_file", "w") as sink:
    for row in exon_rows:
        columns = row.strip().split("\t")
        if len(columns) != 4:
            # Only rows with exactly four fields are processed
            continue

        tx = columns[3]
        if tx not in transcript_to_gene:
            # Exon belongs to a transcript that is not in the mapping file
            continue

        columns[3] = f"{transcript_to_gene[tx]}|{columns[1]}-{columns[2]}"
        relabelled = "\t".join(columns) + "\n"

        # Rows that still contain "ENST00" anywhere after relabelling are dropped
        if "ENST00" in relabelled:
            continue
        sink.write(relabelled)

print("Replacement, formatting, and filtering complete.")

Replacement, formatting, and filtering complete.


In [None]:
#The above output has 561361 lines
#The original all_exon_coordinates_with_id file is keyed by transcript_id, so an exon that is present in many transcripts
#gives repeated lines with the same exon coordinates in output_file (these are removed below)
#To remove the duplicate lines:
# awk -F'\t' '!seen[$0]++' output_file > exons_in_filt_probes
# Now there are 261185 lines
#NOTE: these exons belong to all tx (53817) and genes (18085) present in the filtered probes list (sp+unsp)
#this file can be used as an exon_hgtable

### post-processing to get final hgtable

In [None]:
#the file "exons_in_filt_probes" created above during new exon_ids can be used as an exon_hgtable