#### Conversion of ID identifiers to locus_tag identifiers in all supplemental files
This notebook converts all supplemental files that use IDs as gene/protein identifiers into supplemental files that use locus_tags as identifiers. This aligns the supplemental files with the format in which the gene and protein sequences are provided. In addition, using locus_tags will ease the comparison of this genome with others.
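Files whose names contain one of the following suffixes are converted: `_2`, `_3`, `_4`, `_5`, `_6`, `_7`, `_8`, `_9`, `_10`, `_11`.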

In [14]:
import pandas as pd
import os
import re

In [9]:
# Locations used throughout the notebook: supplemental files are read from
# INPUT, converted copies are written to OUTPUT, and the annotation GFF3
# files are read from GENOME_PATH.
INPUT = '../supplemental_files/'
OUTPUT = '../supplemental_files/ID_to_locus_tag'
GENOME_PATH = '../Assembly/'

# Create the output folder the first time the notebook is run.
if not os.path.exists(OUTPUT):
    os.mkdir(OUTPUT)

# Annotation tables, read as tab-separated files without a header row so the
# columns are addressed by integer position.
P_GFF = pd.read_csv(
    os.path.join(GENOME_PATH, 'Pst_104E_v13_p_ctg.anno.gff3'),
    sep='\t',
    header=None,
)
H_GFF = pd.read_csv(
    os.path.join(GENOME_PATH, 'Pst_104E_v13_h_ctg.anno.gff3'),
    sep='\t',
    header=None,
)

# File-name fragments ('_2' ... '_11') that select the supplemental files
# to convert.
sups = ['_%d' % number for number in range(2, 12)]

In [45]:
def getIdToLocusDict(p_gff=None, h_gff=None):
    '''Return a dictionary with key: ID and val: locus_tag.

    The mapping is built from the 'gene' and 'mRNA' rows (column 3) of the
    primary contig and haplotig gff3 DataFrames, whose attribute entries
    (column 9) look like:
    ID=evm.TU.pcontig_000.1;locus_tag=DK0911_00000;Name=EVM prediction pcontig_000.1

    p_gff, h_gff: header-less gff3 DataFrames (integer column labels 0-8).
        When omitted, the module-level P_GFF / H_GFF tables are used.

    Raises ValueError if a gene/mRNA row has no ID or no locus_tag attribute.
    If an ID occurs more than once a warning is printed and the last
    locus_tag seen wins.
    '''
    # Look the defaults up at call time so that reloading P_GFF / H_GFF in the
    # notebook is picked up without having to re-define this function.
    if p_gff is None:
        p_gff = P_GFF
    if h_gff is None:
        h_gff = H_GFF

    feature_types = ['mRNA', 'gene']
    p_gff_genes = p_gff.loc[p_gff[2].isin(feature_types)]
    h_gff_genes = h_gff.loc[h_gff[2].isin(feature_types)]

    dSeries = pd.concat([h_gff_genes[8], p_gff_genes[8]], ignore_index=True)

    # Anchor each key on the start of the string or a ';' so that the position
    # of the attribute does not matter and look-alike keys such as
    # 'old_locus_tag=' are never mistaken for 'locus_tag='.
    idSearch = re.compile(r'(?:^|;)\s*ID=([^;]*)')
    locusSearch = re.compile(r'(?:^|;)\s*locus_tag=([^;]*)')

    d = {}

    for attr in dSeries:
        id_match = idSearch.search(attr)
        locus_match = locusSearch.search(attr)
        if id_match is None or locus_match is None:
            raise ValueError(
                'gff3 attribute entry lacks ID and/or locus_tag: %r' % attr)
        key = id_match.group(1)
        val = locus_match.group(1)
        if key in d:
            print('Unexpected: ID: %s is already in dictionary!' % key)
        d[key] = val
    return d

In [46]:
def mapWithDict(x, used_dict):
    '''Translate x through used_dict.

    The literal string 'NaN' is returned unchanged. Any other value is
    looked up in used_dict and the mapped value is returned. If x is not a
    key of used_dict, a message is printed and the program is terminated
    via sys.exit() (raises SystemExit).
    '''
    # Imported here because `sys` is not among the notebook's top-level
    # imports; without it the not-found path failed with NameError instead
    # of exiting.
    import sys

    if x == 'NaN':
        return x
    if x in used_dict:
        return used_dict[x]
    print("x: %s\n is not in the dictionary mapping loci to id." %x)
    sys.exit()

In [47]:
# Paths of all entries in INPUT whose name contains at least one of the
# fragments listed in `sups`.
files_to_change = [
    os.path.join(INPUT, entry)
    for entry in os.listdir(INPUT)
    if any(fragment in entry for fragment in sups)
]

In [48]:
# Build the ID -> locus_tag lookup once, using the function's default gff3
# tables; the conversion loop below reads from this dictionary.
id_To_loc_dict = getIdToLocusDict()

In [81]:
# Matches every token that starts with 'evm' and runs up to the next
# whitespace character; each such token is treated as an ID to translate.
idSearch = re.compile(r'(evm.\S*)')


def _locus_for_match(match):
    '''re.sub callback: return the locus_tag for the ID matched by idSearch.

    Raises KeyError naming the offending ID when it is not in id_To_loc_dict.
    '''
    found = match.group(0)
    try:
        return id_To_loc_dict[found]
    except KeyError:
        raise KeyError(
            'ID %r has no locus_tag in id_To_loc_dict' % found) from None


# Write one '<name>_id_to_locus_tag.txt' copy per supplemental file into
# OUTPUT, with every ID replaced by its locus_tag and trailing whitespace
# stripped from each line.
for file in files_to_change:
    # Base name up to the first '.', e.g. 'Supplemental_file_2'.
    out_name = os.path.basename(file).split('.')[0]
    out_name = '%s_id_to_locus_tag.txt' % os.path.join(OUTPUT, out_name)
    print(out_name)
    with open(file) as in_fh, open(out_name, 'w') as out_fh:
        for line in in_fh:
            # Substitute each match exactly where it occurs. Chained
            # str.replace() calls would also rewrite the start of any longer
            # ID that shares a prefix (e.g. '...000.1' inside '...000.10'),
            # silently corrupting that identifier.
            new_line = idSearch.sub(_locus_for_match, line).rstrip()
            print(new_line, file=out_fh)

../supplemental_files/ID_to_locus_tag/Supplemental_file_11_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_3_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_2_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_9_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_8_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_6_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_5_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_10_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_7_id_to_locus_tag.txt
../supplemental_files/ID_to_locus_tag/Supplemental_file_4_id_to_locus_tag.txt
