In [108]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import ExactPosition
from Bio.SeqFeature import FeatureLocation
import pandas as pd
import os
import zlib
import gzip

In [109]:
DEBUG = False

if DEBUG :
    ids='rso_test'
    gbks = '/Users/devseeva/Desktop/work/sm_workflow/geneTrack/inputs/panX/'+ids
    core_ogs = '/Users/devseeva/Desktop/work/rso/OrthoFinder/orthoF_msa_'+ids+'/Species_Tree/Orthogroups_for_concatenated_alignment.txt'
    ogs2prot = '/Users/devseeva/Desktop/work/rso/OrthoFinder/orthoF_msa_'+ids+'/Orthogroups/Orthogroups.tsv'
else:
    gbks = snakemake.input['gbs']
    core_ogs = snakemake.input["core_ogs"]
    ogs2prot = snakemake.input["ogs2prot"]

In [110]:
# get core protein gene ids

f = open(core_ogs,"r")
ogs = []
for l in f:
    ogs.append(l.replace('\n',''))
f.close()

tab_prot = pd.read_csv(ogs2prot, sep='\t', index_col=0)  
tab_prot = tab_prot.loc[ogs]
tab_prot = tab_prot.rename(columns=lambda x: str(x).replace('_protein','')) # ncbi file names correction
tab_prot

Unnamed: 0_level_0,GCF_001887535.1_ASM188753v1,GCF_013306235.1_ASM1330623v1,GCF_013306335.1_ASM1330633v1,GCF_013306435.1_ASM1330643v1,GCF_013306935.1_ASM1330693v1,GCF_013375735.1_ASM1337573v1
Orthogroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
OG0000225,WP_003261911.1,WP_003261911.1,WP_003261911.1,WP_003261911.1,WP_003261911.1,WP_003261911.1
OG0000226,WP_003261929.1,WP_003261929.1,WP_003261929.1,WP_003261929.1,WP_003261929.1,WP_003261929.1
OG0000227,WP_003261939.1,WP_003261939.1,WP_003261939.1,WP_003261939.1,WP_003261939.1,WP_003261939.1
OG0000228,WP_003261941.1,WP_003261941.1,WP_003261941.1,WP_003261941.1,WP_003261941.1,WP_003261941.1
OG0000229,WP_003262123.1,WP_003262123.1,WP_003262123.1,WP_003262123.1,WP_003262123.1,WP_003262123.1
...,...,...,...,...,...,...
OG0004089,WP_155772886.1,WP_148668905.1,WP_148668915.1,WP_148668915.1,WP_148668915.1,WP_155772886.1
OG0004091,WP_155773184.1,WP_173940865.1,WP_173941733.1,WP_119889930.1,WP_173952290.1,WP_086706436.1
OG0004092,WP_155773186.1,WP_144061950.1,WP_119417730.1,WP_119889926.1,WP_119417730.1,WP_038938529.1
OG0004093,WP_155773188.1,WP_155739038.1,WP_155772892.1,WP_155772892.1,WP_155772892.1,WP_155773188.1


In [111]:
# table to list
core_prot_genes = set(tab_prot.values.flatten())
len(core_prot_genes)

8460

# Parsing Prokka or NCBI generated GB files 
Saved info about each protein of each strain 
+ get strains name as DB id
+ as id for NCBI proteins use protein_id, for Prokka use locus_tag

## Get proteins location information from GBs

All required information:
+ .type
+ .location
+ .qualifiers['product']
+ .qualifiers['protein_id']
+ .qualifiers['translation']

Note: do not consider fuzzy positions!

## TODO how to consider fuzzy positions?

In [112]:
genomes2prot_db = {}

for entry in os.scandir(gbks):
    if (entry.path.endswith(".gbk") and entry.is_file()):
        
        print('Genome:', entry)
        count = 0
        dic_prot_db = {}
        for record in SeqIO.parse(entry.path, "gb"):
            print('Chr',record.id, 'with', len(record.features), 'features')
            #print(record.features[2])

            for f in record.features:
                if f.type == 'source':
                    strain = ''.join(f.qualifiers['strain']).replace(' ', '_')
                if f.type == 'CDS' and \
                    'locus_tag' in f.qualifiers.keys() and 'translation' in f.qualifiers.keys():
                    
                    if 'protein_id' in f.qualifiers.keys(): 
                        acc = f.qualifiers['protein_id'][0]
                    else:
                        acc = f.qualifiers['locus_tag'][0]

                    if acc in core_prot_genes:
                        #--------
                        cds_info = {'chr':record.id,
                                    #'location': f.location,
                                    'nucl':f.location.extract(str(record.seq)), 
                                    #zlib.compress(str(f.location.extract(str(record.seq))).encode()),                             
                                    'product': f.qualifiers['product'],
                                    'acc':acc
                                    #'aa':f.qualifiers['translation'][0][1:]
                                    }

                        # to remove the stop codon seq       
                        if type(f.location.start) is ExactPosition and type(f.location.end) is ExactPosition:
                            codon_start = int(f.qualifiers['codon_start'][0])-1                 
                            cds_info['nucl'] = cds_info['nucl'][codon_start:-3]

                        if len(f.qualifiers['translation'][0])*3 != len(cds_info['nucl']):
                            print('Length!', len(f.qualifiers['translation'][0])*3, len(cds_info['nucl']))

                        if(f.qualifiers['translation'][0][1:] != Seq(cds_info['nucl'][3:]).translate()):
                            print(acc)
                            print('must:',f.qualifiers['translation'][0][1:])
                            print('is:',Seq(cds_info['nucl'][3:]).translate())
                            count = count +1
                        
                        cds_info['nucl'] = zlib.compress(str(cds_info['nucl']).encode())

                        dic_prot_db[acc] = cds_info
                        #------------
        genomes2prot_db[strain+'$'+entry.name.replace('.gbk','').replace('_genomic','')] = dic_prot_db
    print()

Genome: <DirEntry 'GCF_013306935.1_ASM1330693v1_genomic.gbk'>
Chr NZ_CP052126.1 with 7264 features
Chr NZ_CP052127.1 with 3123 features

Genome: <DirEntry 'GCF_013306235.1_ASM1330623v1_genomic.gbk'>
Chr NZ_CP052076.1 with 6856 features
Chr NZ_CP052077.1 with 3177 features


Genome: <DirEntry 'GCF_013306435.1_ASM1330643v1_genomic.gbk'>
Chr NZ_CP052096.1 with 6822 features
Chr NZ_CP052097.1 with 3221 features


Genome: <DirEntry 'GCF_001887535.1_ASM188753v1_genomic.gbk'>
Chr NZ_CP016554.1 with 7588 features
Chr NZ_CP016555.1 with 3271 features


Genome: <DirEntry 'GCF_013306335.1_ASM1330633v1_genomic.gbk'>
Chr NZ_CP052086.1 with 7262 features
Chr NZ_CP052087.1 with 3127 features

Genome: <DirEntry 'GCF_013375735.1_ASM1337573v1_genomic.gbk'>
Chr NZ_CP056085.1 with 7282 features
Chr NZ_CP056086.1 with 3165 features



In [113]:
if DEBUG :
    pd.DataFrame.from_dict(genomes2prot_db).to_pickle('../'+ids+'_outputs/GENOME_PROTEIN_DB_'+ids+'.pkl')
else:
    pd.DataFrame.from_dict(genomes2prot_db).to_pickle(snakemake.output[0])
    
genomes2prot_db#['pe1$pe1']['WP_165858718.1']

{'FJAT1303.F50$GCF_013306935.1_ASM1330693v1': {'WP_071507848.1': {'chr': 'NZ_CP052126.1',
   'nucl': b'x\x9cE\x94\xd1\x91\x031\x08Ckc\xf8\xa0\x01\xfa\xaf%\xe8I\xde\\2{\x8e\r\x02$yk\xa7k\xa6V\x7fs\xeb\xfb}\xfff\xbb\xa7\xfb\x8ez{\xea>z\xf6N5\'U\xf7\xdd\xbd\xc7\xed\xcc\n@k\x1d)&\x8b\xdb[\xa5\x93[\xc0\xba\x02E\xee\x8c8\xea\xd4\xad\xd6\x80T\xbc\x90\x19\xe3\x8f\xdaP{\x97C\xc2\xd5\xd2N\x13\t\xa6\xe2\x9dr\xf1\x9e\xc8uY+s\x03\xdb<\x04\x03l\xf3\x04\xb5\x9d\xc8\tXM0=~G\xec\xab\xe0uVo\xdfD1p\x06\x9a24G\x009\xd8\xc3.\x1d\x8c\x10Rb\x08\x98p\x1e\t(\xaca3\x88\x18\xd2\xe1\xe3W\xacz\x88\xea\x10\x12\xe4}SxJ\xce\xb5\xb5\xaeE\xda\xa2i\x9b\x8c\x00*\x1e-\x87\x1c\xd8k\xc7\xad\x1b\xbc\x13\x9a\x96\x04\xdaj\xeb\xab\xcd\xa5=\x17vce\xe1\x98\x9a\xcf\xbc6"P\xdbG\xd4%\xcb#xGt\nVO\xe8\xa07zp0S_\x03\x83\x1f\x16\x05]]]\xd9f\xd05\x01\xccg\x0b\x0bo\xd9H"A\x11E!\xd9\xcc\x84\x97\xe8B\xd6\xadX\xa5\xec\xe1X\xbacW\xa0\x88\xc1\x8f\x08\xd469\xbd98\xa6]e\xed\x9b\x17\xdb\xa7\x85\xe1w\x02\xf0\x00\x85`\xcf8\xb9&| \xb0\xfcuwE\xfbvx\x

In [114]:
#pd.DataFrame.from_dict(genomes2prot_db).to_csv('hghg.csv')
#genomes2prot_db['FJAT1303.F50$GCF_013306935.1_ASM1330693v1']['WP_071507848.1']['nucl']

In [115]:
#zlib.decompress(genomes2prot_db['FJAT1303.F50$GCF_013306935.1_ASM1330693v1']['WP_071507848.1']['nucl']).decode()