In [63]:
import pandas as pd
# Load the daily Virus-Host DB dump (tab-separated) and show its column names.
df = pd.read_csv('../virushostdb.daily.tsv', sep='\t')
df.columns.values

array(['virus tax id', 'virus name', 'virus lineage', 'refseq id',
       'KEGG GENOME', 'KEGG DISEASE', 'DISEASE', 'host tax id',
       'host name', 'host lineage', 'pmid', 'evidence', 'sample type',
       'source organism'], dtype=object)

筛选phage-host条目

In [64]:
# Columns kept for the phage-host table, mapped from the original
# Virus-Host DB header to the snake_case name used in this notebook.
column_map = {
    'virus name': 'virus_name',
    'refseq id': 'refseq_id',
    'virus tax id': 'virus_taxid',
    'virus lineage': 'virus_lineage',
    'evidence': 'evidence',
    'host name': 'host_name',
    'host tax id': 'host_taxid',
    'host lineage': 'host_lineage',
}
column_list = list(column_map)

# Selecting with a list yields a new frame; rename gives it the new headers.
df2 = df[column_list].rename(columns=column_map)


In [65]:
# Drop every row that has a missing value in any of the kept columns
# (dropna defaults: axis=0, how='any').
df3 = df2.dropna()


In [66]:
def select_phage(x):
    """
    Flag whether a host lineage belongs to a prokaryotic host.

    The kingdom is taken as the first ';'-separated field of the lineage,
    with surrounding whitespace removed.

    Parameters
    ----------
    x : str
        Host lineage string, e.g. 'Bacteria; Proteobacteria; ...'.

    Returns
    -------
    int or None
        1 when the kingdom is 'Bacteria' or 'Archaea', 0 when it is
        'Eukaryota', and None when the kingdom is anything else or when
        ``x`` is not a string (e.g. NaN).
    """
    # Non-string values (NaN, None, ...) cannot be split. Report them as
    # undetermined explicitly instead of relying on a bare ``except`` that
    # would also hide unrelated errors.
    if not isinstance(x, str):
        return None

    kingdom = x.split(';')[0].strip()
    if kingdom in ('Bacteria', 'Archaea'):
        return 1
    if kingdom == 'Eukaryota':
        return 0
    # Any other kingdom is left undetermined.
    return None


# Keep only the rows for which select_phage returns 1 for the host lineage.
is_phage_host = df3['host_lineage'].apply(select_phage).eq(1)
df_phage = df3.loc[is_phage_host].copy()

In [67]:
# Cast host_taxid to int, then write the phage-host table as a TSV without the index.
df_phage = df_phage.astype({'host_taxid': int})
df_phage.to_csv(
    '../virushostdb.daily.phage.tsv',
    sep='\t',
    index=False,
)


发现24个phage有多个id

In [68]:

# Rows whose refseq_id contains a comma, i.e. more than one id is listed.
has_multiple_ids = df_phage['refseq_id'].str.contains(',')
df_phage_multiple = df_phage[has_multiple_ids]
df_phage_multiple.to_csv(
    '../virushostdb.daily.phage.multiple.tsv',
    sep='\t',
    index=False,
)

In [69]:
# Number of distinct virus names among the multi-id rows.
len(df_phage_multiple['virus_name'].unique())

24

检查后发现只有9个phage不完整，其他的只是同一个名称的不同测序结果，对不完整的phage直接进行删除

In [70]:

# Remove the 9 phages listed below as incomplete.
uncompleted_virus = [
    'Bacillus phage SPG24',
    'Pseudomonas phage phi12',
    'Pseudomonas phage phi13',
    'Pseudomonas phage phi2954',
    'Pseudomonas phage phi8',
    'Pseudomonas phage phiNN',
    'Pseudomonas phage phiYY',
    'Pseudomonas virus phi6',
    'Salmonella phage SP069',
]

is_incomplete = df_phage['virus_name'].isin(uncompleted_virus)
df_completed = df_phage[~is_incomplete]

# Show the table shape before and after the removal.
print(df_phage.shape)
print(df_completed.shape)

# Continue with the filtered table under the df_phage name.
df_phage = df_completed.copy()

(5204, 8)
(5195, 8)


In [71]:
# Save the phage-host table as a tab-separated file, without the index column.
df_phage.to_csv('../virushostdb.daily.phage.tsv', sep='\t', index=False)


VHDB phage name有重复的是因为一个phage可能记录了多个host

In [72]:
# Count how many rows each virus name has in the phage-host table.
df_phage['virus_name'].value_counts()


Salmonella phage OSY-STA           14
Enterobacteria phage PRD1           8
Bacillus phage vB_BsuM-Goe3         8
Bacillus phage SerPounce            7
Bacillus phage BPS10C               6
                                   ..
Lactococcus phage bIL311            1
Haloferax virus HF1                 1
Mycobacterium phage Magnar          1
Propionibacterium phage Anatole     1
Listeria phage LP-030-2             1
Name: virus_name, Length: 4782, dtype: int64

## 添加 tax lineage信息
根据host taxid生成tax lineage信息，只保留界门纲目科属种；如果某一级有缺失，则用上一级+unclassified填充

In [73]:
# cd ../
# mkdir -p temp/ lineage/
# !cut -f 7 virushostdb.daily.phage.tsv > temp/taxid.tsv
# !cat temp/taxid.tsv | taxonkit lineage | taxonkit reformat -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}" -P  | cut -f 3- |  tr '\t' ';' | tee temp/lineage.tsv

# !cat temp/taxid.tsv | taxonkit lineage -r -L | tee temp/rank.tsv
# !paste temp/rank.tsv temp/lineage.tsv | tail -n+2 > lineage/lineage_host.tsv
# !rm temp/taxid.tsv temp/lineage.tsv temp/rank.tsv


根据virus taxid生成界门纲目科属种的物种注释

In [74]:
# !cut -f 3 virushostdb.daily.phage.tsv > temp/taxid.tsv
# !cat temp/taxid.tsv | taxonkit lineage | taxonkit reformat -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}" -P  | cut -f 3- |  tr '\t' ';' | tee lineage/lineage_virus.tsv




In [77]:
# Read the host lineage table; the file has no header row, so the three
# columns are named here.
lineage_host = pd.read_csv(
    '../lineage/lineage_host.tsv',
    sep='\t',
    header=None,
    names=['host_taxid', 'host_rank', 'host_lineage'],
)
lineage_host

Unnamed: 0,host_taxid,host_rank,host_lineage
0,2148,species,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Ac...
1,2148,species,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Ac...
2,85698,species,k__Bacteria;p__Proteobacteria;c__Betaproteobac...
3,85698,species,k__Bacteria;p__Proteobacteria;c__Betaproteobac...
4,85698,species,k__Bacteria;p__Proteobacteria;c__Betaproteobac...
...,...,...,...
5190,632,species,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
5191,632,species,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
5192,632,species,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...
5193,632,species,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...


In [78]:
# Read the virus lineage table; the file has no header row and is read
# as a single column named virus_lineage.
lineage_virus = pd.read_csv(
    '../lineage/lineage_virus.tsv',
    sep='\t',
    header=None,
    names=['virus_lineage'],
)
lineage_virus


Unnamed: 0,virus_lineage
0,k__Viruses;p__Hofneiviricota;c__Faserviricetes...
1,k__Viruses;p__;c__;o__;f__Plasmaviridae;g__Pla...
2,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...
3,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...
4,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...
...,...
5190,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...
5191,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...
5192,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...
5193,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...


In [79]:
# The lineage columns are attached to df_phage purely by row position, so the
# lineage tables must have exactly one row per df_phage row, in the same order.
# Check this first: a stale or re-ordered lineage file would otherwise label
# rows with the wrong taxonomy without any error.
n_rows = len(df_phage)
if len(lineage_host) != n_rows or len(lineage_virus) != n_rows:
    raise ValueError(
        'lineage tables are out of sync with df_phage: '
        '{} phage rows, {} host lineage rows, {} virus lineage rows; '
        'regenerate the lineage files from the current phage table'.format(
            n_rows, len(lineage_host), len(lineage_virus)
        )
    )

# Row order check: the taxid recorded next to each host lineage must equal
# the host_taxid of the df_phage row it is about to be attached to.
taxid_mismatch = (
    lineage_host['host_taxid'].to_numpy() != df_phage['host_taxid'].to_numpy()
)
if taxid_mismatch.any():
    raise ValueError(
        'host_taxid differs between lineage_host and df_phage in {} row(s); '
        'regenerate the lineage files from the current phage table'.format(
            int(taxid_mismatch.sum())
        )
    )

# Lists are assigned positionally, ignoring the (non-contiguous) index of df_phage.
df_phage['host_rank'] = lineage_host['host_rank'].to_list()
df_phage['host_lineage'] = lineage_host['host_lineage'].to_list()
df_phage['virus_lineage'] = lineage_virus['virus_lineage'].to_list()

# Renumber the rows 0..n-1 and save the annotated table.
df_phage = df_phage.reset_index(drop=True)
df_phage.to_csv('../virushostdb.daily.phage.tsv', sep='\t', index=False)


In [80]:
# Display the current phage-host table.
df_phage


Unnamed: 0,virus_name,refseq_id,virus_taxid,virus_lineage,evidence,host_name,host_taxid,host_lineage,host_rank
0,Acholeplasma phage MV-L51,NC_001341,1977403,k__Viruses;p__Hofneiviricota;c__Faserviricetes...,"Literature, RefSeq",Acholeplasma laidlawii,2148,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Ac...,species
1,Acholeplasma virus L2,NC_001447,46014,k__Viruses;p__;c__;o__;f__Plasmaviridae;g__Pla...,"Literature, NCBI Virus, RefSeq",Acholeplasma laidlawii,2148,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Ac...,species
2,Achromobacter phage 83-24,NC_028834,1589747,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...,"Literature, RefSeq",Achromobacter xylosoxidans,85698,k__Bacteria;p__Proteobacteria;c__Betaproteobac...,species
3,Achromobacter phage JWAlpha,NC_023556,1416009,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...,"Literature, RefSeq",Achromobacter xylosoxidans,85698,k__Bacteria;p__Proteobacteria;c__Betaproteobac...,species
4,Achromobacter phage JWDelta,KF787094,1416008,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...,"Literature, NCBI Virus, RefSeq",Achromobacter xylosoxidans,85698,k__Bacteria;p__Proteobacteria;c__Betaproteobac...,species
...,...,...,...,...,...,...,...,...,...
5190,Yersinia phage YpP-G,JQ965702,1176764,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...,"Literature, NCBI Virus, RefSeq",Yersinia pestis,632,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...,species
5191,Yersinia phage YpP-R,JQ965701,1176765,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...,"Literature, NCBI Virus, RefSeq",Yersinia pestis,632,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...,species
5192,Yersinia phage YpP-Y,NC_047939,1176766,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...,RefSeq,Yersinia pestis,632,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...,species
5193,Yersinia phage YpsP-G,NC_047940,1176767,k__Viruses;p__Uroviricota;c__Caudoviricetes;o_...,RefSeq,Yersinia pestis,632,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...,species


In [81]:
# Count the rows per host taxonomic rank.
df_phage['host_rank'].value_counts()

species            3395
strain             1201
no rank             250
genus               158
serotype             72
subspecies           70
serogroup            20
order                12
family                7
superkingdom          4
species group         3
forma specialis       1
isolate               1
class                 1
Name: host_rank, dtype: int64

In [92]:
# Remove the rows whose host name is one of the two problematic names:
# 'bacterium' and 'unidentified bacterial endosymbiont'.
unidentified_host = [
    'bacterium',
    'unidentified bacterial endosymbiont',
]

has_unidentified_host = df_phage['host_name'].isin(unidentified_host)
df_phage = df_phage[~has_unidentified_host]
df_phage.to_csv(
    '../virushostdb.daily.phage.tsv',
    sep='\t',
    index=False,
)


In [93]:
# Keep only the rows whose host_rank is one of the ranks listed in
# under_species (species and the finer-grained ranks, plus 'no rank').
under_species = [
    'species',
    'strain',
    'no rank',
    'serotype',
    'subspecies',
    'serogroup',
    'forma specialis',
    'isolate',
]

rank_is_kept = df_phage['host_rank'].isin(under_species)
df_filtered = df_phage[rank_is_kept]

# Show the table shape before and after the rank filter.
print(df_phage.shape)
print(df_filtered.shape)

df_filtered.to_csv(
    '../virushostdb.daily.phage(filtered).tsv',
    sep='\t',
    index=False,
)


(5193, 9)
(5008, 9)


##  统计信息

In [94]:
# Row count, then the number of distinct host names and virus names in df_phage.
print(len(df_phage))
for column in ('host_name', 'virus_name'):
    print(len(df_phage[column].unique()))

5193
838
4781


In [95]:
# Row count, then the number of distinct host names and virus names in df_filtered.
print(len(df_filtered))
for column in ('host_name', 'virus_name'):
    print(len(df_filtered[column].unique()))


5008
787
4675
