## 根据条目表格生成LCA dict

In [13]:
# Build LCA_dict: one entry per virus name holding the virus id, the number of
# host rows seen for it, and the list of per-host records read from the table.
LCA_dict = {}
with open('virushostdb_output(filtered).tsv', 'r') as f:
    # Skip the header row.
    f.readline()
    for line in f:
        if not line.strip():
            # Blank line: there are no columns to index.
            continue
        # Remove only the line terminator. A bare strip() would also eat
        # leading/trailing tabs, so a row whose first or last columns are
        # empty would lose fields and the fixed indices below would shift
        # or raise IndexError.
        fields = line.rstrip('\r\n').split('\t')
        virus_name = fields[0]
        virus_id = fields[1]
        # Host columns used later for the LCA analysis.
        host_info = {'host_name': fields[7], 'host_taxid': fields[12],
                     'host_rank': fields[13], 'host_lineage': fields[14]}
        if virus_name not in LCA_dict:
            # First row for this virus: create its entry (key order is kept
            # as virus_id, host_num, host_info so serialised output is stable).
            LCA_dict[virus_name] = {'virus_id': virus_id,
                                    'host_num': 0, 'host_info': []}
        entry = LCA_dict[virus_name]
        entry['host_num'] += 1
        entry['host_info'].append(host_info)


In [14]:
from collections import Counter

# Map each position in a ';'-separated lineage string to its rank name.
taxonomy_level = {7: 'strain', 6: 'species', 5: 'genus',
                  4: 'family', 3: 'order', 2: 'class', 1: 'phylum', 0: 'kingdom'}


def _rank_name(lineage, index):
    """Return the name after '__' at position `index` of `lineage`, or '' if absent."""
    try:
        return lineage.split(';')[index].split('__')[1]
    except IndexError:
        # Lineage has fewer ranks than expected, or the field has no '__'.
        return ''


for virus_name in LCA_dict:
    virus_info = LCA_dict[virus_name]
    host_num = virus_info['host_num']
    # Collect every host lineage of this virus so ranks can be compared.
    lineage_list = [host_info['host_lineage']
                    for host_info in virus_info['host_info']]
    # Reset the result for every virus. Without this, a virus whose hosts
    # share no rank at all would silently inherit the previous virus's LCA
    # (or raise NameError if it is the first one). '-' marks "no LCA found".
    lca_level = lca_name = lca_lineage = '-'
    # Scan from the most specific rank (strain) to the least (kingdom); the
    # first rank at which all hosts carry the same non-empty name is the LCA.
    for i in sorted(taxonomy_level, reverse=True):
        taxonomy_list = [_rank_name(lineage, i) for lineage in lineage_list]
        maxcounttax, maxcount = Counter(taxonomy_list).most_common(1)[0]
        if maxcount == host_num and maxcounttax:
            lca_level = taxonomy_level[i]
            lca_name = maxcounttax
            # Lineage prefix up to and including the LCA rank, taken from
            # the first host.
            lca_lineage = ';'.join(lineage_list[0].split(';')[:i+1])
            break
    # Store the LCA result on the virus entry.
    virus_info['lca_level'] = lca_level
    virus_info['lca_name'] = lca_name
    virus_info['lca_lineage'] = lca_lineage


In [15]:
import json

# Serialise the whole LCA dictionary (4-space indented) before opening the
# output file, then write the text out in one go.
json_string = json.dumps(LCA_dict, indent=4)
with open('virushostdb_output_self_lca.json', mode='w') as f:
    f.write(json_string)


In [16]:
import pandas as pd

# Flatten LCA_dict into one row per virus: the virus name followed by the
# summary fields copied from its entry (the per-host records are left out).
table = [
    {
        "virus_name": name,
        'virus_id': info['virus_id'],
        'host_num': info['host_num'],
        'lca_level': info['lca_level'],
        'lca_name': info['lca_name'],
        'lca_lineage': info['lca_lineage'],
    }
    for name, info in LCA_dict.items()
]

df_lca = pd.DataFrame(table)
df_lca.to_csv('virushostdb_output_self_lca.tsv', sep='\t', index=False)


In [17]:
from pyecharts.charts import Pie
from pyecharts import options as opts

def draw_pie(df_, title):
    """Render the first two columns of ``df_`` as a pyecharts pie chart.

    Parameters
    ----------
    df_ : object supporting ``.iloc``
        Column 0 (by position) supplies the slice labels and column 1 the
        slice values; slices are ordered by value, largest first.
    title : str
        Output file stem: the chart is saved to ``'<title>.html'``. The
        heading drawn inside the chart is the fixed string "饼图".

    Returns
    -------
    The object returned by ``Pie.render_notebook()``. Returning it lets the
    chart display inline when the call is the last expression of a notebook
    cell; previously the value was discarded and nothing was shown.
    """
    counts = list(zip(df_.iloc[:, 0], df_.iloc[:, 1]))
    word_counts = sorted(counts, key=lambda x: x[1], reverse=True)

    # Build the chart.
    pie = (
        # Create the pie chart with an explicit canvas size.
        Pie(init_opts=opts.InitOpts(width="1000px", height="500px"))
        # Add the (label, value) pairs as a single unnamed series.
        .add(
            "",
            word_counts
        )
        # Label format: {b} = name, {c} = count, {d} = percentage.
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}({d}%)"))
        .set_global_opts(
            # Chart heading.
            title_opts=opts.TitleOpts(title="饼图"),
            # Legend layout; orient may be 'horizontal' or 'vertical'.
            legend_opts=opts.LegendOpts(
                orient="vertical", pos_top="0%", pos_right="8%"
            ),
            # Toolbox.
            toolbox_opts=opts.ToolboxOpts(
                is_show=True,
                pos_top="bottom",
                pos_left="right",
                feature={
                    "saveAsImage": {},
                    "dataZoom": {"yAxisIndex": "none"},
                    "restore": {},
                    "magicType": {"show": True, "type": ["line", "bar"]},
                    "dataView": {}}
            )
        )

    )
    pie.render(f'{title}.html')  # Save a standalone HTML copy to disk.
    return pie.render_notebook()


## lca 统计

In [18]:
prefix = 'virushostdb_output_self_lca'
# Host-number distribution: how many viruses have 1, 2, 3, ... hosts.
txt = df_lca['host_num'].value_counts()
# Turn the counts into a two-column frame (value, occurrences).
df_ = txt.rename_axis('Host Num').reset_index(name='Count')
# Share of each value among all viruses, rounded to 4 decimals.
total_count = df_['Count'].sum()
df_['Percent'] = df_['Count'].div(total_count).round(4)
df_.to_csv(f'{prefix}.num.tsv', sep='\t', index=False)
df_


Unnamed: 0,Host Num,Count,Percent
0,1,4094,0.9552
1,2,152,0.0355
2,3,14,0.0033
3,4,13,0.003
4,5,5,0.0012
5,6,5,0.0012
6,8,2,0.0005
7,14,1,0.0002


In [19]:
# Pie chart of the host-number distribution; draw_pie saves it as '<prefix>.num.html'.
draw_pie(df_,f'{prefix}.num')

In [20]:
# LCA-level distribution: how many viruses resolve to each taxonomic rank.
txt = df_lca['lca_level'].value_counts()
# Turn the counts into a two-column frame (rank, occurrences).
df_ = txt.rename_axis('Tax level').reset_index(name='Count')
# Share of each rank among all counted viruses, rounded to 4 decimals.
total_count = df_['Count'].sum()
df_['Percent'] = df_['Count'].div(total_count).round(4)
df_.to_csv(f'{prefix}.level.tsv', sep='\t', index=False)
df_


Unnamed: 0,Tax level,Count,Percent
0,species,3020,0.7046
1,strain,1186,0.2767
2,genus,58,0.0135
3,family,16,0.0037
4,order,3,0.0007
5,kingdom,2,0.0005
6,class,1,0.0002


In [21]:
# Pie chart of the LCA-level distribution; draw_pie saves it as '<prefix>.level.html'.
draw_pie(df_, f'{prefix}.level')


In [22]:
# 统计lca_lineage

# Position of each rank inside a ';'-separated lineage string; built once
# instead of on every call.
_LINEAGE_RANK_INDEX = {'strain': 7, 'species': 6, 'genus': 5, 'family': 4,
                       'order': 3, 'class': 2, 'phylum': 1, 'kingdom': 0}


def get_lineage_rank(x, rank):
    """Return the taxon name stored for `rank` in lineage string `x`.

    The lineage is split on ';' and the field at the rank's position is
    split on '__'; the part after the first '__' is returned.

    Parameters
    ----------
    x : str
        Lineage string. Non-string values (e.g. None or NaN) are tolerated.
    rank : str
        One of 'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species', 'strain'.

    Returns
    -------
    str
        The name at that rank, or '-' when the lineage does not reach that
        rank, the field has no '__', or `x` is not a string.

    Raises
    ------
    KeyError
        If `rank` is not one of the known rank names.
    """
    i = _LINEAGE_RANK_INDEX[rank]
    try:
        return x.split(';')[i].split('__')[1]
    except (AttributeError, IndexError, TypeError):
        # Only malformed or missing lineages are mapped to the placeholder;
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return '-'
# Per-rank breakdown of the LCA lineages. For every rank, from kingdom down
# to strain, count how many viruses carry each name at that rank and append a
# '## <rank>' heading followed by a tab-separated table to one report file.
# The context manager closes the file even if a step in the loop raises.
with open(f'{prefix}.lineage.md', 'w') as f:
    for rank in ('kingdom', 'phylum', 'class', 'order',
                 'family', 'genus', 'species', 'strain'):
        txt = df_lca['lca_lineage'].apply(
            lambda x, rank=rank: get_lineage_rank(x, rank)).value_counts()
        df_ = pd.DataFrame({rank: txt.index, 'Count': txt.values})
        # Share of each name among all viruses, rounded to 4 decimals.
        df_['Percent'] = (df_['Count'] / df_['Count'].sum()).round(4)
        f.write(f'## {rank}\n')
        f.write(df_.to_csv(sep='\t', index=False))