In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
# from utils.fastas import *
# from utils.parse_fasta import parse_fasta

# Input / output locations used by the prophage-based host prediction cells below.
fna_dir = '/mnt/raid7/Dachuang/Achuan/03_phage_host/db/host'
blastn_dir = '/mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/03_blastn_virus'
# NOTE: the variable name is misspelled ("prophgae"); it is kept unchanged so that
# any existing reference to it keeps working.
prophgae_dir = '/mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/02_prophage'
output_dir = '/mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/05_calculate_filtered'

# JSON lookups are read through context managers so the file handles are closed
# as soon as parsing finishes instead of being left to the garbage collector.
# virus_json is indexed by BLAST subject id in calculate_prophage and the value is
# used there as the virus sequence length.
with open('/mnt/raid7/Dachuang/Achuan/03_phage_host/db/Virus_seqlen.json') as handle:
    virus_json = json.load(handle)

# virus_host_dict is keyed by virus accession without the version suffix
# (calculate_prophage compares `subject.split('.')[0]` against these keys).
with open('/mnt/raid7/Dachuang/Achuan/03_phage_host/db/gold_virus_info.json') as handle:
    virus_host_dict = json.load(handle)
virus_all_ids = list(virus_host_dict.keys())

def mkdir(dir):
    """
    Create the directory ``dir`` if it does not exist yet.

    Missing parent directories are created as well, and calling the function
    on a directory that already exists is a no-op, so the cell can be re-run
    safely.

    Parameters
    ----------
    dir : str or os.PathLike
        Path of the directory to create. (The name shadows the ``dir``
        builtin; it is kept so existing keyword callers keep working.)

    Raises
    ------
    FileExistsError
        If ``dir`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of
    # `if not os.path.exists(...): os.mkdir(...)`.
    os.makedirs(dir, exist_ok=True)
# Ensure the output directory exists before any later cell writes into it.
mkdir(output_dir)

In [2]:

def calculate_prophage(filename, min_identity=90.0, min_alignment_length=500,
                       multi_hit_bonus=0.1):
    """
    Score every virus hit in one BLASTN result and write the ranking to disk.

    Reads the headerless, tab-separated BLASTN table
    ``{blastn_dir}/{filename}/{filename}.tab``, keeps the alignments that pass
    the identity / length filters and whose subject accession (text before the
    first ``.``) is listed in ``virus_all_ids``, and computes for every
    remaining subject::

        score = max(alignment_length * identity / 100) / virus_length
                * (n_hits * multi_hit_bonus + 1)

    where ``virus_length = virus_json[subject]`` and ``n_hits`` is the number
    of retained alignments against that subject. The result is written to
    ``{output_dir}/{filename}.tsv`` as ``filename<TAB>subject<TAB>score`` lines
    ordered by descending score (ties keep first-appearance order).

    Coverage (prophage length / virus length) is deliberately not part of the
    score: a prophage region can be longer than the virus, so coverage does
    not indicate match quality.

    Parameters
    ----------
    filename : str
        Name of the sub-directory of ``blastn_dir`` (and of the ``.tab`` file
        inside it); also written as the first column of the output.
    min_identity : float, optional
        Alignments must have identity strictly greater than this (default 90.0).
    min_alignment_length : int, optional
        Alignments must be strictly longer than this (default 500).
    multi_hit_bonus : float, optional
        Weight added per retained alignment of the same subject (default 0.1).

    Returns
    -------
    None
        If the ``.tab`` file is empty the function returns without writing an
        output file. If no alignment survives filtering, an empty ``.tsv`` is
        written.

    Raises
    ------
    KeyError
        If a retained subject id is missing from ``virus_json``.
    """
    from pandas.errors import EmptyDataError

    # A zero-byte BLASTN table means there were no hits at all: nothing to score.
    try:
        df = pd.read_table(f"{blastn_dir}/{filename}/{filename}.tab", sep="\t", header=None)
    except EmptyDataError:
        return

    # Column names of the 12-column tabular BLASTN output.
    df.columns = ["query", "subject", "identity", "alignment_length", "mismatches",
                  "gap_openings", "q_start", "q_end", "s_start", "s_end", "evalue", "bit_score"]
    df = df[(df["identity"] > min_identity) & (df["alignment_length"] > min_alignment_length)]

    # Set lookup instead of scanning the id list once per alignment row.
    known_virus_ids = set(virus_all_ids)

    # Per-alignment metric: alignment_length * identity / 100, grouped by subject.
    # Iterating the columns directly avoids building a Series per row via df.iloc[i].
    hits_by_virus = {}
    for subject, alignment_length, identity in zip(
            df["subject"], df["alignment_length"], df["identity"]):
        # Skip subjects whose accession is not part of the known virus set.
        if subject.split('.')[0] not in known_virus_ids:
            continue
        hits_by_virus.setdefault(subject.strip(), []).append(
            alignment_length * identity / 100)

    # Final score: best single alignment normalised by the virus length, boosted
    # by a small coefficient for every alignment found against the same virus.
    scored = []
    for virus_id, matched_lengths in hits_by_virus.items():
        virus_length = virus_json[virus_id]
        duplicate_number = len(matched_lengths)
        score = np.max(matched_lengths) / virus_length * (duplicate_number * multi_hit_bonus + 1)
        scored.append((virus_id, score))

    # Stable sort: highest score first, ties stay in first-appearance order.
    scored.sort(key=lambda item: item[1], reverse=True)

    with open(f'{output_dir}/{filename}.tsv', 'w') as f:
        for virus_id, score in scored:
            f.write(f'{filename}\t{virus_id}\t{score}\n')


In [3]:
# Remove the score tables left by earlier runs from the output directory, so the
# concatenation step further down only sees files produced by this run.
! rm /mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/05_calculate_filtered/*

In [4]:
# Score every entry of the BLASTN result directory; each entry name is passed to
# calculate_prophage, which reads `<entry>/<entry>.tab` and writes `<entry>.tsv`.
files = os.listdir(f"{blastn_dir}")
for filename in tqdm(files):
    calculate_prophage(filename)



100%|██████████| 495/495 [00:23<00:00, 20.85it/s]


## 合并所有文件


In [5]:
# Concatenate all per-entry score tables into a single headerless TSV
# (three columns as written by calculate_prophage: file name, virus id, score).
! cat /mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/05_calculate_filtered/*.tsv > /mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/06_score/all_score_filtered.tsv

## 解析all_score_filtered.tsv

- task1： 获取每个virus-host 的 prophage score，对于每个virus，把分数最高的host当作该病毒的host
- task2： 给每个virus-host 分配标签，1代表正确，0代表错误

In [6]:
import pandas as pd
# Load the concatenated score table. The file has no header row, so the three
# columns are named explicitly: host (the per-entry file name), virus, score.
df_score=pd.read_table(
    '/mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/06_score/all_score_filtered.tsv', header=None)
df_score.columns = ['host', 'virus', 'score']

# Display the loaded table.
df_score


Unnamed: 0,host,virus,score
0,GCF_000005845.2,NC_049953.1,0.121097
1,GCF_000005845.2,NC_049955.1,0.107498
2,GCF_000005845.2,NC_042057.1,0.097676
3,GCF_000005845.2,NC_019723.1,0.090618
4,GCF_000005845.2,NC_019711.1,0.090304
...,...,...,...
3112,GCF_902387845.1,NC_019723.1,0.060082
3113,GCF_902387845.1,NC_019711.1,0.059831
3114,GCF_902387845.1,NC_001416.1,0.058308
3115,GCF_902387845.1,NC_049951.1,0.029198


In [7]:
import numpy as np
# Task 1: for every virus, treat the host(s) with the highest prophage score as
# the predicted host of that virus.
#
# Ties are kept on purpose: rank(method="min") assigns rank 1 to every host that
# shares the top score, so a virus can appear more than once in df_only.
# A strictly one-row-per-virus alternative would be:
#   df_score.sort_values(by='score', ascending=False).groupby(
#       'virus', as_index=False).first()
df_score["rank"] = df_score.groupby("virus")["score"].rank(
    method="min", ascending=False).astype(np.int64)
# .copy() makes df_only an independent frame; columns are added to it in a later
# cell, which on a plain slice of df_score raises SettingWithCopyWarning.
df_only = df_score[df_score["rank"] == 1][["virus", "host", "score"]].copy()
df_only


Unnamed: 0,virus,host,score
50,NC_002667.1,GCF_000006865.1,1.012755
51,NC_002668.1,GCF_000006865.1,0.874772
52,NC_002671.1,GCF_000006865.1,0.752724
53,NC_002666.1,GCF_000006865.1,0.672722
54,NC_002669.1,GCF_000006865.1,0.557117
...,...,...,...
3086,NC_027341.1,GCF_022494905.1,0.014574
3087,NC_029119.1,GCF_022531845.1,0.110021
3088,NC_019915.1,GCF_022531845.1,0.039915
3097,NC_024365.1,GCF_900235835.1,0.164177


In [8]:
# Task 2: label every virus-host pair, 1 = correct prediction, 0 = wrong prediction.
import json

# Both lookups are read through context managers so the file handles are closed
# right after parsing.
# host_lineage_dict: host id -> ';'-separated lineage string (indexed by rank in
# species_tag / genus_tag).
with open('/mnt/raid7/Dachuang/Achuan/03_phage_host/db/gcf_dict.json') as handle:
    host_lineage_dict = json.load(handle)
# virus_host_dict: virus accession (no version suffix) -> record whose
# 'split_lineage' entry holds the known host taxa per rank.
with open('/mnt/raid7/Dachuang/Achuan/03_phage_host/db/gold_virus_info.json') as handle:
    virus_host_dict = json.load(handle)

def _lineage_tag(row, rank_index, rank_name):
    """
    Return 1 if the host's taxon at one rank is among the virus's known hosts, else 0.

    Parameters
    ----------
    row : mapping
        Must provide ``row['host']`` (a key of ``host_lineage_dict``) and
        ``row['virus']`` (a virus id; the part before the first ``.`` must be
        a key of ``virus_host_dict``).
    rank_index : int
        Position of the rank inside the ';'-separated host lineage string.
    rank_name : str
        Key of that rank inside ``virus_host_dict[...]['split_lineage']``.

    Raises
    ------
    KeyError, IndexError
        If the host / virus is unknown or the lineage string lacks the rank.
    """
    host_id = row['host']
    virus_id = row['virus'].split('.')[0]
    # Lineage fields look like "<prefix>__<name>"; keep only the name part.
    host_taxonomy = host_lineage_dict[host_id].split(';')[rank_index].split('__')[1]
    virus_taxonomy_list = virus_host_dict[virus_id]['split_lineage'][rank_name]
    return 1 if host_taxonomy in virus_taxonomy_list else 0


def species_tag(df):
    """
    Species-level label for one virus-host row: 1 = match, 0 = no match.

    ``df`` is a single row (e.g. as passed by ``DataFrame.apply(axis=1)``)
    with ``'host'`` and ``'virus'`` entries. The species is field 6 of the
    host lineage string.
    """
    return _lineage_tag(df, 6, "species")


def genus_tag(df):
    """
    Genus-level label for one virus-host row: 1 = match, 0 = no match.

    ``df`` is a single row (e.g. as passed by ``DataFrame.apply(axis=1)``)
    with ``'host'`` and ``'virus'`` entries. The genus is field 5 of the
    host lineage string.
    """
    return _lineage_tag(df, 5, "genus")


In [9]:

# Label every row of df_score (1 = correct host, 0 = wrong host) at species and
# genus level, then save the labelled table.
df_score['species_tag'] = df_score.apply(species_tag, axis=1)
df_score['genus_tag'] = df_score.apply(genus_tag, axis=1)
df_score.to_csv(
    '/mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/06_score/all_score_filtered_tag.tsv',
    sep='\t',
    index=False,
)


In [10]:

# Label every row of df_only (1 = correct host, 0 = wrong host) at species and
# genus level, then save the labelled table.
df_only['species_tag'] = df_only.apply(species_tag, axis=1)
df_only['genus_tag'] = df_only.apply(genus_tag, axis=1)
df_only.to_csv(
    '/mnt/raid7/Dachuang/Achuan/03_phage_host/02_Prophage/06_score/only_score_filtered_tag.tsv',
    sep='\t',
    index=False,
)


In [11]:
# Display the top-ranked virus-host pairs together with their species / genus labels.
df_only

Unnamed: 0,virus,host,score,species_tag,genus_tag
50,NC_002667.1,GCF_000006865.1,1.012755,1,1
51,NC_002668.1,GCF_000006865.1,0.874772,1,1
52,NC_002671.1,GCF_000006865.1,0.752724,1,1
53,NC_002666.1,GCF_000006865.1,0.672722,1,1
54,NC_002669.1,GCF_000006865.1,0.557117,1,1
...,...,...,...,...,...
3086,NC_027341.1,GCF_022494905.1,0.014574,1,1
3087,NC_029119.1,GCF_022531845.1,0.110021,0,1
3088,NC_019915.1,GCF_022531845.1,0.039915,1,1
3097,NC_024365.1,GCF_900235835.1,0.164177,1,1


In [12]:
# Number of distinct viruses that have at least one row labelled correct at species level.
len(df_score.loc[df_score['species_tag'] == 1, 'virus'].unique())

548

In [13]:
# NOTE(review): `summary` is not defined in this notebook; presumably it comes
# from this star import of the project module utils.deal — confirm.
from utils.deal import *
print("all")


# Summarise all scored virus-host pairs.
summary(df_score)


all


Unnamed: 0,病毒数目,宿主数目,总样本,正样本,负样本,正确率,找到正确宿主的病毒,病毒Recover-in,病毒Recover-all
species,633,166,3117,1932,1185,0.6198,548,0.8657,0.1276
genus,633,166,3117,2205,912,0.7074,611,0.9652,0.1423


In [14]:
# Summarise only the top-ranked host(s) per virus.
print("only")

summary(df_only)


only


Unnamed: 0,病毒数目,宿主数目,总样本,正样本,负样本,正确率,找到正确宿主的病毒,病毒Recover-in,病毒Recover-all
species,633,106,635,513,122,0.8079,511,0.8073,0.119
genus,633,106,635,596,39,0.9386,594,0.9384,0.1383
