In [None]:
from Bio import Entrez
import re
import sys

# Check where the current Python interpreter lives (debug aid, left disabled)
# print(f"当前Python解释器位置: {sys.executable}")

# Contact email registered on the Bio.Entrez module before any query is sent
Entrez.email = "yiheng.du@anu.edu.au"


def get_species_info(accession):
    """Fetch the summary title of one record from the NCBI nucleotide database.

    Despite the function name, the value returned is the ``Title`` field of
    the summary record (a free-text description such as
    "Uncultured bacterium clone 1-6 16S ribosomal R..."), not a parsed
    species name.

    Args:
        accession: Identifier passed as ``id`` to ``Entrez.esummary`` with
            ``db="nucleotide"``.

    Returns:
        The ``"Title"`` entry of the first record in the parsed summary.

    Raises:
        IndexError: If the parsed summary contains no records.
        Exception: Anything raised by ``Entrez.esummary`` or ``Entrez.read``
            propagates unchanged.
    """
    handle = Entrez.esummary(db="nucleotide", id=accession)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the handle, even when parsing the response fails.
        handle.close()
    return record[0]["Title"]


In [None]:
# Parse the BLAST results file
blast_results_file = 'blast_results.out'
output_file = 'blast_results_with_species.txt'

# Lookup table: accession -> title returned by get_species_info
species_info = {}

with open(blast_results_file, 'r') as file:
    for line in file:
        fields = line.strip().split('\t')
        # Lines without at least two tab-separated fields carry no subject ID.
        if len(fields) <= 1:
            continue
        # The accession is the part of the subject ID before the first dot.
        accession = fields[1].split('.')[0]
        # Query each accession only once, however many hits reference it.
        if accession in species_info:
            continue
        species_info[accession] = get_species_info(accession)




In [None]:
# Write the accession -> title lookup table to a tab-separated file.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# default encoding and can be read back by pd.read_csv, whose default is UTF-8.
with open(output_file, 'w', encoding='utf-8') as outfile:
    outfile.write("Accession\tSpecies\n")
    for accession, species in species_info.items():
        outfile.write(f"{accession}\t{species}\n")

print(f"结果已写入文件: {output_file}")

# 筛选比对的结果，每个样本使用 Identity % 最高的比对结果

In [6]:
import pandas as pd

def load_and_process_blast_results(file_path):
    """Load tabular BLAST hits and keep the best-identity hit per query.

    The input is read as a headerless, tab-separated table whose twelve
    columns are named positionally (see ``column_names`` below).

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with one row per distinct 'Query ID' - the row with the
        highest 'Identity %', the first such row in file order on ties -
        sorted by 'Query ID' and carrying a fresh 0..n-1 index. All twelve
        columns, including 'Query ID', are retained.
    """
    column_names = [
        "Query ID", "Subject ID", "Identity %", "Alignment Length", "Mismatches",
        "Gap Opens", "Q. Start", "Q. End", "S. Start", "S. End", "E-value", "Bit Score"]
    data = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

    # groupby(...).apply(...) is deliberately avoided: passing the grouping
    # column to apply is deprecated in pandas, and once it is excluded the
    # 'Query ID' column would be lost after reset_index(drop=True).
    # Rows with a missing query ID are dropped, as groupby would drop them.
    hits = data.dropna(subset=['Query ID'])

    # Stable descending sort: the best hit comes first within each query and
    # ties keep their file order, so keep='first' picks the same row that
    # nlargest(1, 'Identity %') would.
    ranked = hits.sort_values('Identity %', ascending=False, kind='stable')
    top_matches_per_query = ranked.drop_duplicates(subset='Query ID', keep='first')

    # Present the result ordered by query ID with a clean positional index.
    return (
        top_matches_per_query
        .sort_values('Query ID', kind='stable')
        .reset_index(drop=True)
    )

# Path of the tabular BLAST result file to filter
file_path = 'blast_results.out'  # same file name as blast_results_file above; change it to point at your own results

# Keep only the highest-identity hit for every query sequence
top_matches = load_and_process_blast_results(file_path)

# Print the first rows of the filtered table as a sanity check
print(top_matches.head())

# Optionally, save the filtered results to a new file (disabled)
# top_matches.to_csv('blast_results.otu', index=False)


                        Query ID              Subject ID  Identity %  \
0  gi|1003337407|emb|LN998017.1|  CP000975.892025.893540     100.000   
1   gi|1004613716|gb|KT906995.1|         JN535010.1.1474     100.000   
2   gi|1004613717|gb|KT906996.1|         JN535010.1.1474     100.000   
3   gi|1020273363|gb|KX010383.1|         KX823598.1.1500      97.410   
4   gi|1020273372|gb|KX010392.1|         JN868997.1.1538      99.707   

   Alignment Length  Mismatches  Gap Opens  Q. Start  Q. End  S. Start  \
0              1516           0          0        10    1525         1   
1                38           0          0         1      38         9   
2                38           0          0         1      38         9   
3               502          13          0       197     698       230   
4               682           2          0         1     682        53   

   S. End       E-value  Bit Score  
0    1516  0.000000e+00     2800.0  
1      46  3.120000e-10       71.3  
2      46  

In [25]:
# Display the per-query top matches. The saved output already contains a
# 'Subject ID1' column, which is only added by the merge cell further down
# (In [22]), so this cell was last run after that one.
top_matches

Unnamed: 0,Query ID,Subject ID,Identity %,Alignment Length,Mismatches,Gap Opens,Q. Start,Q. End,S. Start,S. End,E-value,Bit Score,Subject ID1
0,gi|1003337407|emb|LN998017.1|,CP000975.892025.893540,100.000,1516,0,0,10,1525,1,1516,0.000000e+00,2800.0,CP000975
1,gi|1004613716|gb|KT906995.1|,JN535010.1.1474,100.000,38,0,0,1,38,9,46,3.120000e-10,71.3,JN535010
2,gi|1004613717|gb|KT906996.1|,JN535010.1.1474,100.000,38,0,0,1,38,9,46,3.120000e-10,71.3,JN535010
3,gi|1020273363|gb|KX010383.1|,KX823598.1.1500,97.410,502,13,0,197,698,230,731,0.000000e+00,856.0,KX823598
4,gi|1020273372|gb|KX010392.1|,JN868997.1.1538,99.707,682,2,0,1,682,53,734,0.000000e+00,1249.0,JN868997
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4992,gi|973746054|gb|KU498241.1|,KU498241.1.1537,100.000,1537,0,0,1,1537,1,1537,0.000000e+00,2839.0,KU498241
4993,gi|974142236|ref|NR_134756.1|,HF947551.1.1410,100.000,1410,0,0,1,1410,1,1410,0.000000e+00,2604.0,HF947551
4994,gi|984880335|emb|LN876463.1|,FPLS01049704.1.1419,98.424,825,13,0,37,861,2,826,0.000000e+00,1469.0,FPLS01049704
4995,gi|985700639|emb|LN995858.1|,KY190907.1.1454,92.070,1261,82,15,178,1427,201,1454,0.000000e+00,1759.0,KY190907


# Add the species name

In [27]:
# Tab-separated accession/species table; same file name that the lookup cell
# writes via `output_file`.
species_file_path = 'blast_results_with_species.txt'





In [18]:
# Display top_matches again. This saved output (In [18]) has no 'Subject ID1'
# column, so it predates the merge cell (In [22]).
top_matches

Unnamed: 0,Query ID,Subject ID,Identity %,Alignment Length,Mismatches,Gap Opens,Q. Start,Q. End,S. Start,S. End,E-value,Bit Score
0,gi|1003337407|emb|LN998017.1|,CP000975.892025.893540,100.000,1516,0,0,10,1525,1,1516,0.000000e+00,2800.0
1,gi|1004613716|gb|KT906995.1|,JN535010.1.1474,100.000,38,0,0,1,38,9,46,3.120000e-10,71.3
2,gi|1004613717|gb|KT906996.1|,JN535010.1.1474,100.000,38,0,0,1,38,9,46,3.120000e-10,71.3
3,gi|1020273363|gb|KX010383.1|,KX823598.1.1500,97.410,502,13,0,197,698,230,731,0.000000e+00,856.0
4,gi|1020273372|gb|KX010392.1|,JN868997.1.1538,99.707,682,2,0,1,682,53,734,0.000000e+00,1249.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4992,gi|973746054|gb|KU498241.1|,KU498241.1.1537,100.000,1537,0,0,1,1537,1,1537,0.000000e+00,2839.0
4993,gi|974142236|ref|NR_134756.1|,HF947551.1.1410,100.000,1410,0,0,1,1410,1,1410,0.000000e+00,2604.0
4994,gi|984880335|emb|LN876463.1|,FPLS01049704.1.1419,98.424,825,13,0,37,861,2,826,0.000000e+00,1469.0
4995,gi|985700639|emb|LN995858.1|,KY190907.1.1454,92.070,1261,82,15,178,1427,201,1454,0.000000e+00,1759.0


In [35]:
# NOTE(review): `merged_data` is created by the merge cell that appears further
# down in the notebook (In [22]); on a fresh kernel run top-to-bottom this
# cell is reached before that name exists.
df = merged_data

# Distinct values of the 'Species' column
unique_species = df['Species'].unique()

# Wrap the unique values in a single-column DataFrame for display
unique_species_df = pd.DataFrame({'Unique Species': unique_species})
unique_species_df

Unnamed: 0,Unique Species
0,Uncultured bacterium clone 1-6 16S ribosomal R...
1,Uncultured Verrucomicrobia bacterium clone KWK...
2,Uncultured bacterium clone AKAU3464 16S riboso...
3,Uncultured Spartobacteria bacterium clone CA19...
4,Uncultured bacterium clone NC2 16S ribosomal R...
...,...
1042,Uncultured bacterium clone PZ1G2 16S ribosomal...
1043,Uncultured Verrucomicrobium sp. clone W2-1 16S...
1044,Uncultured bacterium clone SING824 16S ribosom...
1045,Uncultured bacterium clone G90 16S ribosomal R...


In [22]:
# Load the accession -> species table (first line of the file is the header).
species_data = pd.read_csv(species_file_path, sep='\t', header=0)

# Derive the join key: the part of each subject ID before the first dot.
# This adds the 'Subject ID1' column to top_matches in place.
top_matches['Subject ID1'] = [
    subject_id.split('.')[0] for subject_id in top_matches['Subject ID']
]

# Default (inner) merge: top matches whose accession is missing from the
# species table do not appear in merged_data.
merged_data = top_matches.merge(
    species_data,
    left_on='Subject ID1',
    right_on='Accession',
)

In [24]:
# Save the merged table as a tab-separated file, without the index column.
merged_data.to_csv('merged_blast_results.txt', sep='\t', index=False)

In [36]:
# Display the merged table. The saved output ends at row 2966, versus 4995 for
# top_matches: the inner merge keeps only hits whose accession is present in
# the species table.
merged_data

Unnamed: 0,Query ID,Subject ID,Identity %,Alignment Length,Mismatches,Gap Opens,Q. Start,Q. End,S. Start,S. End,E-value,Bit Score,Subject ID1,Accession,Species
0,gi|1020273363|gb|KX010383.1|,KX823598.1.1500,97.410,502,13,0,197,698,230,731,0.000000e+00,856.0,KX823598,KX823598,Uncultured bacterium clone 1-6 16S ribosomal R...
1,gi|1031987993|gb|KX270090.1|,KX823598.1.1500,98.570,769,11,0,99,867,230,998,0.000000e+00,1360.0,KX823598,KX823598,Uncultured bacterium clone 1-6 16S ribosomal R...
2,gi|151351578|gb|EF664287.1|,KX823598.1.1500,98.680,606,8,0,185,790,230,835,0.000000e+00,1075.0,KX823598,KX823598,Uncultured bacterium clone 1-6 16S ribosomal R...
3,gi|151352453|gb|EF665162.1|,KX823598.1.1500,96.332,736,21,6,199,929,230,964,0.000000e+00,1205.0,KX823598,KX823598,Uncultured bacterium clone 1-6 16S ribosomal R...
4,gi|151505628|gb|EF650958.1|,KX823598.1.1500,98.738,713,9,0,187,899,230,942,0.000000e+00,1267.0,KX823598,KX823598,Uncultured bacterium clone 1-6 16S ribosomal R...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2963,gi|95115330|gb|DQ501372.1|,KP636094.1.1512,98.000,700,10,4,5,700,799,1498,0.000000e+00,1216.0,KP636094,KP636094,Uncultured bacterium clone PZ1G2 16S ribosomal...
2964,gi|953761809|gb|KT880277.1|,KT880277.1.1503,100.000,1503,0,0,1,1503,1,1503,0.000000e+00,2761.0,KT880277,KT880277,Uncultured Verrucomicrobium sp. clone W2-1 16S...
2965,gi|953761817|gb|KT880281.1|,HM129413.1.1484,100.000,39,0,0,20,58,1,39,8.830000e-11,73.1,HM129413,HM129413,Uncultured bacterium clone SING824 16S ribosom...
2966,gi|969811500|dbj|LC017441.1|,JF429016.1.1540,99.267,409,3,0,1,409,537,945,0.000000e+00,739.0,JF429016,JF429016,Uncultured bacterium clone G90 16S ribosomal R...
