# Sunburst plot of the taxonomic lineage of organisms in the dataset

In [1]:
import os
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Directory holding the UniProt summary tables. The cluster location below
# stays the default; set the UNIPROT_DATA_DIR environment variable to run the
# notebook against a copy stored elsewhere without editing this cell.
path = os.environ.get(
    "UNIPROT_DATA_DIR",
    "/nfs/scratch/pinder/negative_dataset/my_repository/datasets/no_duplicates/uniprot",
)
# Tab-separated table with the Species / Tax ID / Count columns.
count_path = os.path.join(path, "uniprot_species_distribution.tsv.new")
# Tab-separated table with one lineage row (tax_id plus rank columns) per organism.
lineage_path = os.path.join(path, "uniprot_lineage.tsv")

In [3]:
# Load the per-species counts; read_table parses tab-separated text by default.
count_df = pd.read_table(count_path)

count_df.head()

Unnamed: 0,Species,Tax ID,Count
0,Homo sapiens,9606,5562
1,Saccharomyces cerevisiae,559292,1731
2,Mus musculus,10090,1425
3,Escherichia coli,83333,1304
4,Arabidopsis thaliana,3702,575


In [4]:
# Load the lineage table; read_table parses tab-separated text by default.
lineage_df = pd.read_table(lineage_path)

lineage_df.head()

Unnamed: 0,tax_id,scientific_name,no rank,domain,kingdom,phylum,subphylum,superclass,class,superorder,order,suborder,infraorder,parvorder,superfamily,family,subfamily,genus,species
0,9606,Homo sapiens,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,Sarcopterygii,Mammalia,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea,Hominidae,Homininae,Homo,Homo sapiens
1,559292,Saccharomyces cerevisiae (strain ATCC 204508 /...,cellular organisms,Eukaryota,Fungi,Ascomycota,Saccharomycotina,,Saccharomycetes,,Saccharomycetales,,,,,Saccharomycetaceae,,Saccharomyces,Saccharomyces cerevisiae
2,10090,Mus musculus,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,Sarcopterygii,Mammalia,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Mus,Mus musculus
3,83333,Escherichia coli (strain K12),cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,,,Gammaproteobacteria,,Enterobacterales,,,,,Enterobacteriaceae,,Escherichia,Escherichia coli
4,3702,Arabidopsis thaliana,cellular organisms,Eukaryota,Viridiplantae,Streptophyta,Streptophytina,,Magnoliopsida,,Brassicales,,,,,Brassicaceae,,Arabidopsis,Arabidopsis thaliana


In [5]:
# Attach each organism's lineage to its count row by matching taxonomy IDs.
# This is an inner join (the pd.merge default), so an ID present in only one
# of the two tables does not appear in the result.
merged_df = pd.merge(
    left=count_df,
    right=lineage_df,
    how="inner",
    left_on="Tax ID",
    right_on="tax_id",
)
merged_df.head()

Unnamed: 0,Species,Tax ID,Count,tax_id,scientific_name,no rank,domain,kingdom,phylum,subphylum,...,superorder,order,suborder,infraorder,parvorder,superfamily,family,subfamily,genus,species
0,Homo sapiens,9606,5562,9606,Homo sapiens,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,...,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea,Hominidae,Homininae,Homo,Homo sapiens
1,Saccharomyces cerevisiae,559292,1731,559292,Saccharomyces cerevisiae (strain ATCC 204508 /...,cellular organisms,Eukaryota,Fungi,Ascomycota,Saccharomycotina,...,,Saccharomycetales,,,,,Saccharomycetaceae,,Saccharomyces,Saccharomyces cerevisiae
2,Mus musculus,10090,1425,10090,Mus musculus,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,...,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Mus,Mus musculus
3,Escherichia coli,83333,1304,83333,Escherichia coli (strain K12),cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,,...,,Enterobacterales,,,,,Enterobacteriaceae,,Escherichia,Escherichia coli
4,Arabidopsis thaliana,3702,575,3702,Arabidopsis thaliana,cellular organisms,Eukaryota,Viridiplantae,Streptophyta,Streptophytina,...,,Brassicales,,,,,Brassicaceae,,Arabidopsis,Arabidopsis thaliana


In [6]:
# Report every row whose species label from the counts table differs from the
# species-rank name in the lineage table (compared case-insensitively).
for listed_name, lineage_name in zip(merged_df["Species"], merged_df["species"]):
    if str(listed_name).lower() == str(lineage_name).lower():
        continue
    print(f"Mismatch: {listed_name} vs {lineage_name}")

Mismatch: Chaetomium thermophilum vs Thermochaetoides thermophila
Mismatch: Influenza A vs Alphainfluenzavirus influenzae
Mismatch: Human immunodeficiency vs Lentivirus humimdef1
Mismatch: Salmonella typhimurium vs Salmonella enterica
Mismatch: Escherichia phage vs Peduovirus P2
Mismatch: Enterobacteria phage vs Enterobacteria phage M13
Mismatch: Synechocystis sp. vs Synechocystis sp. PCC 6803
Mismatch: Human herpesvirus vs Rhadinovirus humangamma8
Mismatch: Nostoc sp. vs Nostoc sp. (strain PCC 7120 / SAG 25.82 / UTEX 2576)
Mismatch: Pseudomonas phage vs Casadabanvirus JBD5
Mismatch: Lactococcus lactis vs Lactococcus lactis subsp. cremoris
Mismatch: Vaccinia virus vs Orthopoxvirus vaccinia
Mismatch: Human cytomegalovirus vs Cytomegalovirus humanbeta5
Mismatch: Epstein-Barr virus vs Lymphocryptovirus humangamma4
Mismatch: Mycoplasma pneumoniae vs Mycoplasmoides pneumoniae
Mismatch: Human adenovirus vs Human mastadenovirus C
Mismatch: Foot-and-mouth disease vs Foot-and-mouth disease viru

In [7]:
# Clean lineage by removing associations following NA values of higher ranks:
# once a rank is missing in a row, every later rank listed in `ranks` is blanked
# too, so no row keeps a child label underneath a missing parent.
# Only the columns named here are touched; they are ordered broadest to most specific.
ranks = ['no rank', 'domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
# NOTE(review): 'no rank' comes first, so a row without a 'no rank' value has
# every other rank (domain through species) blanked — confirm this is intended.

# convert blanks to NA so the hierarchy stops cleanly
lineage = merged_df[ranks].replace('', pd.NA)

# True where some rank strictly to the left (i.e. higher) is missing. The running
# maximum over the NA mask flags the first gap and everything after it; shifting
# one column to the right leaves the gap cell itself untouched.
parent_missing = lineage.isna().cummax(axis=1).shift(1, axis=1, fill_value=False)

# Vectorised equivalent of walking the frame row by row with iterrows()/.at.
merged_df[ranks] = lineage.mask(parent_missing, pd.NA)

merged_df.head(n=20)


Unnamed: 0,Species,Tax ID,Count,tax_id,scientific_name,no rank,domain,kingdom,phylum,subphylum,...,superorder,order,suborder,infraorder,parvorder,superfamily,family,subfamily,genus,species
0,Homo sapiens,9606,5562,9606,Homo sapiens,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,...,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea,Hominidae,Homininae,Homo,Homo sapiens
1,Saccharomyces cerevisiae,559292,1731,559292,Saccharomyces cerevisiae (strain ATCC 204508 /...,cellular organisms,Eukaryota,Fungi,Ascomycota,Saccharomycotina,...,,Saccharomycetales,,,,,Saccharomycetaceae,,Saccharomyces,Saccharomyces cerevisiae
2,Mus musculus,10090,1425,10090,Mus musculus,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,...,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Mus,Mus musculus
3,Escherichia coli,83333,1304,83333,Escherichia coli (strain K12),cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,,...,,Enterobacterales,,,,,Enterobacteriaceae,,Escherichia,Escherichia coli
4,Arabidopsis thaliana,3702,575,3702,Arabidopsis thaliana,cellular organisms,Eukaryota,Viridiplantae,Streptophyta,Streptophytina,...,,Brassicales,,,,,Brassicaceae,,Arabidopsis,Arabidopsis thaliana
5,Rattus norvegicus,10116,470,10116,Rattus norvegicus,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,...,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Rattus,Rattus norvegicus
6,Trypanosoma brucei,185431,458,185431,Trypanosoma brucei brucei (strain 927/4 GUTat1...,cellular organisms,Eukaryota,,,,...,,,,,,,,,,
7,Tetrahymena thermophila,312017,457,312017,Tetrahymena thermophila (strain SB210),cellular organisms,Eukaryota,,,Intramacronucleata,...,,,Tetrahymenina,,,,,,,
8,Pseudomonas aeruginosa,208964,441,208964,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Pseudomonas aeruginosa group,Bacteria,Pseudomonadati,Pseudomonadota,,...,,Pseudomonadales,,,,,Pseudomonadaceae,,Pseudomonas,Pseudomonas aeruginosa
9,Drosophila melanogaster,7227,437,7227,Drosophila melanogaster,melanogaster subgroup,Eukaryota,Metazoa,Arthropoda,Hexapoda,...,,Diptera,Brachycera,Muscomorpha,,Ephydroidea,Drosophilidae,Drosophilinae,Drosophila,Drosophila melanogaster


In [9]:
# Sunburst of entry counts by lineage, from domain (innermost ring) out to species.
# Wedge size and wedge colour both encode `Count`. `color` has to be passed
# explicitly: plotly only applies `color_continuous_scale` to the column named
# by `color`, so without it the requested scale is silently ignored.
px.sunburst(
    merged_df,
    path=['domain', 'kingdom', 'phylum', 'class', 'order', 'species'],
    values='Count',
    color='Count',
    title="Sunburst plot of species distribution",
    color_continuous_scale='orrd',
    height=800,
    width=800
)