# Sunburst plot for lineage of organisms in dataset

In [1]:
import os
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Directory holding the sunburst input tables. Set the SUNBURST_DATA_DIR
# environment variable to point the notebook at another copy of the data;
# when it is unset, the original cluster location below is used.
path = os.environ.get(
    "SUNBURST_DATA_DIR",
    "/nfs/scratch/pinder/negative_dataset/my_repository/datasets/no_duplicates/deleak_cdhit/sunburst_data",
)
# Tab-separated table of per-species counts (read into count_df).
count_path = os.path.join(path, "uniprot_species_distribution.tsv.new")
# Tab-separated table of per-tax-id lineage ranks (read into lineage_df).
lineage_path = os.path.join(path, "uniprot_lineage.tsv")

In [3]:
# Load the tab-separated per-species counts table and preview its first rows.
count_df = pd.read_table(count_path)
count_df.head(5)

Unnamed: 0,Species,Tax ID,Count,Total Count
0,Homo sapiens,9606,3872,51076
1,Saccharomyces cerevisiae,559292,1450,26367
2,Escherichia coli,83333,1092,8547
3,Mus musculus,10090,999,8672
4,Trypanosoma brucei,185431,437,3602


In [4]:
# Load the tab-separated lineage table (one column per taxonomic rank) and
# preview its first rows.
lineage_df = pd.read_table(lineage_path)
lineage_df.head(5)

Unnamed: 0,tax_id,scientific_name,no rank,domain,kingdom,phylum,subphylum,superclass,class,superorder,order,suborder,infraorder,parvorder,superfamily,family,subfamily,genus,species
0,9606,Homo sapiens,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,Sarcopterygii,Mammalia,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea,Hominidae,Homininae,Homo,Homo sapiens
1,559292,Saccharomyces cerevisiae (strain ATCC 204508 /...,cellular organisms,Eukaryota,Fungi,Ascomycota,Saccharomycotina,,Saccharomycetes,,Saccharomycetales,,,,,Saccharomycetaceae,,Saccharomyces,Saccharomyces cerevisiae
2,83333,Escherichia coli (strain K12),cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,,,Gammaproteobacteria,,Enterobacterales,,,,,Enterobacteriaceae,,Escherichia,Escherichia coli
3,10090,Mus musculus,cellular organisms,Eukaryota,Metazoa,Chordata,Craniata,Sarcopterygii,Mammalia,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Mus,Mus musculus
4,185431,Trypanosoma brucei brucei (strain 927/4 GUTat1...,cellular organisms,Eukaryota,,Euglenozoa,,,Kinetoplastea,,Trypanosomatida,,,,,Trypanosomatidae,,Trypanosoma,Trypanosoma brucei


In [5]:
# Attach each species' lineage to its counts by matching the taxonomy id
# ("Tax ID" in the counts table, "tax_id" in the lineage table).
# how="inner" is the pandas default: only ids present in both tables are kept.
merged_df = count_df.merge(
    lineage_df,
    how="inner",
    left_on="Tax ID",
    right_on="tax_id",
)
merged_df.head(5)

Unnamed: 0,Species,Tax ID,Count,Total Count,tax_id,scientific_name,no rank,domain,kingdom,phylum,...,superorder,order,suborder,infraorder,parvorder,superfamily,family,subfamily,genus,species
0,Homo sapiens,9606,3872,51076,9606,Homo sapiens,cellular organisms,Eukaryota,Metazoa,Chordata,...,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea,Hominidae,Homininae,Homo,Homo sapiens
1,Saccharomyces cerevisiae,559292,1450,26367,559292,Saccharomyces cerevisiae (strain ATCC 204508 /...,cellular organisms,Eukaryota,Fungi,Ascomycota,...,,Saccharomycetales,,,,,Saccharomycetaceae,,Saccharomyces,Saccharomyces cerevisiae
2,Escherichia coli,83333,1092,8547,83333,Escherichia coli (strain K12),cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,...,,Enterobacterales,,,,,Enterobacteriaceae,,Escherichia,Escherichia coli
3,Mus musculus,10090,999,8672,10090,Mus musculus,cellular organisms,Eukaryota,Metazoa,Chordata,...,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Mus,Mus musculus
4,Trypanosoma brucei,185431,437,3602,185431,Trypanosoma brucei brucei (strain 927/4 GUTat1...,cellular organisms,Eukaryota,,Euglenozoa,...,,Trypanosomatida,,,,,Trypanosomatidae,,Trypanosoma,Trypanosoma brucei


In [6]:
# Report rows where the species name from the counts table ("Species") and the
# one from the lineage table ("species") differ, ignoring letter case.
for counts_name, lineage_name in zip(merged_df["Species"], merged_df["species"]):
    if str(counts_name).lower() == str(lineage_name).lower():
        continue  # names agree; nothing to report
    print(f"Mismatch: {counts_name} vs {lineage_name}")

Mismatch: Chaetomium thermophilum vs Thermochaetoides thermophila
Mismatch: Influenza A vs Alphainfluenzavirus influenzae
Mismatch: Salmonella typhimurium vs Salmonella enterica
Mismatch: Human immunodeficiency vs Lentivirus humimdef1
Mismatch: Escherichia phage vs Peduovirus P2
Mismatch: Enterobacteria phage vs Enterobacteria phage M13
Mismatch: Synechocystis sp. vs Synechocystis sp. PCC 6803
Mismatch: Pseudomonas phage vs Casadabanvirus DMS3
Mismatch: Vaccinia virus vs Orthopoxvirus vaccinia
Mismatch: Human herpesvirus vs Rhadinovirus humangamma8
Mismatch: Mycoplasma pneumoniae vs Mycoplasmoides pneumoniae
Mismatch: Nostoc sp. vs Nostoc sp. (strain PCC 7120 / SAG 25.82 / UTEX 2576)
Mismatch: Foot-and-mouth disease vs Foot-and-mouth disease virus
Mismatch: Human adenovirus vs Human mastadenovirus C
Mismatch: Human cytomegalovirus vs Cytomegalovirus humanbeta5
Mismatch: Bacillus phage vs Salasvirus phi29
Mismatch: Human enterovirus vs Enterovirus A
Mismatch: Pseudomonas fluorescens vs 

In [7]:
# Clean lineage by removing associations following NA values of higher ranks
ranks = ['no rank', 'domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']


def truncate_lineage_at_first_gap(df, rank_columns):
    """Return a copy of ``df`` whose lineage stops at the first missing rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``rank_columns``.
    rank_columns : list of str
        Rank columns ordered from the highest rank to the lowest.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which, for every row, the first missing rank and all
        ranks listed after it are missing. Columns outside ``rank_columns``
        are returned unchanged.

    Notes
    -----
    Empty and whitespace-only strings count as missing. Because the scan
    starts at ``rank_columns[0]``, a row missing that first rank loses its
    whole lineage.
    """
    # Convert blanks (including whitespace-only names) to NA so the hierarchy
    # stops cleanly.
    lineage = df[rank_columns].replace(r"^\s*$", pd.NA, regex=True)

    # Running flag per row: True from the first missing rank onwards.
    gap_seen = lineage.isna().astype(int).cummax(axis=1).astype(bool)

    cleaned = df.copy()
    # mask() without `other` fills with the dtype-appropriate missing value.
    cleaned[rank_columns] = lineage.mask(gap_seen)
    return cleaned


merged_df = truncate_lineage_at_first_gap(merged_df, ranks)

merged_df.head(n=20)


Unnamed: 0,Species,Tax ID,Count,Total Count,tax_id,scientific_name,no rank,domain,kingdom,phylum,...,superorder,order,suborder,infraorder,parvorder,superfamily,family,subfamily,genus,species
0,Homo sapiens,9606,3872,51076,9606,Homo sapiens,cellular organisms,Eukaryota,Metazoa,Chordata,...,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea,Hominidae,Homininae,Homo,Homo sapiens
1,Saccharomyces cerevisiae,559292,1450,26367,559292,Saccharomyces cerevisiae (strain ATCC 204508 /...,cellular organisms,Eukaryota,Fungi,Ascomycota,...,,Saccharomycetales,,,,,Saccharomycetaceae,,Saccharomyces,Saccharomyces cerevisiae
2,Escherichia coli,83333,1092,8547,83333,Escherichia coli (strain K12),cellular organisms,Bacteria,Pseudomonadati,Pseudomonadota,...,,Enterobacterales,,,,,Enterobacteriaceae,,Escherichia,Escherichia coli
3,Mus musculus,10090,999,8672,10090,Mus musculus,cellular organisms,Eukaryota,Metazoa,Chordata,...,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Mus,Mus musculus
4,Trypanosoma brucei,185431,437,3602,185431,Trypanosoma brucei brucei (strain 927/4 GUTat1...,cellular organisms,Eukaryota,,,...,,,,,,,,,,
5,Tetrahymena thermophila,312017,428,4421,312017,Tetrahymena thermophila (strain SB210),cellular organisms,Eukaryota,,,...,,,Tetrahymenina,,,,,,,
6,Arabidopsis thaliana,3702,420,2747,3702,Arabidopsis thaliana,cellular organisms,Eukaryota,Viridiplantae,Streptophyta,...,,Brassicales,,,,,Brassicaceae,,Arabidopsis,Arabidopsis thaliana
7,Pseudomonas aeruginosa,208964,377,1455,208964,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Pseudomonas aeruginosa group,Bacteria,Pseudomonadati,Pseudomonadota,...,,Pseudomonadales,,,,,Pseudomonadaceae,,Pseudomonas,Pseudomonas aeruginosa
8,Bos taurus,9913,349,8274,9913,Bos taurus,cellular organisms,Eukaryota,Metazoa,Chordata,...,Laurasiatheria,Artiodactyla,Ruminantia,Pecora,,,Bovidae,Bovinae,Bos,Bos taurus
9,Rattus norvegicus,10116,321,4181,10116,Rattus norvegicus,cellular organisms,Eukaryota,Metazoa,Chordata,...,Euarchontoglires,Rodentia,Myomorpha,,,,Muridae,Murinae,Rattus,Rattus norvegicus


In [8]:
# Sunburst of the lineage hierarchy, sector sizes taken from the "Count" column.
# Rings are listed from the centre outwards.
count_ring_levels = ['domain', 'kingdom', 'phylum', 'class', 'order', 'species']
count_figure_options = dict(
    title="Sunburst plot of species distribution",
    # NOTE(review): no `color=` column is passed, so this continuous scale
    # appears to have no visible effect - confirm and either add `color=` or drop it.
    color_continuous_scale='orrd',
    height=800,
    width=800,
)
px.sunburst(
    data_frame=merged_df,
    path=count_ring_levels,
    values='Count',
    **count_figure_options,
)

In [9]:
# Sunburst of the lineage hierarchy, sector sizes taken from the "Total Count"
# column. Rings are listed from the centre outwards.
# NOTE(review): the title is identical to the "Count" sunburst; consider naming
# the plotted column in the title so the two figures can be told apart.
total_ring_levels = ['domain', 'kingdom', 'phylum', 'class', 'order', 'species']
total_figure_options = dict(
    title="Sunburst plot of species distribution",
    # NOTE(review): no `color=` column is passed, so this continuous scale
    # appears to have no visible effect - confirm and either add `color=` or drop it.
    color_continuous_scale='orrd',
    height=800,
    width=800,
)
px.sunburst(
    data_frame=merged_df,
    path=total_ring_levels,
    values='Total Count',
    **total_figure_options,
)