In [6]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import grequests
import urllib.parse
from tqdm import tqdm
import os 
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [11]:
repository_path = '/home/user/Documents/Audrey/gitrepo/ornamental/clematis_dbgi/pos/results/visualisation'
canopus_table_path = '/home/user/Documents/Audrey/large files/clematis_dbgi/clematis_10_species_231128_qtof/canopus_formula_summary_adducts.tsv' 
annotations_table_path = '/home/user/Documents/Audrey/large files/clematis_dbgi/clematis_10_species_231128_qtof/compound_identifications_adducts.tsv'

min_ZodiacScore = 0.9             #cut-off filter for considering a sirius annotation valable. It is used in combination with min_ConfidenceScore.
min_ConfidenceScore= 0.0         #cut-off filter for considering a sirius annotation valable. '0.0' as default.
min_class_confidence = 0.8       #cut-off filter for considering a sirius class valable. It is used in combination with min_recurrence.


### Fonction sunburst def

In [12]:
def sunburst_count_plotter(repository_path, canopus_df, organism):
            
    canopus_df = canopus_df[['row ID','NPC#pathway', 'NPC#superclass', 'NPC#class']]
    canopus_df = canopus_df.replace({np.nan:'None'})

    #generate treemap        
    fig1 = px.treemap(canopus_df, path=['NPC#pathway', 'NPC#superclass', 'NPC#class'],
                    color='NPC#pathway',
                    color_discrete_map={
                        'Terpenoids':'#AA4444',
                        'Alkaloids': '#4477AA',
                        'Amino acids and Peptides': '#DDCC77',
                        'Polyketides': '#AAA744',
                        'Shikimates and Phenylpropanoids': 'CF63A5',
                        'Fatty acids': '#882255',
                        'Carbohydrates': '#F4A261',})
    fig1.update_layout(margin = dict(t=50, l=25, r=25, b=25),
    title_text=" ("  +  organism + ") " + "- metabolite annotation overview (size proportional to number of annotations)")
    fig1.update_annotations(font_size=18, font_family="sans-serif")
    

    #generate sunburst

    fig2 = px.sunburst(canopus_df, path=['NPC#pathway', 'NPC#superclass', 'NPC#class'],
                    color='NPC#pathway',
                    color_discrete_map={
                        'Terpenoids':'#AA4444',
                        'Alkaloids': '#4477AA',
                        'Amino acids and Peptides': '#DDCC77',
                        'Polyketides': '#AAA744',
                        'Shikimates and Phenylpropanoids': 'CF63A5',
                        'Fatty acids': '#882255',
                        'Carbohydrates': '#F4A261',})
    fig2.update_layout(margin = dict(t=50, l=25, r=25, b=25),
    title_text= " ("  +  organism + ") " + "- metabolite annotation overview (size proportional to number of annotations)")
    fig2.update_annotations(font_size=18, font_family="sans-serif")
    
    path = os.path.normpath(repository_path)
    pathout = os.path.join(path, 'results/')
    os.makedirs(pathout, exist_ok=True)
    pathout_treemap = os.path.join(pathout, 'clematis_10_species_crop_treemap_pos_qtof.html')
    pathout_sunburst = os.path.join(pathout, 'clematis_10_species_crop_sunburst_pos_qtof.html')
    fig1.write_html(pathout_treemap)
    fig2.write_html(pathout_sunburst)

### Canopus

In [13]:
canopus_df = pd.read_csv(canopus_table_path, sep='\t')
canopus_df = canopus_df[['id', 'molecularFormula', 'adduct', 'NPC#pathway',
       'NPC#pathway Probability', 'NPC#superclass',
       'NPC#superclass Probability', 'NPC#class', 'NPC#class Probability']]
canopus_df.rename(columns={'NPC#class Probability': 'classProbability'}, inplace=True) 
canopus_df['shared name'] = canopus_df['id'].str.split('_').str[-1].astype(int)
canopus_df.drop('id', axis=1, inplace=True)
canopus_df.rename(columns={'shared name': 'row ID', 'adduct': 'adduct (sirius)', 'molecularFormula': 'MF (sirius)', 'name': 'Compound name (sirius)'}, inplace=True) 
canopus_df.drop(canopus_df[canopus_df.classProbability > min_class_confidence].index, inplace=True)
canopus_df.drop(['classProbability', 'NPC#superclass Probability', 'NPC#pathway Probability'], axis=1, inplace=True)

#aggregate features 
agg_func = {'adduct (sirius)': set, 'MF (sirius)': 'first', 'NPC#pathway': 'first', 'NPC#superclass': 'first', 'NPC#class': 'first'}
canopus_df = canopus_df.groupby('row ID', as_index=False).agg(agg_func)


In [14]:
#total number of unique molecular formulas
canopus_df.shape[0]

364

In [15]:
dfg = canopus_df.groupby(['NPC#pathway']).count()
dfg


Unnamed: 0_level_0,row ID,adduct (sirius),MF (sirius),NPC#superclass,NPC#class
NPC#pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alkaloids,105,105,105,105,105
Amino acids and Peptides,47,47,47,47,47
Carbohydrates,15,15,15,15,15
Fatty acids,44,44,44,44,44
Polyketides,19,19,19,19,19
Shikimates and Phenylpropanoids,62,62,62,62,62
Terpenoids,72,72,72,72,72


In [16]:
sample_dir='Set Clematis 10 species'
organism='Q-tof positive ionization mode'
sunburst_count_plotter(repository_path, canopus_df, organism)
