# Introduction

To easily find the parameters on the jupyter notebook on the header bar select view -> cell toolbar -> tags. all the paramters will have the tag "parameter".

# Download GNPS Data

In [1]:
# importing necessary modules
import networkx as nx
from gnpsdata import taskresult
import os
from gnpsdata import workflow_fbmn
import pandas as pd
from qiime2 import Visualization
import pandas as pd
import numpy as np
import os
import itertools
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance
from sklearn.decomposition import PCA
import scipy.stats as stats
import pingouin as pg
import skbio # Don't import on Windows!!
from ipyfilechooser import FileChooser
from ipywidgets import interact
from pynmranalysis.normalization import PQN_normalization
import warnings

In [2]:
print("X")

X


In [3]:
# GNPS task id
task = "cf6e14abf5604f47b28b467a513d3532"

In [4]:
# Downloading raw data from GNPS
def download_graphml(task, output_file):
    taskresult.download_task_resultfile(task, "gnps_molecular_network_graphml/", output_file)

def get_graphml_network(task):
    taskresult.download_task_resultfile(task, "gnps_molecular_network_graphml/", "temp.graphml")

    G = nx.read_graphml("temp.graphml")

    return G

def download_quantification(task, output_file):
    taskresult.download_task_resultfile(task, "quantification_table/", output_file)

def download_metadata(task, output_file):
    taskresult.download_task_resultfile(task, "metadata_merged/", output_file)

def download_mgf(task, output_file):
    taskresult.download_task_resultfile(task, "spectra_reformatted/", output_file)
    
# Qiime2 Data
def download_qiime2(task, output_file):
    taskresult.download_task_resultfile(task, "qiime2_output/qiime2_table.qza", output_file)

def download_qiime2_manifest(task, output_file):
    taskresult.download_task_resultfile(task, "qiime2_output/qiime2_manifest.tsv", output_file)

def download_qiime2_metadata(task, output_file):
    taskresult.download_task_resultfile(task, "qiime2_output/qiime2_metadata.tsv", output_file)

In [6]:
# Download quantification and manifest
os.makedirs("./QIIME2/output_QIIME2_Notebook", exist_ok=True)
download_quantification(task, "./QIIME2/output_QIIME2_Notebook/quant.csv")
download_qiime2_manifest(task, "./QIIME2/output_QIIME2_Notebook/manifest.csv")
# Downloading metadata
workflow_fbmn.download_metadata(task, "./QIIME2/output_QIIME2_Notebook/unprocessed_metadata.tsv")

# Changing Metadata and Manifest Column name

In [8]:
#read metadata file
metadata = pd.read_csv("./QIIME2/output_QIIME2_Notebook/unprocessed_metadata.tsv", sep = "\t", index_col=False)
#rename 1st column to "#sample id
metadata = metadata.rename(columns={"filename":"sample id"})
#convert back to .tsv
metadata.to_csv('./QIIME2/output_QIIME2_Notebook/metadata.tsv', sep="\t", index=False)

In [9]:
# Disable warnings for cleaner output, comment out for debugging
warnings.filterwarnings('ignore')

# Blank Removal

In [11]:
# When cutoff is low, more noise (or background) detected; With higher cutoff, less background detected, thus more features observed
cutoff = 0.1

condition = 1

In [12]:
# Get folder with data files
result_dir = "./QIIME2/output_QIIME2_Notebook/"
#Read quant.csv and metadata .tsv
ft = pd.read_csv("./QIIME2/output_QIIME2_Notebook/quant.csv")
md = pd.read_csv("./QIIME2/output_QIIME2_Notebook/metadata.tsv", sep = "\t").set_index("sample id")



def inside_levels(df):
    # get all the columns (equals all attributes) -> will be number of rows
    levels = []
    types = []
    count = []
    for col in df.columns:
        types.append(type(df[col][0]))
        levels.append(sorted(set(df[col].dropna())))
        tmp = df[col].value_counts()
        count.append([tmp[levels[-1][i]] for i in range(len(levels[-1]))])
    return pd.DataFrame({"ATTRIBUTES": df.columns, "LEVELS": levels, "COUNT":count, "TYPES": types}, index=range(1, len(levels)+1))
new_md = md.copy() #storing the files under different names to preserve the original files
# remove the (front & tail) spaces, if any present, from the rownames of md
new_md.index = [name.strip() for name in md.index]
# for each col in new_md
# 1) removing the spaces (if any)
# 2) replace the spaces (in the middle) to underscore
# 3) converting them all to UPPERCASE
for col in new_md.columns:
    if new_md[col].dtype == str:
        new_md[col] = [item.strip().replace(" ", "_").upper() for item in new_md[col]]

new_ft = ft.copy() #storing the files under different names to preserve the original files
# changing the index in feature table to contain m/z and RT information
new_ft.index = [f"{id}_{round(mz, 3)}_{round(rt, 3)}" for id, mz, rt in zip(ft["row ID"], ft["row m/z"], ft["row retention time"])]
new_ft.index.name = "CustomIndex"
# drop all columns that are not mzML or mzXML file names
new_ft.drop(columns=[col for col in new_ft.columns if ".mz" not in col], inplace=True)
# remove " Peak area" from column names
new_ft.rename(columns={col: col.replace(" Peak area", "").strip() for col in new_ft.columns}, inplace=True)

if sorted(new_ft.columns) != sorted(new_md.index):
    # print the md rows / ft column which are not in ft columns / md rows and remove them
    ft_cols_not_in_md = [col for col in new_ft.columns if col not in new_md.index]
    new_ft.drop(columns=ft_cols_not_in_md, inplace=True)
    md_rows_not_in_ft = [row for row in new_md.index if row not in new_ft.columns]
    new_md.drop(md_rows_not_in_ft, inplace=True)

new_ft = new_ft.reindex(sorted(new_ft.columns), axis=1) #ordering the ft by its column names
new_md.sort_index(inplace=True) #ordering the md by its row names
list(new_ft.columns) == list(new_md.index)
data = new_md
df = pd.DataFrame({"LEVELS": inside_levels(data).iloc[condition-1]["LEVELS"]})
df.index = [*range(1, len(df)+1)]

display(df)

Unnamed: 0,LEVELS
1,Blank
2,Sample


In [13]:
#Among the shown levels of an attribute, select the one to remove
blank_id = 1

In [14]:

#Splitting the data into blanks and samples based on the metadata
md_blank = data[data[inside_levels(data)['ATTRIBUTES'][condition]] == df['LEVELS'][blank_id]]
blank = new_ft[list(md_blank.index)]
md_samples = data[data[inside_levels(data)['ATTRIBUTES'][condition]] != df['LEVELS'][blank_id]]
samples = new_ft[list(md_samples.index)]

blank_removal = samples.copy()

# Getting mean for every feature in blank and Samples
avg_blank = blank.mean(axis=1, skipna=False) # set skipna = False do not exclude NA/null values when computing the result.
avg_samples = samples.mean(axis=1, skipna=False)

# Getting the ratio of blank vs samples
ratio_blank_samples = (avg_blank+1)/(avg_samples+1)

# Create an array with boolean values: True (is a real feature, ratio<cutoff) / False (is a blank, background, noise feature, ratio>cutoff)
is_real_feature = (ratio_blank_samples<cutoff)
blank_removal = samples[is_real_feature.values]
imputation_samples = blank_removal.copy()

# save to file
entry_id = []
entry_mz = []
entry_time = []
for entryCol in blank_removal.index:
    entry = entryCol.split("_")
    entry_id.append(entry[0])
    entry_mz.append(entry[1])
    entry_time.append(entry[2])
blank_removal.insert(0,"#OTU ID",entry_id,True)
blank_removal.to_csv(os.path.join(result_dir, "Blanks_Removed.tsv"), sep = "\t", index = False)

# Imputation

In [15]:
# get the lowest intensity (that is not zero) as a cutoff LOD value
cutoff_LOD = round(imputation_samples.replace(0, np.nan).min(numeric_only=True).min())

imputation_samples = imputation_samples.apply(lambda x: [np.random.randint(1, cutoff_LOD) if v == 0 else v for v in x])
imputed = imputation_samples.copy()

entry_id = []
entry_mz = []
entry_time = []
for entryCol in imputed.index:
    entry = entryCol.split("_")
    entry_id.append(entry[0])
    entry_mz.append(entry[1])
    entry_time.append(entry[2])
imputed.insert(0,"#OTU ID",entry_id,True)
# save to file
imputed.to_csv(os.path.join(result_dir, "Imputed_QuantTable.tsv"), sep = "\t", index = False)

# Normalization

In [16]:
# Set normalization_method to 1 for sample centric normalization or 2 for Probabilistic Quotient Normalization
normalization_method = 1

In [17]:
normalized = imputation_samples.copy()
# Dividing each element of a particular column with its column sum
if normalization_method == 1:
    normalized = normalized.apply(lambda x: x/np.sum(x), axis=0)
else:
    normalized = PQN_normalization(normalized ,ref_norm = "median" , verbose=False) 
normalized_samples = normalized.copy()
entry_id = []
entry_mz = []
entry_time = []
for entryCol in normalized_samples.index:
    entry = entryCol.split("_")
    entry_id.append(entry[0])
    entry_mz.append(entry[1])
    entry_time.append(entry[2])
normalized_samples.insert(0,"#OTU ID",entry_id,True)
normalized_samples.to_csv(os.path.join(result_dir, "Normalised_Quant_table.tsv"), sep = "\t", index = False)

# Scaling

In [18]:
# transposing the imputed table before scaling
transposed = imputation_samples.T
# put the rows in the feature table and metadata in the same order
transposed.sort_index(inplace=True)
md_samples.sort_index(inplace=True)

if (md_samples.index == transposed.index).all():
    pass
else:
    print("WARNING: Sample names in feature and metadata table are NOT the same!")
transposed.to_csv(os.path.join(result_dir, "Imputed_QuantTable_transposed.csv"))

# scale filtered data
scaled = pd.DataFrame(StandardScaler().fit_transform(transposed), index=transposed.index, columns=transposed.columns)
scaled = scaled.T
entry_id = []
entry_mz = []
entry_time = []
for entryCol in scaled.index:
    entry = entryCol.split("_")
    entry_id.append(entry[0])
    entry_mz.append(entry[1])
    entry_time.append(entry[2])
scaled.insert(0,"#OTU ID",entry_id,True)
scaled.to_csv(os.path.join(result_dir, "Imputed_Scaled_QuantTable.tsv"), sep = "\t", index = False)


# Import Into Qiime2
## Convert .tsv to .biom


In [19]:
! biom convert \
  -i ./QIIME2/output_QIIME2_Notebook/Normalised_Quant_table.tsv \
  -o ./QIIME2/output_QIIME2_Notebook/quant.biom --to-hdf5

In [20]:
! qiime tools import \
  --input-path ./QIIME2/output_QIIME2_Notebook/quant.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path ./QIIME2/output_QIIME2_Notebook/qiime_table.qza

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
[32mImported ./QIIME2/output_QIIME2_Notebook/quant.biom as BIOMV210Format to ./QIIME2/output_QIIME2_Notebook/qiime_table.qza[0m
[0m

# Merging Metadata and Normalized Data 

In [21]:
transposed_scaled = scaled.transpose()

Data = pd.merge(md_samples, transposed_scaled, left_index=True, right_index=True, how="inner")
Data.index.name = 'sample_name'
Data.to_csv(os.path.join(result_dir, "merged_metadata.tsv"), sep = "\t", index = True)

# Longitudinal ANOVA

In [22]:
p_formula = 'ATTRIBUTE_Year~ATTRIBUTE_Sample_Area+ATTRIBUTE_Latitude'

In [23]:
! qiime longitudinal anova \
  --m-metadata-file ./QIIME2/output_QIIME2_Notebook/metadata.tsv \
  --p-formula $p_formula \
  --p-sstype 'I' \
  --o-visualization ./QIIME2/output_QIIME2_Notebook/metadata.qzv

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
[31m[1mPlugin error from longitudinal:

  Value must be a nonnegative integer or None

Debug info has been saved to /tmp/qiime2-q2cli-err-lz2f6k91.log[0m
[0m

In [24]:
Visualization.load('./QIIME2/output_QIIME2_Notebook/metadata.qzv')

ValueError: QIIME2/output_QIIME2_Notebook/metadata.qzv does not exist.

# Distance Matrix

In [25]:
p_metric = 'canberra_adkins'

In [26]:
! qiime diversity beta \
  --i-table ./QIIME2/output_QIIME2_Notebook/qiime_table.qza \
  --p-metric $p_metric \
  --o-distance-matrix ./QIIME2/output_QIIME2_Notebook/distance_matrix.qza

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
[32mSaved DistanceMatrix to: ./QIIME2/output_QIIME2_Notebook/distance_matrix.qza[0m
[0m

# Principal Coordinate Analysis (PCoA)

In [27]:
! qiime diversity pcoa \
  --i-distance-matrix ./QIIME2/output_QIIME2_Notebook/distance_matrix.qza \
  --o-pcoa ./QIIME2/output_QIIME2_Notebook/pcoa.qza

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
[32mSaved PCoAResults to: ./QIIME2/output_QIIME2_Notebook/pcoa.qza[0m
[0m

# Emperor plot

In [28]:
! qiime emperor plot \
  --i-pcoa ./QIIME2/output_QIIME2_Notebook/pcoa.qza \
  --m-metadata-file ./QIIME2/output_QIIME2_Notebook/metadata.tsv \
  --o-visualization ./QIIME2/output_QIIME2_Notebook/emperor_plot 


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
[32mSaved Visualization to: ./QIIME2/output_QIIME2_Notebook/emperor_plot.qzv[0m
[0m

# Visualization

In [29]:
Visualization.load('./QIIME2/output_QIIME2_Notebook/emperor_plot.qzv')

# Classifier Data/Heat Map

In [30]:
metadata_column = 'ATTRIBUTE_Sample_Area'
estimator = 'RandomForestClassifier'
n_estimators = 500
random_state = 123


In [None]:
! qiime sample-classifier classify-samples \
  --i-table ./QIIME2/output_QIIME2_Notebook/qiime_table.qza \
  --m-metadata-file ./QIIME2/output_QIIME2_Notebook/metadata.tsv \
  --m-metadata-column $metadata_column \
  --p-optimize-feature-selection \
  --p-parameter-tuning \
  --p-estimator $estimator \
  --p-n-estimators $n_estimators \
  --p-random-state $random_state \
  --o-accuracy-results ./QIIME2/output_QIIME2_Notebook/accuracy_results.qzv \
  --o-feature-importance ./QIIME2/output_QIIME2_Notebook/feature_importance.qza \
  --o-heatmap ./QIIME2/output_QIIME2_Notebook/heatmap.qzv \
  --o-model-summary ./QIIME2/output_QIIME2_Notebook/model_summary.qzv \
  --o-predictions ./QIIME2/output_QIIME2_Notebook/predictions.qza \
  --o-probabilities ./QIIME2/output_QIIME2_Notebook/probabilities.qza \
  --o-sample-estimator ./QIIME2/output_QIIME2_Notebook/sample_estimator.qza \
  --o-test-targets ./QIIME2/output_QIIME2_Notebook/test_targets.qza \
  --o-training-targets ./QIIME2/output_QIIME2_Notebook/training_targets.qza 


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


# Visualization

In [None]:
Visualization.load('./QIIME2/output_QIIME2_Notebook/heatmap.qzv')

# PermANOVA

In [None]:
metadata_column_permanova = 'ATTRIBUTE_Sample_Area'

In [None]:
! qiime diversity beta-group-significance \
  --i-distance-matrix ./QIIME2/output_QIIME2_Notebook/distance_matrix.qza \
  --m-metadata-file ./output_QIIME2_Notebook/metadata.tsv \
  --m-metadata-column $metadata_column_permanova \
  --o-visualization ./QIIME2/output_QIIME2_Notebook/permanova.qzv

# Visualization

In [None]:
Visualization.load('./QIIME2/output_QIIME2_Notebook/permanova.qzv')