# In [None]:
# FUNGuild annotation and visualization pipeline
# Author: Lucie Frejlichová


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import os

# === 1. Download FUNGuild database ===
# The JSON array is read from the 7th line of the response body, after the
# <body> / </body> tags that may surround it on that line are removed.
url = "http://www.stbates.org/funguild_db_2.php"
# A timeout keeps the run from hanging on an unresponsive server, and
# raise_for_status() reports HTTP failures instead of a confusing JSON error.
response = requests.get(url, timeout=60)
response.raise_for_status()
page_lines = response.content.decode("utf-8").split("\n")
if len(page_lines) <= 6:
    raise ValueError("Unexpected FUNGuild response: the JSON payload line is missing.")
data = page_lines[6].strip()
# Remove the tags as whole tokens. str.strip("</body>") treats its argument as
# a set of characters, so it only worked as long as the payload itself did not
# start or end with one of those characters.
if data.startswith("<body>"):
    data = data[len("<body>"):]
if data.endswith("</body>"):
    data = data[:-len("</body>")]
funguild_db = json.loads(data)

# === 2. Create dictionary of genera and their trophic modes ===
# Keys are the whitespace-stripped "taxon" values. If a taxon appears in more
# than one record, the last record wins. Missing fields fall back to
# "Unassigned" (trophic mode) and "Unknown" (citation).
funguild_dict = {
    entry["taxon"].strip(): {
        "Trophic Mode": entry.get("trophicMode", "Unassigned"),
        "Citation": entry.get("citationSource", "Unknown"),
    }
    for entry in funguild_db
}

# === 3. Load Genus-level filtered input file ===
# Replace with actual filename later:
input_path = "data/TO_BE_FILLED_Funguild_subset.csv"
# The table is tab-separated and every column is read as text.
df = pd.read_csv(input_path, dtype=str, sep="\t")

# === 4. Clean taxonomy column ===
# Missing cells and cells that are empty or whitespace-only all become
# "unidentified".
taxonomy_filled = df["taxonomy"].fillna("unidentified")
df["taxonomy"] = taxonomy_filled.replace(r'^\s*$', 'unidentified', regex=True)

# === 5. Assign Genus, Trophic Mode, and Citation ===
def get_taxon_info(taxon):
    """Look up the FUNGuild annotation for one taxonomy string.

    The genus is the text before the first comma with surrounding whitespace
    removed. A genus equal to "unidentified" (in any letter case) is reported
    as "unidentified"; it and any genus absent from ``funguild_dict`` get the
    trophic mode "Unassigned" and the citation "Unknown".

    Returns a dict with the keys "Genus", "Trophic Mode" and "Citation".
    """
    genus, _, _ = taxon.partition(",")
    genus = genus.strip()

    trophic_mode, citation = "Unassigned", "Unknown"
    if genus.lower() == "unidentified":
        genus = "unidentified"
    elif genus in funguild_dict:
        entry = funguild_dict[genus]
        trophic_mode, citation = entry["Trophic Mode"], entry["Citation"]

    return {"Genus": genus, "Trophic Mode": trophic_mode, "Citation": citation}

# Annotate every taxonomy string; each row yields one result dict.
records = df["taxonomy"].map(get_taxon_info)
# Expand the dicts into Genus / Trophic Mode / Citation columns and save them
# as a comma-separated file without the index.
result_df = pd.DataFrame(list(records))
output_path = "data/TO_BE_FILLED_fungi_funguild_trophicmode_subset.csv"
result_df.to_csv(output_path, sep=",", index=False)

# === 6. Merge trophic annotations with OTU abundance table ===
trophic_path = "data/TO_BE_FILLED_fungi_funguild_trophicmode_subset.csv"
abundance_path = "data/TO_BE_FILLED_Funguild_OTU_subset.csv"

trophic_df = pd.read_csv(trophic_path, sep=",", encoding="utf-8")
abundance_df = pd.read_csv(abundance_path, sep=";", encoding="utf-8")

# The annotation file carries no OTU column of its own: its rows are paired
# with the abundance table purely by position, so both must have equal length.
# NOTE(review): this also assumes both files list the OTUs in the same order.
if len(trophic_df) != len(abundance_df):
    raise ValueError("Mismatch in number of OTUs between trophic and abundance files!")

trophic_df["OTU"] = abundance_df["OTU"].values
# validate="one_to_one" makes pandas raise a MergeError when an OTU identifier
# is repeated. Without it the key-based merge would silently multiply those
# rows and inflate every abundance computed from the merged table.
merged_df = pd.merge(trophic_df, abundance_df, on="OTU", how="right", validate="one_to_one")
merged_output = "data/TO_BE_FILLED_fungi_funguild_annotated_subset.csv"
merged_df.to_csv(merged_output, index=False, sep=";")

# === 7. Calculate relative abundance per sample ===
# Every column after the first four is treated as a per-sample count column.
abundance_cols = merged_df.columns[4:]
numeric_counts = merged_df[abundance_cols].apply(pd.to_numeric, errors='coerce')
merged_df[abundance_cols] = numeric_counts.fillna(0)  # unparsable cells count as 0

# Express each sample column as a percentage of that column's total.
df_rel = merged_df.copy()
for sample in abundance_cols:
    sample_total = merged_df[sample].sum()
    if sample_total > 0:
        df_rel[sample] = merged_df[sample] / sample_total * 100
    else:
        # A sample without any counts is stored as a constant 0 column.
        df_rel[sample] = 0

# Add up the percentages of all rows sharing a trophic mode and save the table.
df_grouped = df_rel.groupby("Trophic Mode")[abundance_cols].sum().reset_index()
rel_output = "data/TO_BE_FILLED_fungi_FUNguild_relativeabundace.csv"
df_grouped.to_csv(rel_output, sep=";", index=False)

# === 8. Visualization ===
df_plot = pd.read_csv(rel_output, sep=";")

# Sample code -> (location, part).
location_mapping = {
    'N1B': ('N1', 'Base'),
    'N1S': ('N1', 'Stem'),
    'N2B': ('N2', 'Base'),
    'N2S': ('N2', 'Stem'),
    'N3B': ('N3', 'Base'),
    'N3S': ('N3', 'Stem'),
    'N4B': ('N4', 'Base'),
    'N4S': ('N4', 'Stem'),
    'N5B': ('N5', 'Base'),
    'N5S': ('N5', 'Stem'),
    'N6B': ('N6', 'Base'),
    'N6S': ('N6', 'Stem'),
    'N7B': ('N7', 'Base'),
    'N7S': ('N7', 'Stem'),
}

# Long format: one row per (trophic mode, sample code) pair.
df_long = df_plot.melt(id_vars=["Trophic Mode"], var_name="Code", value_name="RelAbund")
location_of = {code: pair[0] for code, pair in location_mapping.items()}
part_of = {code: pair[1] for code, pair in location_mapping.items()}
# Codes that are not in the mapping are labelled "Unknown".
df_long["Location"] = df_long["Code"].map(lambda code: location_of.get(code, "Unknown"))
df_long["Part"] = df_long["Code"].map(lambda code: part_of.get(code, "Unknown"))

# Share of each trophic mode in the summed relative abundance, largest first.
trophic_totals = df_long.groupby("Trophic Mode")["RelAbund"].sum()
total_all = trophic_totals.sum()
trophic_percent = trophic_totals.div(total_all).mul(100).sort_values(ascending=False)

# One colour per trophic mode, handed out in descending-share order; zip()
# stops at the shorter input, so surplus palette colours are simply unused.
main_colors = sns.color_palette("Paired", 6) + sns.color_palette("Set2", 6)
extended_colors = main_colors + sns.color_palette("muted", 10)
color_map = dict(zip(trophic_percent.index, extended_colors))
trophic_labels = [
    f"{mode} ({trophic_percent[mode]:.1f}%)"
    for mode in trophic_percent.index
]

# === Plot 1: By Sample Code ===
# One row per sample code and one column per trophic mode; every row is
# rescaled so that its bar adds up to 100 %.
df_site = df_long.pivot_table(index="Code", columns="Trophic Mode", values="RelAbund").fillna(0)
df_site_norm = df_site.div(df_site.sum(axis=1), axis=0) * 100

ax1 = df_site_norm.plot(kind='bar', stacked=True, figsize=(14, 6),
                        color=[color_map[col] for col in df_site.columns])
ax1.set_ylabel("Relative Abundance [%]")
ax1.set_title("Normalized Relative Abundance by Sampling Site (HTS)")
# The bar segments are drawn in pivot-column order, while trophic_percent is
# sorted by share. Build the legend text in column order and pass the handles
# explicitly so every label sits next to the colour it describes.
site_handles, _ = ax1.get_legend_handles_labels()
site_labels = [f"{mode} ({trophic_percent[mode]:.1f}%)" for mode in df_site_norm.columns]
ax1.legend(site_handles, site_labels, title="Trophic Mode",
           bbox_to_anchor=(1.05, 1), loc='upper left')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/stacked_by_sampling_site_envmicro.png", dpi=300)
plt.show()

# === Plot 2: By Location ===
# Sum the relative abundances of all sample codes belonging to a location,
# then rescale every location so that its bar adds up to 100 %.
df_loc = df_long.groupby(["Location", "Trophic Mode"])["RelAbund"].sum().reset_index()
df_loc_pivot = df_loc.pivot_table(index="Location", columns="Trophic Mode", values="RelAbund").fillna(0)
df_loc_norm = df_loc_pivot.div(df_loc_pivot.sum(axis=1), axis=0) * 100

ax2 = df_loc_norm.plot(kind='bar', stacked=True, figsize=(14, 6),
                       color=[color_map[col] for col in df_loc_pivot.columns])
ax2.set_ylabel("Relative Abundance [%]")
ax2.set_title("Normalized Relative Abundance by Location (HTS)")
# The bar segments are drawn in pivot-column order, while trophic_percent is
# sorted by share. Build the legend text in column order and pass the handles
# explicitly so every label sits next to the colour it describes.
loc_handles, _ = ax2.get_legend_handles_labels()
loc_labels = [f"{mode} ({trophic_percent[mode]:.1f}%)" for mode in df_loc_norm.columns]
ax2.legend(loc_handles, loc_labels, title="Trophic Mode",
           bbox_to_anchor=(1.05, 1), loc='upper left')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/stacked_by_location_envmicro.png", dpi=300)
plt.show()

# In [None]:
# === Trophic group Spearman correlation matrix ===
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import numpy as np

# === Load data ===
file_path = r'data/TO_BE_FILLED_fungi_FUNguild_relativeabundace.csv'
data = pd.read_csv(file_path, sep=';')
data.columns = data.columns.str.strip()
data = data.loc[:, ~data.columns.duplicated()]

# The relative-abundance table produced by the annotation step has one row per
# trophic mode ("Trophic Mode" column) and one column per sample code, with no
# 'Location' column. Turn that layout into one row per sample code with a
# 'Location' column. A table that already has 'Location' is used unchanged.
if 'Location' not in data.columns and 'Trophic Mode' in data.columns:
    # Sample codes N1B..N7S map to locations N1..N7; other codes are "Unknown".
    code_to_location = {f"N{site}{part}": f"N{site}" for site in range(1, 8) for part in "BS"}
    data = data.set_index('Trophic Mode').T
    data.columns = [str(mode).strip() for mode in data.columns]
    data.index.name = 'Code'
    data = data.reset_index()
    data['Location'] = data['Code'].map(code_to_location).fillna('Unknown')

# === Define trophic columns ===
trophic_cols = [
    'Pathotroph', 'Pathotroph-Pathotroph-Saprotroph', 'Pathotroph-Pathotroph-Saprotroph-Symbiotroph',
    'Pathotroph-Saprotroph', 'Pathotroph-Saprotroph-Symbiotroph', 'Pathotroph-Symbiotroph',
    'Saprotroph', 'Saprotroph-Saprotroph-Symbiotroph', 'Saprotroph-Symbiotroph', 'Symbiotroph'
]
# Restrict the analysis to the listed trophic modes that are present in the
# table, and report the ones that are skipped instead of failing on them.
missing_cols = [col for col in trophic_cols if col not in data.columns]
if missing_cols:
    print(f"Trophic columns not found in {file_path} and skipped: {missing_cols}")
trophic_cols = [col for col in trophic_cols if col in data.columns]
if not trophic_cols:
    raise ValueError("None of the expected trophic columns are present in the input table.")

# === Aggregate by Location ===
# Mean of each trophic column over all rows that share a location.
df_troph = data.groupby('Location')[trophic_cols].mean()

# === Compute correlation and p-value matrices ===
# Spearman's rho and its p-value are computed for every ordered pair of
# trophic columns (including each column with itself) and collected into two
# square float tables labelled by trophic group.
n_groups = len(trophic_cols)
rho_values = np.empty((n_groups, n_groups), dtype=float)
p_values = np.empty((n_groups, n_groups), dtype=float)

for row_pos, first_group in enumerate(trophic_cols):
    for col_pos, second_group in enumerate(trophic_cols):
        pair_rho, pair_p = spearmanr(df_troph[first_group], df_troph[second_group])
        rho_values[row_pos, col_pos] = pair_rho
        p_values[row_pos, col_pos] = pair_p

corr_matrix = pd.DataFrame(rho_values, index=trophic_cols, columns=trophic_cols)
pval_matrix = pd.DataFrame(p_values, index=trophic_cols, columns=trophic_cols)

# === Plot correlation heatmap ===
# Annotated heatmap of the rho values on a diverging colour scale that is
# centred at 0 and fixed to the range [-1, 1].
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    ax=corr_ax,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmin=-1,
    vmax=1,
)
corr_ax.set_title("Spearman correlation among trophic groups")
plt.setp(corr_ax.get_xticklabels(), rotation=45, ha="right")
plt.setp(corr_ax.get_yticklabels(), rotation=0)
corr_fig.tight_layout()
plt.show()

# === Plot p-value heatmap (mask non-significant) ===
# Cells whose p-value is 0.05 or larger are hidden; the remaining cells show
# the p-value with three decimals and no colour bar.
mask = pval_matrix.ge(0.05)
pval_fig, pval_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    pval_matrix,
    ax=pval_ax,
    mask=mask,
    annot=True,
    fmt=".3f",
    cmap="YlOrRd_r",
    cbar=False,
)
pval_ax.set_title("Significant Spearman p-values (p < 0.05) among trophic groups")
plt.setp(pval_ax.get_xticklabels(), rotation=45, ha="right")
plt.setp(pval_ax.get_yticklabels(), rotation=0)
pval_fig.tight_layout()
plt.show()
