In [None]:
# Soil_parameter_visualization.py
# Visualization of soil chemical and organic parameters across nursery locations
# Includes Spearman correlation matrix among soil parameters (aggregated by location)

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import seaborn as sns
import numpy as np

# === 1. Load data ===
# Every column is read as text; numeric conversion happens in step 4.
file_path = r"data/TO_BE_FILLED_soil_subset.csv"
df = pd.read_csv(file_path, sep=";", dtype=str)

# === 2. Clean column names ===
# Trim whitespace, turn spaces into underscores and spell out the unit suffixes.
cleaned_names = df.columns.str.strip()
cleaned_names = cleaned_names.str.replace(" ", "_")
cleaned_names = cleaned_names.str.replace("/%", "_percent")
cleaned_names = cleaned_names.str.replace("/kg", "_mg_per_kg")
df.columns = cleaned_names

# Collapse the doubled "mg" that the unit replacement above leaves behind.
df = df.rename(columns={
    "P_mg__mg_per_kg": "P_mg_per_kg",
    "K_mg__mg_per_kg": "K_mg_per_kg",
    "Ca_mg_mg_per_kg": "Ca_mg_per_kg",
    "Mg_mg_mg_per_kg": "Mg_mg_per_kg",
})

# === 3. Drop unnecessary columns if defined ===
drop_cols = []  # define if needed
df = df.drop(columns=drop_cols, errors="ignore")

# === 4. Convert numeric columns ===
# Everything except the two identifier columns is parsed as a number;
# decimal commas become decimal points and unparsable cells become NaN.
for col in [name for name in df.columns if name not in ("Location", "Site")]:
    decimal_point_text = df[col].astype(str).str.replace(",", ".", regex=False)
    df[col] = pd.to_numeric(decimal_point_text, errors="coerce")

# === 5. Define optimal ranges for basic nutrients ===
# Each entry maps a column name to its (minimum, maximum) reference values.
optimal_values = dict(
    pH_CaCl2=(5.5, 6.5),
    P_mg_per_kg=(80, 150),
    K_mg_per_kg=(100, 200),
    Ca_mg_per_kg=(3000, 6000),
    Mg_mg_per_kg=(150, 400),
)

# === 6. Color mapping using normalized viridis ===
# Locations are spread evenly over the viridis colormap in order of first appearance.
unique_locations = df["Location"].unique()
norm = mcolors.Normalize(vmin=0, vmax=len(unique_locations) - 1)
location_color_map = {
    location: cm.viridis(norm(rank))
    for rank, location in enumerate(unique_locations)
}
colors = list(location_color_map.values())

# === 7./8. Bar charts of soil parameters across locations ===
# The row order and the bar colours are the same for every parameter, so they
# are computed once instead of once per figure.
# NOTE(review): if a location occurs in several rows, its bars share one x
# position and are drawn on top of each other - confirm whether the values
# should be averaged per location first, as the correlation matrix does.
sorted_df = df.sort_values("Location")
bar_colors = [location_color_map[loc] for loc in sorted_df["Location"]]


def _plot_parameter_bars(param, limits=None):
    """Draw one bar chart of ``param`` against ``Location``.

    Parameters
    ----------
    param : str
        Column of ``sorted_df`` to plot.
    limits : tuple of (float, float), optional
        Lower and upper reference values. When given, they are drawn as dashed
        horizontal lines labelled "Minimum" and "Maximum" and a legend is added.

    Returns
    -------
    bool
        True when a figure was drawn, False when ``param`` is not a column.
    """
    if param not in sorted_df.columns:
        # Report the skip so a renamed or missing column does not go unnoticed.
        print(f"Skipping '{param}': column not found in the soil table.")
        return False

    plt.figure(figsize=(10, 6))
    plt.bar(sorted_df["Location"], sorted_df[param], color=bar_colors)
    if limits is not None:
        lower, upper = limits
        plt.axhline(lower, color='red', linestyle='--', label="Minimum")
        plt.axhline(upper, color='green', linestyle='--', label="Maximum")

    plt.title(f"{param} across locations", fontsize=14)
    plt.xlabel("Location", fontsize=12)
    plt.ylabel(param.replace("_", " "), fontsize=12)
    plt.xticks(rotation=45)
    if limits is not None:
        plt.legend()
    plt.tight_layout()
    plt.show()
    return True


# === 7. Plot basic nutrient parameters with min/max ===
for param, (opt_min, opt_max) in optimal_values.items():
    _plot_parameter_bars(param, limits=(opt_min, opt_max))

# === 8. Plot organic compartments (no min/max) ===
organic_params = [
    "Cox_percent", "Humus_percent", "HL_percent", "HK_percent",
    "FK_percent", "HK_FK", "SH_percent", "Q4_6"
]

for param in organic_params:
    _plot_parameter_bars(param)

# === 9. Spearman correlation matrix ===
# Rows sharing a Location are collapsed to their mean before correlating.
num_cols = df.columns.drop(['Location', 'Site'], errors='ignore').tolist()
df_avg = df.groupby("Location", as_index=False)[num_cols].mean()

# Pairwise Spearman correlation between the location-level means
corr = df_avg.loc[:, num_cols].corr(method='spearman')

# Annotated heatmap on a fixed -1..1 colour scale
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Spearman correlation among soil parameters (aggregated by location)")
fig.tight_layout()
plt.show()


In [None]:
# === Permutation-based Spearman correlation between fungal Shannon diversity and soil parameters ===

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

# === Load soil data ===
file_merged = r"data/TO_BE_FILLED_soil_subset.csv"
df = pd.read_csv(file_merged, sep=';', decimal=',')

# === Clean and convert numeric columns ===
# Column names: trimmed, spaces and slashes -> underscores, "%" spelled out.
tidy_names = df.columns.str.strip()
tidy_names = tidy_names.str.replace(" ", "_")
tidy_names = tidy_names.str.replace("/", "_")
tidy_names = tidy_names.str.replace("%", "percent")
df.columns = tidy_names

# Every column except the two identifiers is forced to float.
num_cols = [col for col in df.columns if col not in ['Location', 'Site']]
for col in num_cols:
    as_text = df[col].astype(str)
    df[col] = as_text.str.replace(',', '.').astype(float)

# === Aggregate soil parameters by Location ===
df_avg = df.groupby("Location", as_index=False)[num_cols].mean()

# === Load fungal Shannon diversity ===
fungi_file = r"data/TO_BE_FILLED_diversity_indices_subset.csv"
df_fungi = pd.read_csv(fungi_file, sep=';')
if 'Location' not in df_fungi.columns:
    # Fall back to treating the first column as the location key.
    df_fungi = df_fungi.rename(columns={df_fungi.columns[0]: 'Location'})
df_fungi_avg = df_fungi.groupby('Location', as_index=False)['Shannon'].mean()

# === Merge soil and diversity data ===
# Inner join: only locations present in both tables are kept.
df_final = df_avg.merge(df_fungi_avg, on='Location', how='inner')

# === Define permutation-based Spearman correlation function ===
def perm_spearman(x, y, n_perm=10000, seed=42):
    """Spearman correlation with a two-sided permutation p-value.

    Spearman's rho is the Pearson correlation of the (average) ranks, so the
    ranks are computed once and only the ranks of ``y`` are shuffled.

    Parameters
    ----------
    x, y : array-like
        Paired observations of equal length.
    n_perm : int, default 10000
        Number of random permutations of ``y``.
    seed : int, default 42
        Seed of a local random generator; the global NumPy random state is
        left untouched.

    Returns
    -------
    rho : float
        Observed Spearman correlation. NaN if it is undefined (fewer than two
        observations, missing values, or a constant input).
    p_value : float
        ``(count + 1) / (n_perm + 1)``, where ``count`` is the number of
        permutations with ``|rho_perm| >= |rho|``. The +1 terms keep a
        randomised test from reporting an exact zero. NaN whenever ``rho`` is.

    Raises
    ------
    ValueError
        If ``n_perm`` is smaller than 1 or ``x`` and ``y`` differ in length.
    """
    # Local import: only spearmanr is imported from scipy.stats at file level.
    from scipy.stats import rankdata

    n_perm = int(n_perm)
    if n_perm < 1:
        raise ValueError("n_perm must be a positive integer")

    x_arr = np.asarray(x, dtype=float).ravel()
    y_arr = np.asarray(y, dtype=float).ravel()
    if x_arr.shape != y_arr.shape:
        raise ValueError("x and y must have the same length")

    nan = float("nan")
    if x_arr.size < 2 or np.isnan(x_arr).any() or np.isnan(y_arr).any():
        return nan, nan

    # Centred average ranks; their Pearson correlation is Spearman's rho.
    rank_x = rankdata(x_arr)
    rank_x = rank_x - rank_x.mean()
    rank_y = rankdata(y_arr)
    rank_y = rank_y - rank_y.mean()
    denom = np.sqrt((rank_x @ rank_x) * (rank_y @ rank_y))
    if denom == 0:
        # A constant input has no rank variance: the correlation is undefined,
        # and must not be reported as "significant".
        return nan, nan

    obs_rho = float(np.clip(rank_x @ rank_y / denom, -1.0, 1.0))
    # Small tolerance so arrangements that tie with the observed value are
    # counted despite floating-point rounding.
    threshold = abs(obs_rho) - 1e-12

    rng = np.random.default_rng(seed)
    # Permute in batches to bound memory for long inputs.
    batch_size = max(1, min(n_perm, 1_000_000 // x_arr.size))
    count = 0
    done = 0
    while done < n_perm:
        batch = min(batch_size, n_perm - done)
        # Each row is an independent shuffle of the centred y ranks.
        shuffled = rng.permuted(np.tile(rank_y, (batch, 1)), axis=1)
        perm_rho = shuffled @ rank_x / denom
        count += int(np.count_nonzero(np.abs(perm_rho) >= threshold))
        done += batch

    return obs_rho, (count + 1) / (n_perm + 1)

# === Plot correlations ===
# One panel per soil parameter: scatter with regression line, titled with the
# Spearman rho and its permutation p-value.
final_num_cols = [col for col in df_final.columns if col not in ['Location', 'Shannon']]

# The grid keeps the original 4x4 layout for up to 16 parameters and grows by
# whole rows beyond that, so plt.subplot never receives an out-of-range index.
n_grid_cols = 4
n_grid_rows = max(4, int(np.ceil(len(final_num_cols) / n_grid_cols)))
plt.figure(figsize=(16, 3 * n_grid_rows))
for i, col in enumerate(final_num_cols):
    rho, p_perm = perm_spearman(df_final[col], df_final['Shannon'])
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.regplot(x=col, y='Shannon', data=df_final, scatter_kws={'s': 40})
    plt.title(f"{col}\nρ = {rho:.2f}, p_perm = {p_perm:.4f}")
    plt.xlabel(col)
    plt.ylabel("Shannon (Fungi)")
plt.tight_layout()
# The super-title is placed slightly above the axes area (y > 1) after layout.
plt.suptitle("Permutation-based Spearman correlation:\nFungal Shannon diversity vs. Soil parameters", y=1.03, fontsize=15)
plt.show()


In [None]:
# === envfit-like Spearman vectors on Bray-Curtis PCoA ===

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
from scipy.stats import spearmanr
import matplotlib.patheffects as PathEffects

# === Load OTU table and calculate relative abundances ===
otu = pd.read_csv("data/TO_BE_FILLED_fungi_OTU_table_subset.csv", sep=';')
# Columns from position 6 onward are used as per-sample counts
# (presumably the first six columns are OTU annotations - TODO confirm).
otu_counts = otu.iloc[:, 6:].apply(pd.to_numeric, errors='coerce')
otu_table = otu_counts.T  # rows = samples, columns = OTUs
otu_table.index.name = "SampleID"

# A sample without any reads would become an all-zero row, for which the
# Bray-Curtis distance is 0/0. Drop such samples before normalising.
sample_totals = otu_table.sum(axis=1)
has_reads = sample_totals > 0
if not has_reads.all():
    dropped = list(sample_totals.index[~has_reads])
    print(f"Dropping {len(dropped)} sample(s) without reads: {dropped}")
otu_table = otu_table.loc[has_reads]

# Unparsable counts (NaN after coercion) are treated as zero abundance.
otu_rel = otu_table.div(otu_table.sum(axis=1), axis=0).fillna(0)

# === Bray-Curtis PCoA ===
bc = beta_diversity("braycurtis", otu_rel.values, ids=otu_rel.index)
pcoa_res = pcoa(bc)
# Keep the first two ordination axes; copy so the result object is not modified.
pcoa_df = pcoa_res.samples.iloc[:, :2].copy()
pcoa_df.columns = ["PC1", "PC2"]
pcoa_df["SampleID"] = pcoa_df.index

# === Load metadata ===
div = pd.read_csv("data/TO_BE_FILLED_diversity_indices_subset.csv", sep=';')[['Site', 'Shannon', 'Location']]
div.columns = ['SampleID', 'Shannon', 'Location']
soil = pd.read_csv("data/TO_BE_FILLED_soil_subset.csv", sep=';', decimal=',')
# Literal (non-regex) replacements: "(" and ")" are regex metacharacters.
soil.columns = (
    soil.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace("%", "percent", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace("__", "_", regex=False)
)

# Force the measurement columns to numbers and average them per Location, as
# the other soil analyses in this file do. Without this, a location with
# several soil rows would duplicate every sample of that location in the merge.
soil_value_cols = [col for col in soil.columns if col not in ("Location", "Site")]
for col in soil_value_cols:
    soil[col] = pd.to_numeric(
        soil[col].astype(str).str.replace(",", ".", regex=False), errors="coerce"
    )
soil = soil.groupby("Location", as_index=False)[soil_value_cols].mean()

# === Merge metadata with ordination results ===
# Soil values are attached to each sample through its Location, then joined to
# the ordination coordinates by sample ID.
meta = pd.merge(div, soil, on='Location')
merged = pd.merge(pcoa_df, meta, on='SampleID')

# === Define groups of soil variables ===
anorg_vars = ['pH_CaCl2', 'P_mg_kg', 'K_mg_kg', 'Ca_mg_kg', 'Mg_mg_kg']
org_vars = ['Cox_percent', 'Humus_percent', 'HL_percent', 'HK_percent',
            'FK_percent', 'HK_FK', 'SH_percent', 'Q4_6']

# === Function to draw vectors over PCoA ===
def plot_envfit_pcoa(merged, vars_list, title, vector_color):
    """Plot PC1 vs PC2 coloured by Shannon index with correlation arrows.

    For every variable in ``vars_list`` that is a column of ``merged``, an
    arrow is drawn from the origin to (Spearman rho with PC1, Spearman rho
    with PC2). The arrow lengths are listed in a text box.

    Parameters
    ----------
    merged : pandas.DataFrame
        Must contain the columns "PC1", "PC2", "Shannon" and "SampleID".
    vars_list : list of str
        Candidate variable names; names missing from ``merged`` are ignored.
    title : str
        Used in the figure title and (lower-cased) in the text box.
    vector_color : str
        Matplotlib colour of the arrows; also printed in the text box.

    Notes
    -----
    Axis labels read the explained variance from the module-level ``pcoa_res``.
    """
    vectors = []
    for var in vars_list:
        if var not in merged.columns:
            continue
        x_corr = spearmanr(merged[var], merged["PC1"])[0]
        y_corr = spearmanr(merged[var], merged["PC2"])[0]
        if not (np.isfinite(x_corr) and np.isfinite(y_corr)):
            # An undefined correlation (e.g. constant or missing values) has
            # no drawable direction; skip it instead of plotting NaN.
            print(f"Skipping '{var}': Spearman correlation is undefined.")
            continue
        length = np.sqrt(x_corr**2 + y_corr**2)
        vectors.append((x_corr, y_corr, var, length))

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(merged["PC1"], merged["PC2"], c=merged["Shannon"], cmap="viridis", s=100, edgecolor='black')

    # Add sample labels, nudged right of each point; white halo for legibility
    for pc1, pc2, sample_id in zip(merged["PC1"], merged["PC2"], merged["SampleID"]):
        ax.text(pc1 + 0.01, pc2, sample_id, fontsize=8,
                path_effects=[PathEffects.withStroke(linewidth=3, foreground='white')])

    # Add environmental vectors; labels sit 20 % beyond the arrow tip
    for x, y, label, _ in vectors:
        ax.arrow(0, 0, x, y, color=vector_color, alpha=0.7, head_width=0.02)
        ax.text(x * 1.2, y * 1.2, label, fontsize=9, ha='center', va='center',
                path_effects=[PathEffects.withStroke(linewidth=3, foreground='white')])

    # Legend text box
    vector_legend_labels = [f"{label}: {length:.2f}" for _, _, label, length in vectors]
    legend_text = f"Vector length (Spearman ρ)\n{vector_color} = {title.lower()}\n" + "\n".join(vector_legend_labels)
    props = dict(boxstyle='round', facecolor='white', alpha=0.8)
    ax.text(0.01, 0.01, legend_text, transform=ax.transAxes, fontsize=9,
            verticalalignment='bottom', horizontalalignment='left', bbox=props)

    # Add colorbar and axis lines
    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label("Shannon index")
    ax.axhline(0, linestyle='--', color='gray')
    ax.axvline(0, linestyle='--', color='gray')

    # Titles and axes
    # .iloc: the first two entries are taken by position, independent of the
    # labels of the proportion_explained Series.
    explained = pcoa_res.proportion_explained
    ax.set_title(f"{title} + Shannon Index (PCoA Bray-Curtis)")
    ax.set_xlabel(f"PC1 ({explained.iloc[0]*100:.1f}%)")
    ax.set_ylabel(f"PC2 ({explained.iloc[1]*100:.1f}%)")
    plt.tight_layout()
    plt.show()

# === Generate plots ===
# One figure per variable group: (variables, title, arrow colour).
for soil_vars, panel_title, arrow_color in (
    (anorg_vars, "Inorganic Soil Properties", 'gray'),
    (org_vars, "Organic Soil Properties", 'blue'),
):
    plot_envfit_pcoa(merged, soil_vars, title=panel_title, vector_color=arrow_color)


In [None]:
# Trophic group vs. soil parameter Spearman correlation analysis

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# === Load data ===
data = pd.read_csv("data/TO_BE_FILLED_troficgroup_soil_subset.csv", sep=';')
data.columns = data.columns.str.strip()
# Keep only the first occurrence of any repeated column name.
is_repeat = data.columns.duplicated()
data = data.loc[:, ~is_repeat]

# === Define columns ===
trophic_cols = [
    'Pathotroph',
    'Pathotroph-Pathotroph-Saprotroph',
    'Pathotroph-Pathotroph-Saprotroph-Symbiotroph',
    'Pathotroph-Saprotroph',
    'Pathotroph-Saprotroph-Symbiotroph',
    'Pathotroph-Symbiotroph',
    'Saprotroph',
    'Saprotroph-Saprotroph-Symbiotroph',
    'Saprotroph-Symbiotroph',
    'Symbiotroph',
]

soil_cols = [
    'Cox (%)', 'Humus (%)', 'HL (%)', 'HK (%)', 'FK (%)', 'HK/FK', 'SH (%)',
    'Q4/6', 'pH CaCl2', 'P mg /kg', 'K mg /kg', 'Ca mg/kg', 'Mg mg/kg',
]

# === Aggregate by location ===
# Trophic groups are averaged per location; for soil the first value of each
# location is taken.
by_location = data.groupby("Location")
troph_means = by_location[trophic_cols].mean()
soil_means = by_location[soil_cols].first()

# === Convert to numeric (handle commas)
soil_means = soil_means.apply(
    lambda column: pd.to_numeric(column.astype(str).str.replace(',', '.'), errors='coerce')
)

# === Merge into single dataframe ===
# Both frames are indexed by Location, so they are placed side by side.
merged = pd.concat([troph_means, soil_means], axis=1)

# === Compute Spearman correlations ===
# Imported here because this cell's import list does not include os.
import os

results = []
raw_pvalues = []  # unrounded p-values, used for the significance filter

for troph in troph_means.columns:
    for soil in soil_means.columns:
        x = merged[troph]
        y = merged[soil]

        # A pair with any missing value is reported as NaN, not correlated
        # on the remaining rows.
        if x.isnull().any() or y.isnull().any():
            rho, pval = np.nan, np.nan
        else:
            rho, pval = spearmanr(x, y)

        raw_pvalues.append(pval)
        results.append({
            "Trophic group": troph,
            "Soil parameter": soil,
            "Spearman r": round(rho, 2),
            "p-value": round(pval, 4)
        })

# Explicit columns keep the table well-formed even without any result rows.
df_corr = pd.DataFrame(
    results, columns=["Trophic group", "Soil parameter", "Spearman r", "p-value"]
)

# === Save full results
# Create the output folder first so that to_csv cannot fail on a missing directory.
os.makedirs("results", exist_ok=True)
df_corr.to_csv("results/spearman_trophic_soil_full.csv", sep=';', index=False)

# === Extract significant results (p < 0.05)
# The threshold is applied to the unrounded p-values: a value such as 0.04996
# is stored as 0.05 in the table but is still significant. NaN compares False.
is_significant = np.asarray(raw_pvalues, dtype=float) < 0.05
df_sig = df_corr[is_significant]
df_sig.to_csv("results/spearman_trophic_soil_significant.csv", sep=';', index=False)

# === Optional: Plot scatterplots for significant correlations
# One figure per significant pair: soil parameter on x, trophic group on y,
# with a red regression line and no confidence band.
for _, sig_row in df_sig.iterrows():
    soil_name = sig_row["Soil parameter"]
    troph_name = sig_row["Trophic group"]

    fig, ax = plt.subplots()
    sns.regplot(
        x=merged[soil_name],
        y=merged[troph_name],
        scatter_kws={'s': 60},
        line_kws={'color': 'red'},
        ci=None,
        ax=ax,
    )
    ax.set_title(f"{troph_name} vs {soil_name}\n"
                 f"ρ = {sig_row['Spearman r']}, p = {sig_row['p-value']}")
    ax.set_xlabel(soil_name)
    ax.set_ylabel(troph_name)
    fig.tight_layout()
    plt.show()


In [None]:
# PCoA and envfit of fungal trophic groups with organic soil parameters

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa
import seaborn as sns
import os

# === Input and output paths ===
input_file = "data/Trofic_mode_fixed.csv"
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

# === Columns ===
trophic_cols = [
    'Pathotroph',
    'Pathotroph-Pathotroph-Saprotroph',
    'Pathotroph-Pathotroph-Saprotroph-Symbiotroph',
    'Pathotroph-Saprotroph',
    'Pathotroph-Saprotroph-Symbiotroph',
    'Pathotroph-Symbiotroph',
    'Saprotroph',
    'Saprotroph-Saprotroph-Symbiotroph',
    'Saprotroph-Symbiotroph',
    'Symbiotroph',
    'Unassigned',
]

# === Organic soil parameters ===
soil_params = [
    'Cox (%)', 'Humus (%)', 'HL (%)', 'HK (%)',
    'FK (%)', 'HK/FK', 'SH (%)', 'Q4/6',
]

# === Load and filter data ===
# Missing abundances count as zero; rows whose trophic columns sum to zero
# are removed, and both tables are renumbered from 0 so they stay aligned.
df = pd.read_csv(input_file, sep=';')
has_abundance = df[trophic_cols].fillna(0).sum(axis=1) > 0
filtered_df = df.loc[has_abundance].reset_index(drop=True)
abundance_data = filtered_df[trophic_cols].fillna(0)

# === Hellinger transformation ===
def hellinger_transform(df):
    """Return the Hellinger transform of an abundance table.

    Each value is divided by its row total and the square root is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Abundances with one row per sample and one column per group.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``. Rows whose total is zero are
        returned as all zeros instead of NaN from a division by zero.
    """
    values = df.to_numpy(dtype=float)
    row_sums = df.sum(axis=1).to_numpy(dtype=float)[:, None]
    # Divide only where the row total is non-zero; other rows keep the zeros
    # of the pre-filled output array.
    proportions = np.divide(
        values, row_sums, out=np.zeros_like(values), where=row_sums != 0
    )
    return pd.DataFrame(np.sqrt(proportions), columns=df.columns, index=df.index)

hellinger = hellinger_transform(abundance_data)

# === Bray-Curtis PCoA ===
# NOTE(review): Bray-Curtis is computed on Hellinger-transformed values here;
# confirm this combination is intended.
sample_ids = [f"sample_{i}" for i in range(len(filtered_df))]
distance_matrix = beta_diversity('braycurtis', hellinger, ids=sample_ids)
pcoa_results = pcoa(distance_matrix)
# Work on a copy so that adding the Location column does not modify the
# coordinate table stored inside pcoa_results.
pcoa_df = pcoa_results.samples.copy()
# Rows of filtered_df and of the ordination are in the same order, so the
# locations are attached by position.
pcoa_df['Location'] = filtered_df['Location'].values

# === Envfit (Pearson correlation) ===
# For each soil parameter: Pearson r with PC1 and with PC2, stored as a 2-D
# vector whose Euclidean length is reported as its strength.
envfit_vectors = {}
print("\n📊 Pearson correlation of soil parameters with PCoA axes:")
print(f"{'Parameter':25} | {'|r| vector length':>20}")
print("-" * 50)

for param in soil_params:
    # Accept decimal commas as well as decimal points; without this, values
    # such as "5,5" would be coerced to NaN and the parameter silently lost.
    raw_values = filtered_df[param].astype(str).str.replace(',', '.', regex=False)
    x = pd.to_numeric(raw_values, errors='coerce').to_numpy(dtype=float)
    mask = ~np.isnan(x)
    x = x[mask]
    pc1 = pcoa_df['PC1'].values[mask]
    pc2 = pcoa_df['PC2'].values[mask]
    if len(x) > 2:
        r1, _ = pearsonr(x, pc1)
        r2, _ = pearsonr(x, pc2)
        strength = np.linalg.norm([r1, r2])
        envfit_vectors[param] = (r1, r2)
        print(f"{param:25} | {strength:20.3f}")
    else:
        # Too few valid observations: store a zero vector so the parameter
        # is never selected for plotting.
        envfit_vectors[param] = (0.0, 0.0)

# === Select top parameters for visualization (|r| ≥ 0.05) ===
param_strengths = {param: np.linalg.norm(v) for param, v in envfit_vectors.items()}
sorted_params = sorted(param_strengths.items(), key=lambda item: item[1], reverse=True)
top_params = [param for param, strength in sorted_params if strength >= 0.05]

print("\nTop parameters for visualization:")
for i, param in enumerate(top_params, 1):
    print(f"{i}. {param}")

# === Plot PCoA with envfit arrows ===
def plot_envfit(pcoa_df, envfit_vectors, top_params, title, file_name):
    """Scatter PC1 vs PC2 by location, overlay correlation arrows, save as PNG.

    Parameters
    ----------
    pcoa_df : pandas.DataFrame
        Needs the columns "PC1", "PC2" and "Location".
    envfit_vectors : dict
        Maps a parameter name to its (r with PC1, r with PC2) pair.
    top_params : list of str
        Parameters to draw; each must be a key of ``envfit_vectors``.
    title : str
        First line of the figure title.
    file_name : str
        File name of the image written into the module-level ``output_dir``.

    Notes
    -----
    Explained variance is read from the module-level ``pcoa_results``.
    """
    figure, axes = plt.subplots(figsize=(12, 10))
    sns.scatterplot(data=pcoa_df, x='PC1', y='PC2', hue='Location', palette='Set2', s=80, ax=axes)

    arrow_scale = 0.5  # arrows are drawn at half the correlation magnitude
    strength_lines = []

    for name in top_params:
        r_pc1, r_pc2 = envfit_vectors[name]
        magnitude = np.linalg.norm([r_pc1, r_pc2])
        if not magnitude > 0:
            # zero-length (or undefined) vectors are not drawn
            continue

        tip_x = r_pc1 * arrow_scale
        tip_y = r_pc2 * arrow_scale
        axes.arrow(0, 0, tip_x, tip_y, color='gray', alpha=0.8,
                   head_width=0.01, length_includes_head=True)

        # Align the label so that it extends away from the origin.
        if tip_x < 0:
            h_align = 'right'
        else:
            h_align = 'left'
        if tip_y >= 0:
            v_align = 'bottom'
        else:
            v_align = 'top'

        axes.text(tip_x * 0.9, tip_y * 0.9, name, fontsize=10, ha=h_align, va=v_align,
                  bbox=dict(boxstyle='round,pad=0.2', fc='white', alpha=0.7, lw=0))

        strength_lines.append(f"{name}: |r| = {magnitude:.3f}")

    axes.axhline(0, linestyle='dotted', color='gray')
    axes.axvline(0, linestyle='dotted', color='gray')

    explained = pcoa_results.proportion_explained
    pc1_share = explained['PC1']
    pc2_share = explained['PC2']
    total_var = pc1_share + pc2_share
    axes.set_title(f"{title}\n(PC1+PC2 = {total_var*100:.2f} % variance explained)", fontsize=13)
    axes.set_xlabel(f"PC1 ({pc1_share*100:.2f} %)")
    axes.set_ylabel(f"PC2 ({pc2_share*100:.2f} %)")

    # Legend and strength box are both placed to the right of the axes.
    axes.legend(title='Location', bbox_to_anchor=(1.18, 1), loc='upper left', borderaxespad=0.)

    box_style = dict(boxstyle='round', facecolor='white', alpha=0.9)
    axes.text(1.18, 0.05, "\n".join(strength_lines), transform=axes.transAxes, fontsize=9,
              verticalalignment='bottom', bbox=box_style)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, file_name), dpi=300)
    plt.show()

# === Run plotting ===
# Draw and save the ordination with the selected soil-parameter arrows.
plot_envfit(
    pcoa_df=pcoa_df,
    envfit_vectors=envfit_vectors,
    top_params=top_params,
    title="PCoA of Fungal Trophic Groups + Envfit of Organic Soil Parameters",
    file_name="PCoA_envfit_organic_soil.png",
)
