In [None]:
# One-time environment setup: QIIME 2 container image, SILVA classifier and PICRUSt2.
# NOTE: the job script in this notebook reads the container image and the classifier
# from ${HOME}/tool, so run the two downloads from that directory (or move the files there).
module load singularity
singularity pull docker://quay.io/qiime2/amplicon:2024.5

wget https://data.qiime2.org/classifiers/sklearn-1.4.2/silva/silva-138-99-nb-classifier.qza

# Install PICRUSt2 v2.6.2 from source into its own conda environment.
wget https://github.com/picrust/picrust2/archive/v2.6.2.tar.gz
# Remove only the tarball that was just downloaded, and only after a successful
# extraction; a blanket `rm -f *.gz` would also delete any other gzip files
# (e.g. FASTQ data) that happen to be in the current directory.
tar xzf v2.6.2.tar.gz && rm -f v2.6.2.tar.gz
cd picrust2-2.6.2
conda env create -f picrust2-env.yaml; conda activate picrust2
pip install --editable .
# Point TMPDIR at a directory that is guaranteed to exist before running the tests.
export TMPDIR=${HOME}/tmp
mkdir -p "${TMPDIR}"
pytest


In [None]:
#!/bin/bash
#PJM -L rscunit=cx
#PJM -L rscgrp=cx-share
#PJM -L elapse=168:00:00
#PJM -L jobenv=singularity

module load singularity

# Build the paired-end manifest (sample-id, absolute path, read direction) from the
# files under data/. Files are expected to be named <sample>_R1.fastq.gz (forward)
# and <sample>_R2.fastq.gz (reverse).
# The direction is taken from the file name itself rather than from the position in
# the listing, so a stray or unpaired file cannot shift the forward/reverse labels
# of every following read. Files that do not match the naming scheme are skipped.
ls data/* \
| awk -v P="${PWD}" '
    BEGIN { print "sample-id,absolute-filepath,direction" }
    {
        path = $0
        if (path ~ /_R1\.fastq\.gz$/)      { direction = "forward" }
        else if (path ~ /_R2\.fastq\.gz$/) { direction = "reverse" }
        else                               { next }

        # Sample id = file name without the data/ prefix and the read suffix.
        sample = path
        sub(/^data\//, "", sample)
        sub(/_R[12]\.fastq\.gz$/, "", sample)

        print sample "," P "/" path "," direction
    }' \
> ManifestFile.csv

# QIIME 2 processing: import -> summarize -> denoise -> classify -> filter ->
# collapse to genus level -> relative frequency -> export as TSV.

# Container image holding QIIME 2 (amplicon distribution 2024.5).
QIIME_SIF="${HOME}/tool/amplicon_2024.5.sif"

# Run the given command inside the QIIME 2 container with the current working
# directory bind-mounted at the same path. Paths are quoted so that directories
# containing whitespace do not get split into several arguments.
in_qiime() {
    singularity exec --bind "${PWD}:${PWD}" "${QIIME_SIF}" "$@"
}

# Import the reads listed in the manifest
mkdir -p "${PWD}/tmp"
in_qiime qiime tools import \
    --type 'SampleData[PairedEndSequencesWithQuality]' \
    --input-path ManifestFile.csv \
    --output-path tmp/demux.qza \
    --input-format PairedEndFastqManifestPhred33

# Visualization of the demultiplexed reads
in_qiime qiime demux summarize \
    --i-data tmp/demux.qza \
    --o-visualization tmp/demux.qzv

# PCR error correction (DADA2 denoising); both reads are truncated at 250 bases.
# NOTE(review): --p-n-threads 0 is understood to mean "use all available cores"
# per the q2-dada2 documentation -- confirm for the installed version.
in_qiime qiime dada2 denoise-paired \
    --i-demultiplexed-seqs tmp/demux.qza \
    --p-trunc-len-f 250 \
    --p-trunc-len-r 250 \
    --p-n-threads 0 \
    --o-table tmp/table.qza \
    --o-denoising-stats tmp/stats.qza \
    --o-representative-sequences tmp/rep-seqs.qza \
    --verbose

# Taxonomic classification with the pre-trained SILVA 138 classifier
in_qiime qiime feature-classifier classify-sklearn \
    --i-classifier "${HOME}/tool/silva-138-99-nb-classifier.qza" \
    --i-reads tmp/rep-seqs.qza \
    --o-classification tmp/taxonomy.qza

# Remove samples with a low read count (total frequency < 1500) ...
in_qiime qiime feature-table filter-samples \
    --i-table tmp/table.qza \
    --p-min-frequency 1500 \
    --o-filtered-table tmp/selected_table.qza
# ... then drop features whose total frequency fell below 1 after that filtering.
in_qiime qiime feature-table filter-features \
    --i-table tmp/selected_table.qza \
    --p-min-frequency 1 \
    --o-filtered-table tmp/selected_table2.qza

# Collapse at the genus level (taxonomy level 6)
in_qiime qiime taxa collapse \
    --i-table tmp/selected_table2.qza \
    --i-taxonomy tmp/taxonomy.qza \
    --p-level 6 \
    --o-collapsed-table tmp/collapsed_table.qza

# Normalization: convert counts to relative frequencies
in_qiime qiime feature-table relative-frequency \
    --i-table tmp/collapsed_table.qza \
    --o-relative-frequency-table tmp/relfreq_table.qza

# Output: export the relative-frequency table and convert it to Result/table.tsv
mkdir -p "${PWD}/Result"
in_qiime qiime tools export \
    --input-path tmp/relfreq_table.qza \
    --output-path tmp
in_qiime biom convert \
    -i tmp/feature-table.biom \
    -o Result/table.tsv \
    --to-tsv

# picrust2
# Functional prediction with PICRUSt2, run separately for each genus of interest.
# NOTE(review): picrust2_pipeline.py is invoked on the host, outside the container,
# and this script does not activate a conda environment -- confirm that the
# picrust2 environment is active when the job runs.

qiime_sif="${HOME}/tool/amplicon_2024.5.sif"

# Export the filtered count table and convert it to TSV.
singularity exec \
    --bind "${PWD}:${PWD}" \
    "${qiime_sif}" \
    qiime tools export \
        --input-path tmp/selected_table2.qza \
        --output-path tmp
singularity exec \
    --bind "${PWD}:${PWD}" \
    "${qiime_sif}" \
    biom convert \
        -i tmp/feature-table.biom \
        -o tmp/feature-table.tsv \
        --to-tsv
# Drop the first line of the converted table so that the column header becomes
# line 1 (it is copied with `head -n 1` for every per-genus table below).
sed -i '1d' tmp/feature-table.tsv

# Export the representative sequences (tmp/dna-sequences.fasta) and the
# taxonomy assignments (tmp/taxonomy.tsv).
singularity exec \
    --bind "${PWD}:${PWD}" \
    "${qiime_sif}" \
    qiime tools export \
        --input-path tmp/rep-seqs.qza \
        --output-path tmp
singularity exec \
    --bind "${PWD}:${PWD}" \
    "${qiime_sif}" \
    qiime tools export \
        --input-path tmp/taxonomy.qza \
        --output-path tmp

# Run PICRUSt2 on the features assigned to one genus.
#   $1  literal text to look for in the taxonomy column (e.g. g__Faecalibacterium)
#   $2  label used for the output directory Result/<label> and the final
#       KO table Result/<label>/KO_metagenome_out/<label>.tsv
run_picrust2_for_genus() {
    local genus_tag="$1"
    local label="$2"
    local ko_dir="Result/${label}/KO_metagenome_out"

    # taxonomy.tsv is tab-separated and the taxonomy string may contain spaces
    # ("d__Bacteria; p__..."), so split on tabs only; index() does a literal
    # match, which avoids having to escape the brackets in "[Eubacterium]".
    awk -F'\t' -v tag="${genus_tag}" 'index($2, tag) > 0 { print $1 }' \
        tmp/taxonomy.tsv > tmp/target_ids.txt
    if [ ! -s tmp/target_ids.txt ]; then
        echo "No features assigned to ${label}; skipping PICRUSt2." >&2
        return 0
    fi

    # Sequences and counts restricted to the selected feature ids.
    grep -F -A 1 -f tmp/target_ids.txt tmp/dna-sequences.fasta | grep -v "^--" > tmp/selected.fasta
    head -n 1 tmp/feature-table.tsv > tmp/selected.tsv
    grep -F -f tmp/target_ids.txt tmp/feature-table.tsv >> tmp/selected.tsv

    picrust2_pipeline.py \
        -s tmp/selected.fasta \
        -i tmp/selected.tsv \
        -o "Result/${label}" \
        -p 16 \
    || return 1

    # Unpack the unstratified KO prediction and name it after the genus.
    gunzip "${ko_dir}/pred_metagenome_unstrat.tsv.gz"
    mv "${ko_dir}/pred_metagenome_unstrat.tsv" "${ko_dir}/${label}.tsv"
}

mkdir -p Result
run_picrust2_for_genus 'g__Faecalibacterium' Faecalibacterium
run_picrust2_for_genus 'g__Fusicatenibacter' Fusicatenibacter
run_picrust2_for_genus 'g__[Eubacterium]_coprostanoligenes_group' Eubacterium_coprostanoligenes_group


In [None]:
using CSV, DataFrames
using JLD2
using SGCRNAs


# Load the exported table; the column header is on the second line of the file.
Data = CSV.read("Result/table.tsv", DataFrame; header=2, comment="", delim='\t');
rename!(Data, 1 => :Taxon);

# Keep only rows classified at the genus level, excluding the "uncultured" and
# "Incertae" placeholder genera.
IsNamedGenus = [
    occursin("g__", t) && !occursin("g__uncultured", t) && !occursin("g__Incertae", t)
    for t in Data.Taxon
];
Data = Data[IsNamedGenus, :];
# Genus name = the part of the taxonomy string that follows ";g__".
insertcols!(Data, 2, :Genus => [string(split(t, ";g__")[2]) for t in Data.Taxon]);

# CGM comes from SGCRNAs; it is given the genus names and the numeric columns
# (column 3 onwards). NOTE(review): the two results are presumably correlation
# and gradient matrices -- confirm against the SGCRNAs documentation.
CorData, GradData = CGM(Data.Genus, Matrix(Data[:, 3:end]); fn="Result/");
# CorData = load_object("Result/_cor.jld2");  GradData = load_object("Result/_grad.jld2");

# Drop the entries whose CorData column sums to zero (same mask on both axes).
Q = [sum(col) != 0.0 for col in eachcol(CorData)]
CorData = CorData[Q, Q]
GradData = GradData[Q, Q]

# spectral clustering
clust, pos, edge_data = SpectralClustering(CorData, GradData; tNodeNum=5, pcas=3, nNeighbors=1);
save_object("Result/_scdata.jld2", (clust, pos, edge_data));
# clust, pos, edge_data = load_object("Result/_scdata.jld2");

# set parameter
# Print the largest module id of every clustering result, then the size of each
# module in the first result, to help choose which result (d) to use.
foreach(c -> println(maximum(c)), clust)
for m in 1:maximum(clust[1])
    println(m, "-", count(==(m), clust[1]))
end

d = 1; k = maximum(clust[d]);
TaxonClust = DataFrame(Taxon=names(edge_data), Module=clust[d]);
CSV.write("Result/TaxonCluster.tsv", TaxonClust; delim='\t', writeheader=true)

# draw network
nw, new_pos, cnctdf, new_clust, score = SetNetwork(edge_data, clust[d], pos; il=collect(1:k));
save_object("Result/_nwdata.jld2", (nw, new_pos, cnctdf, new_clust, score, edge_data))
# (nw, new_pos, cnctdf, new_clust, score, edge_data) = load_object("Result/_nwdata.jld2"); k = maximum(new_clust);
DrawNetwork(
    "Result/MG_AllNetWork.png", nw, new_pos, cnctdf, new_clust, k;
    node_scores=score,
    edge_mode=:ALL, edge_threshold=0.0,
    x_size=30, y_size=30,
)
# Same network, with every node that appears in an edge labelled.
DrawNetwork(
    "Result/MG_AllNetWork_labeled.png", nw, new_pos, cnctdf, new_clust, k;
    node_scores=score,
    node_labels=sort(unique(vcat(cnctdf.e1, cnctdf.e2))),
    edge_mode=:ALL, edge_threshold=0.0,
    x_size=30, y_size=30,
)


# WGCNA

In [None]:
# Suppress warnings for the session and attach the required packages quietly.
options(warn = -1)
Pkgs <- c("dplyr", "stringr", "WGCNA")
for (pkg in Pkgs) {
    suppressMessages(library(pkg, character.only = TRUE))
}
options(stringsAsFactors = FALSE)
# enableWGCNAThreads()

# Read the exported table, skipping its first line so that the second line is
# used as the column header.
Data <- read.delim("Result/table.tsv", skip = 1)
names(Data)[1] <- "Taxon"

# Keep only rows classified at the genus level, excluding the "uncultured" and
# "Incertae" placeholder genera.
Data <- filter(
    Data,
    str_detect(Taxon, "g__") &
        !str_detect(Taxon, "g__uncultured") &
        !str_detect(Taxon, "g__Incertae")
)

# Use the genus name (the text after ";g__") as row name, drop the taxonomy
# column and transpose so that genera become the columns.
rownames(Data) <- str_split_fixed(Data$Taxon, ";g__", 2)[, 2]
Data <- t(select(Data, -Taxon))

# Scan candidate soft-thresholding powers (1..50) and plot the scale-free
# topology fit for each, with a reference line at 0.90.
Beta <- c(1:50)
Sft <- pickSoftThreshold(Data, powerVector=Beta)
# The target is a .png file with pixel dimensions, so open a png() device like
# the other figures in this script; pdf() would write a PDF under a .png name
# and read 960 x 480 as inches.
png("Result/WGCNA_ScaleIndependence_MG.png", width=960, height=480)
    # Empty frame first (type="n"); the powers are then drawn as text labels.
    plot(
            Sft$fitIndices[,1], -sign(Sft$fitIndices[,3])*Sft$fitIndices[,2],
            xlab="Soft Threshold (power)", ylab="Scale Free Topology Model Fit,signed R^2", type="n",
            main=paste("Scale independence")
        )
    text(
            Sft$fitIndices[,1], -sign(Sft$fitIndices[,3])*Sft$fitIndices[,2],
            labels=Beta, cex=0.9, col="red"
        )
    abline(h=0.90,col="red")
dev.off()

# Build the network with the chosen soft-thresholding power and check how well
# the resulting connectivity follows a scale-free topology.
Beta <- 13
Adj <- adjacency(Data, power = Beta)
k <- as.vector(apply(Adj, 2, sum, na.rm = TRUE))
png("Result/WGCNA_ScaleFreePlot_MG.png", width = 480, height = 480)
scaleFreePlot(k, main = "Check scale free topology\n")
dev.off()

# Hierarchical clustering on the topological-overlap dissimilarity, followed by
# dynamic tree cutting into modules.
TOM <- TOMsimilarity(Adj)
DissTOM <- 1 - TOM
GeneTree <- hclust(as.dist(DissTOM), method = "average")
DeepSplit <- 4
MinModuleSize <- 10
DynamicMods <- cutreeDynamic(
    dendro = GeneTree,
    distM = DissTOM,
    deepSplit = DeepSplit,
    pamStage = TRUE,
    pamRespectsDendro = TRUE,
    minClusterSize = MinModuleSize
)
DynamicColors <- labels2colors(DynamicMods)
png("Result/WGCNA_Dendrogram_MG_1.png", width = 960, height = 480)
plotDendroAndColors(
    dendro = GeneTree,
    colors = DynamicColors,
    groupLabels = "Dynamic Tree Cut",
    dendroLabels = FALSE, hang = 0.03,
    addGuide = TRUE, guideHang = 0.05,
    main = "Gene dendrogram and module colors"
)
dev.off()

# Cluster the module eigengenes; the red line marks the merge threshold that is
# passed to mergeCloseModules() below.
MEList <- moduleEigengenes(Data, colors = DynamicColors)
MEs <- MEList$eigengenes
MEDiss <- 1 - cor(MEs)
METree <- hclust(as.dist(MEDiss), method = "average")
MEDissThres <- 0.85
png("Result/WGCNA_ClusteringModule_MG.png", width = 960, height = 480)
plot(
    METree,
    main = "Clustering of module eigengenes",
    xlab = "", sub = ""
)
abline(h = MEDissThres, col = "red")
dev.off()

# Merge modules at the chosen cut height and plot the assignments before and
# after merging under the same dendrogram.
Merge <- mergeCloseModules(Data, DynamicColors, cutHeight = MEDissThres, verbose = 3)
MergedColours <- Merge$colors
MergedMEs <- Merge$newMEs
png("Result/WGCNA_Dendrogram_MG_2.png", width = 960, height = 480)
plotDendroAndColors(
    dendro = GeneTree,
    colors = cbind(DynamicColors, MergedColours),
    groupLabels = c("Dynamic Tree Cut", "Merged dynamic"),
    dendroLabels = FALSE, hang = 0.03,
    addGuide = TRUE, guideHang = 0.05
)
dev.off()

# Write the final module colour of every column of Data (one row per genus).
ColourList <- data.frame(Symbol = colnames(Data), Colour = MergedColours)
write.table(ColourList, "Result/WGCNAR.tsv", sep = "\t", row.names = FALSE)


In [None]:
# sankey
using CSV, DataFrames
using SankeyMakie
using CairoMakie

# Sankey diagram relating SGCRNA modules (left) to WGCNA module colours (right).
SGCRNA = CSV.read("Result/TaxonCluster.tsv", header=1, comment="#", delim='\t', DataFrame);
WGCNA = CSV.read("Result/WGCNAR.tsv", header=1, comment="#", delim='\t', DataFrame);
rename!(WGCNA, 1 => "Taxon");

# Outer join keeps taxa that appear in only one of the two results; their
# missing module/colour is replaced by the placeholder "NA".
Data = outerjoin(SGCRNA, WGCNA, on=:Taxon);
Data = coalesce.(Data, "NA");
Data[!, :Module] = string.(Data.Module);

# Number of taxa for every (Module, Colour) combination.
gdf = groupby(Data[:,[:Module ,:Colour]], [:Module ,:Colour]);
Data = combine(gdf, nrow);
sort!(Data);

# Node labels: all SGCRNA modules first, then all WGCNA colours.
ModuleLabel = unique(Data.Module);
ColourLabel = unique(sort(Data.Colour));
Label = vcat(ModuleLabel, ColourLabel);

# Look each side up in its own label list. Searching the combined list would
# send a colour to a module node whenever the same text occurs on both sides
# (the "NA" placeholder can), so colour nodes are offset past the module nodes.
src = [findfirst(==(m), ModuleLabel) for m in Data.Module];
dst = [length(ModuleLabel) + findfirst(==(c), ColourLabel) for c in Data.Colour];

# One (source, target, weight) link per combination.
cnct = [(src[i], dst[i], Data.nrow[i]) for i in eachindex(src)];
f, ax, s = sankey(cnct, nodelabels=Label, axis=hidden_axis(), figure=(; size=(1000, 2000)))
# hidedecorations!(ax)
# hidespines!(ax)
ax.fontsize = 16
save("Result/MG_Sankey.png", f)


# 要素比較ヒートマップ

In [None]:
using CSV, DataFrames
using Colors, CairoMakie

# Heatmap of the overlap between every SGCRNA module and every WGCNA module.
SGCRNA = CSV.read("Result/TaxonCluster.tsv", header=1, comment="#", delim='\t', DataFrame);
WGCNA = CSV.read("Result/WGCNAR.tsv", header=1, comment="#", delim='\t', DataFrame);
N_Sgcrna = maximum(SGCRNA.Module)
ClustName = sort(unique(WGCNA.Colour))
N_Wgcna = length(ClustName)

# Result[w, s] = |A ∩ B| / min(|A|, |B|), where A is the taxon set of SGCRNA
# module s and B the taxon set of WGCNA module w; the value lies in [0, 1].
Result = zeros(N_Wgcna, N_Sgcrna)
for c in 1:N_Sgcrna
    A = Set(SGCRNA.Taxon[SGCRNA.Module .== c])
    for c2 in 1:N_Wgcna
        B = Set(WGCNA.Symbol[WGCNA.Colour .== ClustName[c2]])
        smaller = min(length(A), length(B))
        # A module id with no members would give 0/0 = NaN; report no overlap instead.
        Result[c2, c] = smaller == 0 ? 0.0 : length(A ∩ B) / smaller
    end
end

fig = Figure(size=(size(Result,1)*50, size(Result,2)*30), fontsize=16)
ax = Axis(
    fig[1, 1];
    xlabel="WGCNA", ylabel="SGCRNA", title="",
    xgridvisible=false, ygridvisible=false,
    xticks=(1:size(Result,1), string.(ClustName)), xticklabelrotation=π/4,
    yticks=(1:size(Result,2), string.(1:size(Result,2))),
)
# Fix the colour range to (0, 1) so the cells use the same scale as the colorbar
# below; with the default automatic range the two would disagree whenever the
# data do not span exactly 0..1.
heatmap!(ax, Result, colormap=:heat, colorrange=(0, 1))
Colorbar(fig[:, end+1], limits=(0, 1), colormap=:heat)
save("Result/ModuleSimilarity.pdf", fig)
