### Microbial pangenomics
- Goal: compare the viral pangenome results with bacterial pangenomes.
- Take 5 species that are common in most metagenomes (metaGs).
- Cross them with the 10 test metaGs.


Lactobacillus amylovorus, Limosilactobacillus reuteri, Escherichia coli, RUG472 sp900545265, Bariatricus sp004560705


In [None]:
# Symlink the GTDB rs214 sourmash database and its lineage table into the current directory
db_dir=/group/ctbrowngrp/sourmash-db/gtdb-rs214
for db_file in gtdb-rs214-k21.zip gtdb-rs214.lineages.csv; do
    ln -s "${db_dir}/${db_file}" .
done

In [None]:

 
# Pull the lineage rows of each of the five target species out of the GTDB
# taxonomy table (case-insensitive match at species rank). Each output CSV is
# used afterwards as a picklist keyed on its `ident` column.

# Escherichia coli (the query must match the species the output file is named for)
sourmash tax grep -i 'Escherichia coli' \
-t gtdb-rs214.lineages.csv -r species -o s__ecoli.csv
# Lactobacillus amylovorus
sourmash tax grep -i 'lactobacillus amylovorus' \
-t gtdb-rs214.lineages.csv -r species -o s__lamylovorus.csv
# Limosilactobacillus reuteri
sourmash tax grep -i 'limosilactobacillus reuteri' \
-t gtdb-rs214.lineages.csv -r species -o s__lreuteri.csv
# RUG472 sp900545265
sourmash tax grep -i 'RUG472 sp900545265' \
-t gtdb-rs214.lineages.csv -r species -o s__RUG472.csv
# Bariatricus sp004560705
sourmash tax grep -i 'Bariatricus sp004560705' \
-t gtdb-rs214.lineages.csv -r species -o s__bariatricus.csv

In [None]:
# Extract the signatures listed in the E. coli picklist from the GTDB k=21 database
picklist=s__ecoli.csv
sourmash sig cat gtdb-rs214-k21.zip \
    --picklist "${picklist}:ident:ident" \
    -o gtdb.ecoli.k21.sig.zip

In [None]:
# Snakefile
import os
import pandas as pd

# Species clusters: one per picklist CSV (s__<ident>.csv) found in the results directory
CLUSTERS, = glob_wildcards('../results/microbial_pangenomic_test/s__{ident}.csv')

# Test metagenomes, given as run accessions
METAGS = ['SRR8960980', 'SRR8960440', 'SRR8960721', 'SRR8960326', 'SRR8960915', 'SRR8960631', 'SRR11125655', 'SRR8960731', 'SRR8960200', 'SRR8960303']

# sourmash database the per-species signatures are extracted from
GTDB = '/group/ctbrowngrp/sourmash-db/gtdb-rs214/gtdb-rs214-k21.zip'


# Constrain the wildcards the rules actually use ({ident} and {metag}); raw
# strings keep the regex escapes intact. {metag} is limited to run accessions
# of the form used in METAGS so '{ident}x{metag}.txt' is split at the accession.
wildcard_constraints:
    ident=r'\w+',
    metag=r'[SED]RR\d+',

# Default target: every species-by-metagenome classification, plus the
# classification of each pangenome sketch against its own rank table.
rule all:
    input:
        expand(
            '../results/microbial_pangenomic_test/classify/{ident}x{metag}.txt',
            ident=CLUSTERS,
            metag=METAGS,
        ),
        expand(
            '../results/microbial_pangenomic_test/classify/{ident}.pang.txt',
            ident=CLUSTERS,
        ),


# Extract the signatures named in a species picklist from the GTDB database
rule pick_contigs:
    input:
        picklist='../results/microbial_pangenomic_test/s__{ident}.csv',
    output:
        sig='../results/microbial_pangenomic_test/{ident}.zip',
    conda:
        'branchwater'
    shell:
        """
        sourmash sig cat {GTDB} \
            --picklist {input.picklist}:ident:ident \
            -o {output.sig}
        """

# Combine all signatures of one species into a single pangenome sketch (k=21)
rule pang_merge:
    input:
        sig="../results/microbial_pangenomic_test/{ident}.zip",
    output:
        pang='../results/microbial_pangenomic_test/{ident}.pang.sig.gz',
    conda:
        'branchwater'
    threads: 1
    shell:
        """
        sourmash scripts pangenome_merge {input.sig} \
            -k 21 -o {output.pang}
        """

# Build the rank table CSV from a species' pangenome sketch (k=21)
rule ranktable:
    input:
        sig="../results/microbial_pangenomic_test/{ident}.pang.sig.gz",
    output:
        rankt='../results/microbial_pangenomic_test/{ident}.rankt.csv',
    conda:
        'branchwater'
    threads: 1
    shell:
        """
        sourmash scripts pangenome_ranktable {input.sig} \
            -k 21 -o {output.rankt}
        """

# Classify each test metagenome sketch against each species' rank table;
# the tool's stdout is captured as the result file.
rule classify:
    input:
        rankt='../results/microbial_pangenomic_test/{ident}.rankt.csv',
        metag='../results/sourmash/sketches/read_s100/{metag}.sig.gz',
    output:
        txt='../results/microbial_pangenomic_test/classify/{ident}x{metag}.txt',
    conda:
        'branchwater'
    threads: 1
    shell:
        """
        sourmash scripts pangenome_classify -k 21 \
            {input.metag} {input.rankt} > {output.txt}
        """

# Classify each pangenome sketch against its own rank table;
# the tool's stdout is captured as the result file.
rule classify_pang:
    input:
        rankt='../results/microbial_pangenomic_test/{ident}.rankt.csv',
        sig='../results/microbial_pangenomic_test/{ident}.pang.sig.gz',
    output:
        txt='../results/microbial_pangenomic_test/classify/{ident}.pang.txt',
    conda:
        'branchwater'
    threads: 1
    shell:
        """
        sourmash scripts pangenome_classify -k 21 \
            {input.sig} {input.rankt} > {output.txt}
        """

In [None]:
# Add the new MAGs from pig so we can see the pangenome sketches from those.
# Signatures are picked from the MAG collection by name, using the
# `user_genome` column of each species' list.

# F. prausnitzii: 7 matches
sourmash sig extract \
 --picklist F_prausnitzii_new.txt:user_genome:name \
 MAGs.drep999.zip -o F_prausnitzii_new.zip

# E. coli: 43 matches
sourmash sig extract \
 --picklist E_coli_new.txt:user_genome:name \
 MAGs.drep999.zip -o E_coli_new.zip

# Combine each existing species collection with its new MAGs.
# The E. coli output must be E_coli.all.zip: that is the file the later
# pangenome_ranktable step for "all the ecoli genomes" reads.
sourmash sig cat Faecalibacterium_prausnitzii_E.zip F_prausnitzii_new.zip -o F_prausnitzii.all.zip
sourmash sig cat Escherichia_coli.zip E_coli_new.zip -o E_coli.all.zip

In [None]:
# Build rank tables from the new-MAG signature sets (k=21)
sourmash scripts pangenome_ranktable E_coli_new.zip -o E_coli_new.rankt.csv -k 21
sourmash scripts pangenome_ranktable F_prausnitzii_new.zip -o F_prausnitzii.new.rankt.csv -k 21


# E. coli rank table against the pig input list
python ../../../workflow/scripts/calc-hash-presence.py E_coli_new.rankt.csv \
../../../workflow/input_files_pig_100.txt --scaled=1000 -k 21 -o E_coli_new.x.pig.dump

# E. coli rank table against the human input list
python ../../../workflow/scripts/calc-hash-presence.py E_coli_new.rankt.csv \
../../../workflow/input_files_human.txt --scaled=1000 -k 21 -o E_coli_new.x.human.dump

# F. prausnitzii rank table against the pig input list; reads the rank table
# written above (F_prausnitzii.new.rankt.csv).
# NOTE(review): confirm no separate F_prausnitzii.rankt.csv was intended here.
python ../../../workflow/scripts/calc-hash-presence.py \
F_prausnitzii.new.rankt.csv ../../../workflow/input_files_pig_100.txt \
--scaled=1000 -k 21 -o F_prausnitzii.x.pig.dump


# Compare the human and pig dumps for E. coli
python ../../../workflow/scripts/parse-dump.py \
--dump-files-1 E_coli_new.x.human.dump \
--dump-files-2 E_coli_new.x.pig.dump > E_coli_new.cmp.tsv

In [None]:
# Hash presence of the L. amylovorus rank table across the files listed in input_files_pig_100.txt
species=Lactobacillus_amylovorus
python scripts/calc-hash-presence.py \
    "../results/pangenome/pangenome_sketch/${species}.rankt.csv" \
    input_files_pig_100.txt --scaled=1000 -k 21 \
    -o "../results/pangenome/dmp_pig/${species}.x.pig.dump"

# Compare the human and pig dumps for the same species.
# NOTE(review): these paths look relative to a different working directory
# than the command above — confirm where each command is run from.
python ../../workflow/scripts/parse-dump.py \
    --dump-files-1 "dmp_human/${species}.x.human.dump" \
    --dump-files-2 "dmp_pig/${species}.x.pig.dump" > "dmp/${species}.cmp.tsv"


In [None]:
# Same comparison once more, using the rank table built from all E. coli genomes
workflow_dir=../../../workflow
rankt=E_coli.all.rankt.csv

sourmash scripts pangenome_ranktable E_coli.all.zip -o "${rankt}" -k 21

# Pig dump, then human dump, then the comparison; each step runs only if the previous one succeeded
python "${workflow_dir}/scripts/calc-hash-presence.py" "${rankt}" \
    "${workflow_dir}/input_files_pig_100.txt" --scaled=1000 -k 21 \
    -o E_coli_all.x.pig.dump \
&& python "${workflow_dir}/scripts/calc-hash-presence.py" "${rankt}" \
    "${workflow_dir}/input_files_human.txt" --scaled=1000 -k 21 \
    -o E_coli_all.x.human.dump \
&& python "${workflow_dir}/scripts/parse-dump.py" \
    --dump-files-1 E_coli_all.x.human.dump \
    --dump-files-2 E_coli_all.x.pig.dump > E_coli_all.cmp.tsv