In [None]:
# Download the precomputed SocialGene Neo4j dump and the HMM model files.
# Source: https://socialgene.github.io/precomputed_databases/2023_v0.4.1/aws/aws/
# 314 genomes + all MIBiG BGCs

# -p so that re-running this cell does not fail when the directory already exists
mkdir -p temp
cd temp
aws s3 --no-sign-request cp s3://socialgene-open-data/2023_v0.4.1/micromonospora/neo4j_db_micromonospora_base.dump .

# Download the HMM models into temp/hmm_models WITHOUT changing directory:
# the following cells build their paths from ${PWD} and expect the dump
# downloaded above to be in the current directory (temp/).
mkdir -p hmm_models
wget --directory-prefix=hmm_models https://socialgene-open-data.s3.amazonaws.com/2023_v0.4.1/hmm_models/socialgene_nr_hmms_file_with_cutoffs_1_of_1.hmm.gz
wget --directory-prefix=hmm_models https://socialgene-open-data.s3.amazonaws.com/2023_v0.4.1/hmm_models/socialgene_nr_hmms_file_without_cutoffs_1_of_1.hmm.gz

In [None]:
# Restore the downloaded dump into a fresh Neo4j data directory located in
# the current directory. Run this from the directory that holds the dump.
dump_path="${PWD}/neo4j_db_micromonospora_base.dump"
sg_neoloc="${PWD}"
# NOTE(review): pipeline_version is not referenced in this cell; the image tag below is pinned to 1.2.2
pipeline_version='latest'

# Start from a clean slate. WARNING: this deletes any previously loaded database.
# ${sg_neoloc:?} aborts the command instead of expanding to "/data", "/logs", ...
# should the variable ever be empty.
rm -rf \
    "${sg_neoloc:?}/data" \
    "${sg_neoloc:?}/logs" \
    "${sg_neoloc:?}/plugins" \
    "${sg_neoloc:?}/conf" \
    "${sg_neoloc:?}/import"

# mkdir because the docker image will create directories as root if they don't exist
mkdir -p \
    "$sg_neoloc/data" \
    "$sg_neoloc/logs" \
    "$sg_neoloc/plugins" \
    "$sg_neoloc/conf" \
    "$sg_neoloc/import"

# The dump is mounted as "neo4j.dump" because `neo4j-admin database load`
# looks for an archive named after the target database ("neo4j").
# `--from-path=.` is resolved inside the container; presumably the image's
# working directory is /opt/conda/bin/neo4j -- TODO confirm.
# NOTE(review): --tty requires a terminal; drop --interactive/--tty if this
# cell is executed somewhere without one.
docker run \
    --user="$(id -u):$(id -g)" \
    --interactive \
    --tty \
    --rm \
    --volume="$sg_neoloc/data:/opt/conda/bin/neo4j/data" \
    --volume="$sg_neoloc/plugins:/opt/conda/bin/neo4j/plugins" \
    --volume="$sg_neoloc/logs:/opt/conda/bin/neo4j/logs" \
    --volume="$dump_path:/opt/conda/bin/neo4j/neo4j.dump" \
    --env NEO4J_AUTH=neo4j/test \
    chasemc2/sgnf-sgpy:1.2.2 \
        neo4j-admin database load \
            --from-path=. \
            neo4j

In [None]:
# Launch the Neo4j server (with the APOC plugin) on top of the data directory
# in the current directory. Browser/HTTP is published on 7474, Bolt on 7687.
sg_neoloc="${PWD}"

# JVM heap and page cache sizes handed to the server; tune to the host's RAM.
NEO4J_server_memory_heap_initial__size='4600m'
NEO4J_server_memory_heap_max__size='4600m'
NEO4J_server_memory_pagecache_size='4g'

# Pre-create every bind-mounted directory as the current user; otherwise
# docker creates the missing ones as root.
mkdir -p \
    "$sg_neoloc/data" \
    "$sg_neoloc/logs" \
    "$sg_neoloc/import" \
    "$sg_neoloc/plugins" \
    "$sg_neoloc/conf"

# Allow import and export of files from database
cat > "$sg_neoloc/conf/apoc.conf" <<'EOF'
apoc.export.file.enabled=true
apoc.import.file.enabled=true
apoc.export.file.use_neo4j_config=false
apoc.import.file.use_neo4j_config=false
EOF

# Set import/export of files from database to $sg_neoloc/import
# (the file is overwritten, not appended to, so re-running this cell does not
# pile up duplicate settings)
cat > "$sg_neoloc/conf/neo4j.conf" <<'EOF'
server.directories.import=/opt/conda/bin/neo4j/import
server.directories.export=/opt/conda/bin/neo4j/import
EOF

# NOTE(review): data/logs/plugins are mounted at the top-level /data, /logs and
# /plugins, while import and conf are mounted under /opt/conda/bin/neo4j/.
# Confirm that neo4j:5.17.0 actually reads conf/import from those locations
# (the official image appears to expect /conf and /import) -- TODO verify.
# The procedure patterns are quoted so the shell never glob-expands "algo.*".
docker run \
    --user="$(id -u):$(id -g)" \
    -p7474:7474 -p7687:7687 \
    -v "$sg_neoloc/data:/data" \
    -v "$sg_neoloc/logs:/logs" \
    -v "$sg_neoloc/import:/opt/conda/bin/neo4j/import" \
    -v "$sg_neoloc/plugins:/plugins" \
    -v "$sg_neoloc/conf:/opt/conda/bin/neo4j/conf" \
        --env NEO4J_AUTH=neo4j/test12345 \
        --env NEO4J_PLUGINS='["apoc"]' \
        --env NEO4J_dbms_security_procedures_unrestricted='algo.*,apoc.*' \
        --env NEO4J_dbms_security_procedures_allowlist='algo.*,apoc.*' \
        --env NEO4J_server_config_strict__validation_enabled=false \
        --env NEO4J_server_memory_heap_initial__size="$NEO4J_server_memory_heap_initial__size" \
        --env NEO4J_server_memory_heap_max__size="$NEO4J_server_memory_heap_max__size" \
        --env NEO4J_server_memory_pagecache_size="$NEO4J_server_memory_pagecache_size" \
        --env NEO4J_server_jvm_additional='-XX:+ExitOnOutOfMemoryError' \
    neo4j:5.17.0

In [None]:
// Anchor: nucleotide sequences whose external_id starts with "BGC" and that
// encode a protein annotated by an hmm sourced from pfam "Trp_halogenase".
// NOTE: CALL { ... } IN TRANSACTIONS must run in an implicit (auto-commit)
// transaction, e.g. prefix the query with `:auto` in Neo4j Browser.
MATCH z1=(n:pfam {name:"Trp_halogenase"})<-[:SOURCE_DB]-(h1:hmm),
      z2=(h1)-[:ANNOTATES]-(:protein)<-[e1:ENCODES]-(n1:nucleotide)
WHERE n1.external_id STARTS WITH "BGC"
// Keep one row per (n1, e1) but carry the anchor paths along as lists;
// a plain `WITH DISTINCT n1, e1` would drop z1/z2 from scope and make the
// final RETURN fail with an undefined-variable error.
WITH n1, e1, collect(DISTINCT z1) AS z1, collect(DISTINCT z2) AS z2
CALL {
    WITH n1, e1
    // A protein on the same nucleotide sequence carrying both an antismash
    // "Condensation" annotation and an "AMP-binding"/"A-OX" annotation,
    // starting within 10000 of the anchor gene's start, on the same strand.
    MATCH z3=(an1:hmm_source:antismash)<-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(p1:protein)<-[e2:ENCODES]-(n1)
    MATCH z4=(an2:hmm_source:antismash)<-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(p1)
        WHERE an1.name = "Condensation"
            AND an2.name IN ["AMP-binding", "A-OX"]
            AND abs(e1.start - e2.start) < 10000
            AND e1.strand = e2.strand
    // A protein annotated by any amrfinder-sourced hmm, starting within
    // 50000 of the anchor gene's start, on the same strand.
    MATCH z5=(:hmm_source:amrfinder)<-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(p2:protein)<-[e3:ENCODES]-(n1)
    WHERE abs(e1.start - e3.start) < 50000
            AND e1.strand = e3.strand
    RETURN z3, z4, z5
} IN TRANSACTIONS OF 1 ROWS
RETURN z1, z2, z3, z4, z5
  

In [None]:
from pathlib import Path

from socialgene.cli.search.sea import search_bgc
from socialgene.config import env_vars

# Bolt endpoint of the Neo4j container started earlier (published with -p7687:7687).
env_vars["NEO4J_URI"] = "bolt://localhost:7687"

# Input/output locations, derived from the current user's home directory
# instead of a hard-coded /home/<user> prefix. Adjust these two constants if
# your files live elsewhere.
QUERY_GBK = Path.home() / "Downloads" / "BGC0001848.gbk"
WORKSHOP_DIR = Path.home() / "Desktop" / "workshop" / "temp"

# Paths are passed as plain strings, exactly as before.
search_bgc(
    input=str(QUERY_GBK),
    hmm_dir=str(WORKSHOP_DIR / "hmm_models"),
    outpath_clinker=str(WORKSHOP_DIR / "data.json"),
    use_neo4j_precalc=True,
    assemblies_must_have_x_matches=0.4,
    nucleotide_sequences_must_have_x_matches=0.4,
    gene_clusters_must_have_x_matches=0.4,
    break_bgc_on_gap_of=20000,
    target_bgc_padding=10000,
    max_domains_per_protein=3,
    max_outdegree=1000000,
    max_query_proteins=5,
    scatter=False,
    locus_tag_bypass_list=None,
    protein_id_bypass_list=["CAA60459.1"],
    only_culture_collection=False,
    frac=0.75,
    run_async=True,
    analyze_with="blastp",
)