In [5]:
import sys
sys.path.append("/home/tagore/repos/ai/scripts")
import os
import time
import ollama
import chromadb
from generate_embeddings import *

In [6]:
# Example usage
documents_directory = '/home/tagore/repos/ai/data/processed/processed_texts/Bookshelf_NBK279690.txt'
collection_name = "blast_db"
database_file = '/home/tagore/repos/ai/data/processed/embeddings/chrome_db'
collection = create_embeddings_and_store_in_chroma(documents_directory, collection_name, database_path=database_file)

UniqueConstraintError: Collection blast_db already exists

In [17]:
documents_directory2 = '/home/tagore/repos/ai/data/processed/texts/Bookshelf_NBK279690.pdf'
collection_name2 = "blast_db_unprocessed"
database_file2 = '/home/tagore/repos/ai/data/processed/embeddings/chrome_db_unprocessed'
collection = create_embeddings_and_store_in_chroma(documents_directory2, collection_name2, database_path=database_file2)

Processed document 1/105. Elapsed Time: 13.06 seconds. Estimated Time Left: 1357.89 seconds.
Processed document 2/105. Elapsed Time: 7.84 seconds. Estimated Time Left: 807.98 seconds.
Processed document 3/105. Elapsed Time: 1.42 seconds. Estimated Time Left: 144.76 seconds.
Processed document 4/105. Elapsed Time: 0.36 seconds. Estimated Time Left: 36.50 seconds.
Processed document 5/105. Elapsed Time: 1.18 seconds. Estimated Time Left: 118.08 seconds.
Processed document 6/105. Elapsed Time: 17.08 seconds. Estimated Time Left: 1690.73 seconds.
Processed document 7/105. Elapsed Time: 7.69 seconds. Estimated Time Left: 753.89 seconds.
Processed document 8/105. Elapsed Time: 7.78 seconds. Estimated Time Left: 754.52 seconds.
Processed document 9/105. Elapsed Time: 8.86 seconds. Estimated Time Left: 850.40 seconds.
Processed document 10/105. Elapsed Time: 0.69 seconds. Estimated Time Left: 65.87 seconds.
Processed document 11/105. Elapsed Time: 8.09 seconds. Estimated Time Left: 760.40 seco

In [21]:
client = chromadb.PersistentClient(path=database_file)
collection = client.get_collection(name=collection_name)
client2 = chromadb.PersistentClient(path=database_file2)
collection2 = client2.get_collection(name=collection_name2)

In [22]:
blast_dna_questions = [
    "Which BLAST database (e.g., nt, nr, refseq_genomic) is most suitable for my DNA sequence search?",
    "What is the difference between nucleotide (nt) and protein (nr) databases in BLAST, and when should I use each?",
    "How often are the BLAST databases updated, and how can I access the latest versions?",
    "What parameters should I consider when setting up a BLAST search for DNA sequences?",
    "How do I optimize BLAST parameters (e.g., word size, gap penalties) for DNA sequence searches?",
    "What are the implications of choosing different BLAST algorithms (e.g., BLASTn, BLASTx) for DNA sequence queries?",
    "How do I interpret the BLAST results to understand the significance of matches?",
    "What are E-values, bit scores, and identity percentages in BLAST results, and how do they influence my interpretation?",
    "How can I filter and prioritize BLAST results to focus on the most relevant matches?",
    "How can I download specific subsets of BLAST databases for local searches?",
    "What tools or resources are available for managing and updating local copies of BLAST databases?",
    "How do I troubleshoot issues related to database compatibility or updates?",
    "How can I perform batch or high-throughput searches using BLAST databases?",
    "Are there specialized databases or tools within NCBI for specific types of DNA sequence searches (e.g., metagenomic databases)?",
    "What are the best practices for integrating BLAST search results into downstream analyses or workflows?",
    "What developments or updates are planned for NCBI BLAST databases and tools that may impact my research?",
    "How can I leverage cloud-based or distributed computing resources for large-scale DNA sequence searches using BLAST?"
]

In [15]:
# an example prompt
prompt = blast_dna_questions[0]

# generate an embedding for the prompt and retrieve the most relevant doc
response = ollama.embeddings(
  prompt=prompt,
  model="mxbai-embed-large"
)
results = collection.query(
  query_embeddings=[response["embedding"]],
  n_results=3
)
data = results['documents'][0][0]

In [16]:
# generate a response combining the prompt and data we retrieved in step 2
output = ollama.generate(
  model="llama3",
  prompt=f"Using this data: {data}. Respond to this prompt: {prompt}"
)

print(output['response'])

A great question about filtering and prioritizing BLAST results!

When dealing with large numbers of BLAST hits, it's essential to focus on the most relevant matches that are likely to be biologically meaningful. Here are some strategies to help you filter and prioritize your BLAST results:

**1. Use E-value threshold**: E-values represent the probability of observing a match by chance. Set a reasonable E-value threshold (e.g., 1e-5 or lower) to filter out low-scoring matches that are unlikely to be significant.

**2. Adjust the Expectation value (E) and Maximum Number of Targets to Report (Max Target)**: In the BLAST parameters, you can set the maximum number of targets to report (`Max Target`) and the expectation value (`E`). For example, set `E` to 1e-5 and `Max Target` to 10 or 20. This will give you a more manageable list of hits.

**3. Filter by Sequence Identity**: Use the `seqid` filter (available in some BLAST tools) to retain only matches with a certain percentage sequence id