In [1]:
import sys
import os
import time
from typing import List, Dict, Any, Optional

# from src_old.query_refinement import QueryRefinementEngine, QueryContext
# from sample_data import SampleDataGenerator
from database_schema import Paper
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from vector_database import VectorDatabase, parse_academic_paper, extract_authors, process_content, extract_behaviors, extract_breeds, determine_study_type
from database_schema import Paper, BehaviorCategories, CommonDogBreeds
import re
from datetime import datetime
from collections import defaultdict
import unicodedata
from unidecode import unidecode
import json
from langchain.document_loaders import PyPDFLoader
from pathlib import Path
import nltk
nltk.download('punkt', quiet=True)

%load_ext autoreload
%autoreload 2

In [None]:
pdf_path = "data_50_papers\Albuquerque et al. 2021.pdf"
store = False

pdf_path = Path(pdf_path)
if not pdf_path.exists():
    raise FileNotFoundError(f"PDF file not found: {pdf_path}")

parent_dir = pdf_path.parent
pdf_name = pdf_path.stem

loader = PyPDFLoader(pdf_path)
documents = loader.load()

if not documents:
    raise ValueError(f"No content extracted from {pdf_path}")

metadata = documents[0].metadata
num_pages = metadata.get('total_pages', len(documents))
full_content = "\n".join([doc.page_content for doc in documents])

paper_id = Path(pdf_path).stem
title = metadata.get('title', f"Paper from {Path(pdf_path).name}")
authors = extract_authors(documents)  

In [None]:
sections = process_content(full_content, num_pages)
sections['title']

In [None]:
if title is None or title == "":
    title = sections['title'].strip()

if store:
    # Store as json
    sections['title'] = title
    sections['metadata'] = metadata

    parent_dir_clean = parent_dir.with_name(parent_dir.name + "_clean")
    parent_dir_clean.mkdir(exist_ok=True)
    json_path = parent_dir_clean / pdf_name
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(sections, f, ensure_ascii=False, indent=4)

keywords = []
if 'keywords' in metadata:
    keywords = [kw.strip() for kw in metadata['keywords'].split(',')]

content_lower = full_content.lower()
detected_breeds = extract_breeds(content_lower)
detected_behaviors = extract_behaviors(content_lower)
study_type = determine_study_type(content_lower, keywords)

paper = Paper(
    id=paper_id,
    title=title,
    authors=authors,
    subject=metadata.get('subject', '').split(',')[0] if metadata.get('subject') else None,
    keywords=keywords,
    abstract=sections['abstract'],
    body=sections['body'],
    conclusion=sections['conclusion'],
    references=sections['references'],
    dog_breeds=detected_breeds,
    behavior_categories=detected_behaviors,
    study_type=study_type,
    publication_date=metadata.get('creationdate', ''),  # Could extract from metadata if available
    doi=metadata.get('doi'),
    # url=metadata.get('subject', '').split(',')[1] if metadata.get('subject') else None,
    metadata=metadata,
)      

In [None]:
content = getattr(paper, 'abstract', "")
content

In [2]:
vector_db = VectorDatabase(db_path="db_nomic",
                           collection_name="dog_behavior",
                           embedding_model="nomic-ai/nomic-embed-text-v1",
                           chunk_size=2)
# vector_db.clear_collection()
# vector_db.chunk_size = 2

In [4]:
# pdf_paths = list(Path("data_50_papers").glob("*.pdf"))
# print(list(pdf_paths))

# successful_papers = []
# for pdf_path in pdf_paths:
#     paper = parse_academic_paper(pdf_path, store=False)
#     successful_papers.append(paper)

success = vector_db.add_papers_from_directory("data_50_papers")
success
# vector_db.add_paper_sentences(successful_papers[0])
# vector_db.add_papers_sentences_batch(successful_papers[:3])

3995

In [5]:
vector_db.get_collection_stats(), vector_db.collection.count()

({'total_papers': 8904, 'behavior_categories': {}, 'study_types': {}}, 8904)

In [10]:
user_query = "Why does my dog bark so much when I leave the house?"
retreived_papers = vector_db.search(user_query, top_k=30)
# retreived_papers

In [None]:
for paper_id, paper_dict in vector_db.paper_dicts.items():
    print(f"Paper ID: {paper_id}")
    print(f"Number of chunks: {len(paper_dict['chunk_ids'])}")

In [None]:
vector_db.save_configs()

In [None]:
vector_db.get_paper_by_id('Albuquerque et al. 2021_abstract_0')

In [None]:
vector_db.delete_paper('Albuquerque et al. 2021')

In [None]:
vector_db.update_paper(paper)

In [None]:
vector_db.paper_dicts

# LLM

In [None]:
import ollama

# Create client with explicit host
client = ollama.Client(host='http://localhost:11434')
try:
    models = client.list()
    print("Connected successfully!")
    print(models)
except Exception as e:
    print(f"Still can't connect: {e}")

In [None]:
model_name = "llama3.2"
client.show(model_name)
print(f"LLM model {model_name} is available")

In [18]:
# def _prepare_document_context(documents) -> str:
#         """Prepare a concise context string from retrieved documents."""
#         if not documents:
#             return "No documents retrieved."
            
#         # First we group the documents by their source paper
#         papers = {}
#         for doc in documents:
#             metadata = doc.get('metadata', {})
#             paper_id = metadata.get('paper_id', 'unknown')
#             if paper_id not in papers:
#                 papers[paper_id] = {
#                     'title': doc.get('title', 'Unknown Title'),
#                     'authors': doc.get('authors', 'Unknown Authors'),
#                     'content_chunks': []
#                 }
#             # papers[paper_id].append(doc)
#             papers[paper_id]['content_chunks'].append({'chunk_idx': metadata.get('chunk_index', 10000), 
#                                                        'text': doc.get('document', '')})

#         # Sort each list by chunk index
#         for paper_idx, docs in papers.items():
#             docs = sorted(docs['content_chunks'], key=lambda x: x['chunk_idx'])
#             final_text = ""
#             for i in range(len(docs)):
#                 final_text += docs[i]['text']
#                 final_text += "[...]" if i != len(docs) or docs[i+1]['chunk_idx'] != docs[i]['chunk_idx'] else ""
#                 final_text += "\n"
#             papers[paper_idx]['content_chunks'] = final_text.strip()
        
#         # concatenate the text from all documents
#         context_parts = []
#         for paper_id, paper_info in papers.items():
#             context_parts.append(f"**Paper: {paper_info['title']}**\n{paper_info['content_chunks']}\n")
        
#         return '\n'.join(context_parts).strip()

In [19]:
# print(_prepare_document_context(retreived_papers))

In [None]:
# def _create_question_prompt(user_query: str, doc_context: str) -> str:
#     """Create prompt for analyzing query and extracting filters."""
#     return f"""# Dog Behavior Research Query Analysis

# ## Your Role
# You are an expert in dog behavior research with deep knowledge of academic literature analysis. Your task is to analyze a user's research question and relevant document excerpts to generate precise clarifying questions.

# **IMPORTANT: The questions you generate will be presented directly to the user to help refine their search. Make them user-friendly and easy to understand.**

# ## Input Data
# **User Query:** "{user_query}"

# **Retrieved Research Context:**
# ```
# {doc_context}
# ```

# ## Task Objective
# Generate clarifying questions that will be asked back to the user to improve their search results. These questions should help narrow down their research focus based on what you see in the retrieved documents.

# ## Question Generation Rules
# - Ask ONLY if clarification would significantly narrow search results
# - Base questions on patterns you see in the retrieved documents
# - Maximum 5 questions, prioritize most impactful
# - Use simple, direct language that a researcher would understand
# - If query is already specific enough, ask no questions
# - Make questions answerable in 1-2 words or a short phrase
# - Frame questions as if speaking directly to the user

# ## Examples of Good Questions:
# - "Which specific age group are you focusing on?" (with options: puppy, adult, senior)
# - "Are you interested in a particular breed type?" (with options: working dogs, toy breeds, etc.)
# - "What research methodology do you prefer?" (with options: experimental, observational, review)

# ## Required Output Format
# Respond with a JSON array where each question will be presented to the user:

# [
#     {{
#         "question": "Which age group are you most interested in?",
#         "options": ["puppies (0-1 year)", "adults (1-7 years)", "seniors (7+ years)", "all ages"],
#         "priority": 1,
#         "reasoning": "Age significantly affects facial recognition behavior in dogs"
#     }}
# ]

# If no clarifying questions are needed, return an empty array: []

# Generate user-facing questions now:"""

In [None]:
# doc_context = _prepare_document_context(retreived_papers)
# print(doc_context)

In [None]:
# analysis_prompt = _create_question_prompt(user_query, doc_context)
# print(analysis_prompt)

In [None]:
# response = client.chat(
#     model=model_name,
#     messages=[{"role": "user", "content": analysis_prompt}]
# )
# print(response['message']['content'])

In [None]:
# from pydantic import BaseModel
# class DynamicQuestion(BaseModel):
#     """A targeted question generated by LLM to gather missing information."""
#     question: str
#     # filter_type: str  # behavior, breed, age, study_type, methodology, sample_size
#     options: List[str] = []
#     priority: int = 1  # 1 = high, 2 = medium, 3 = low
#     reasoning: str = ""  # Why this question is important

# def generate_targeted_questions(user_query: str, retrieved_documents: List[Dict[str, Any]]) -> List[Dict[str, str]]:
#     """
#     Analyze user query along with retrieved documents to extract relevant filtering questions.
    
#     Args:
#         user_query: The original user query
#         retrieved_documents: List of documents retrieved from initial search
        
#     Returns:
#         List of prioritized questions to ask the user
#     """
#     doc_context = _prepare_document_context(retrieved_documents)
#     analysis_prompt = _create_question_prompt(user_query, doc_context)
    
#     try:
#         response = client.chat(
#             model=model_name,
#             messages=[{"role": "user", "content": analysis_prompt}]
#         )
#         response_text = _parse_response(response['message']['content'])
#         data = json.loads(response_text)

#         questions = []
#         for item in data:
#             questions.append(DynamicQuestion(
#                 question=item.get('question', ''),
#                 options=item.get('options', []),
#                 priority=int(item.get('priority', 3)),
#                 reasoning=item.get('reasoning', '')
#             ))
            
#         return sorted(questions, key=lambda x: x.priority)
        
#     except Exception as e:
#         print(f"Error generating questions: {e}")
#         return []
    

# def _parse_response(response_text: str) -> List[DynamicQuestion]:
#     """Parse LLM response for filter extraction."""
#     try:
#         # Clean response text
#         response_text = response_text.strip()
        
#         # Remove reasoning process if present (between <think> and </think>)
#         if '<think>' in response_text and '</think>' in response_text:
#             start_idx = response_text.find('<think>')
#             end_idx = response_text.find('</think>') + len('</think>')
#             response_text = response_text[:start_idx] + response_text[end_idx:]
#             response_text = response_text.strip()
#             print("Removed reasoning process from LLM response")
        
#         # Remove code block markers
#         if response_text.startswith('```json'):
#             response_text = response_text[7:-3]
#         elif response_text.startswith('```'):
#             response_text = response_text[3:-3]
        
#         return response_text.strip()
        
#     except Exception as e:
#         print(f"Error parsing LLM response: {e}")
#         return []

In [None]:
# generate_targeted_questions(user_query, retreived_papers)

In [None]:
# qa = {'Which age group are you most interested in?': ['puppies (0-1 year)', 'adults (1-7 years)'], 'What specific emotional expressions do you want to investigate?': ['positive emotions'], 'Are you looking for a specific breed or type of dogs?': ['herding breeds'], 'Do you have a specific research methodology in mind?': ['experimental studies']}

In [None]:
# qa_text = "\n".join([f"{q}: {', '.join(a)}" for q, a in qa.items()])
# print(qa_text)

In [None]:
# def _create_improvement_prompt(original_query: str, answers: Dict[str, str]) -> str:
#     """Create prompt for improving query with user answers."""
#     qa_text = "\n".join([f"{q}: {', '.join(a)}" for q, a in answers.items()])
    
#     return f"""Improve this research query by incorporating the user's answers to clarifying questions.

# Original Query: "{original_query}"

# User's Clarifications:
# {qa_text}

# Create an improved, more specific query that incorporates the user's clarifications while maintaining the original intent. The improved query should be natural and search-friendly.

# Respond with only the improved query text, no additional formatting or explanation."""

In [None]:
# improvement_prompt = _create_improvement_prompt(user_query, qa)
# print(improvement_prompt)

In [None]:
# response = client.chat(
#     model= model_name,
#     messages=[{"role": "user", "content": improvement_prompt}]
# )
# improved_query = _parse_response(response['message']['content'])
# print(improved_query)

In [None]:
# improved_query = "In experimental studies on herding breeds, which part of the human face do puppies (0–1 year) and adult dogs (1–7 years) focus on the most when viewing faces expressing positive emotions?"
# vector_db.search(improved_query, n_results=5)

In [None]:
# vector_db.search("How can I reduce separation anxiety-related barking in an 8-month-old Labrador who barks after the owner leaves, especially during the first hour, and shows signs of stress when the owner prepares to leave?", n_results=10)

# Answer Assessment

In [29]:
from llm_service import LLMDynamicRAG
# from interactive_llm_demo import LLMRAGDemo
from vector_database import VectorDatabase
%autoreload 2

In [38]:
vector_db = VectorDatabase(db_path="db_nomic",
                           collection_name="dog_behavior",
                           embedding_model="nomic-ai/nomic-embed-text-v1",
                           chunk_size=2)

rag = LLMDynamicRAG(
    vector_db=vector_db,
    query_analyzer_model="mistral:7b-instruct",
    answer_model="mistral:7b-instruct",
)

### Local Model

In [65]:
user_query = "Why does my dog bark so much when I leave the house?"
results = await rag.generate_direct_answer(user_query, 30)
print(results['answer'])

1. **Direct Answer**: The papers you've shared provide valuable insights into the behavior and responses of dogs to various stimuli. It appears that dogs may express stress, fear, or anxiety in response to certain sounds, such as household noises or high-frequency pest repellants.

2. **Research Evidence**: According to one study, common household noises might cause fear and anxiety in companion dogs (Stress-Related Behaviors in Companion Dogs Exposed to Common Household Noises, and Owners' Interpretations of Their Dogs' Behaviors). Another paper reports that existing research focuses on dramatic infrequent sounds like thunderstorms and fireworks, but many common household noises may also be causing fear and anxiety in resident dogs (Stress-Related Behaviors in Companion Dogs Exposed to Common Household Noises, and Owners' Interpretations of Their Dogs' Behaviors).

3. **Practical Advice**: If your dog seems to be stressed or anxious due to household noises, it might help to minimize t

In [66]:
improved_results = await rag.process_query(user_query, 30)
improved_results['questions']

[DynamicQuestion(question='Are you focusing on specific sounds produced by dogs (e.g., approach, withdrawal) or a particular context (e.g., play, defense, loneliness)?', options=['approach/withdrawal', 'specific context'], priority=1, reasoning='This will help narrow down the search results to relevant studies'),
 DynamicQuestion(question='What species are you primarily interested in, dogs or wolves?', options=['dogs', 'wolves'], priority=2, reasoning='Some research may focus on both species, but specific findings may be more relevant to one'),
 DynamicQuestion(question='Are you interested in the effects of common household noises on dogs?', options=['yes', 'no'], priority=3, reasoning='If so, this will help filter out studies focusing on dramatic infrequent sounds'),
 DynamicQuestion(question='What type of research are you looking for (e.g., experimental, observational, review)?', options=['experimental', 'observational', 'review'], priority=4, reasoning='This can help determine the n

In [None]:
# use the questiions and options to make a dictionary of questions-answers
# choose one or more answers randomly
import random
def choose_answers(questions: List[Dict[str, Any]]) -> Dict[str, str]:
    """Randomly choose answers from the provided questions."""
    chosen_answers = {}
    for question in questions:
        if 'options' in question and question['options']:
            chosen_answer = random.choice(question['options'])
            chosen_answers[question['question']] = chosen_answer
    return chosen_answers

chosen_answers = choose_answers(improved_results['questions'])
print(chosen_answers)

NameError: name 'improved_results' is not defined

In [68]:
final_answer = await rag.finalize_response(improved_results, answers)
print(final_answer['improved_query'])
print(final_answer['final_answer'])

"Why does my pet dog bark excessively when I leave the house, specifically in response to human presence?"
1. Direct Answer: It appears that excessive barking in dogs can be associated with a variety of emotions and behaviors, including excitement, boredom, anxiety, and pain, as suggested by the studies you provided.

2. Research Evidence: Higher pitched barks combined with longer bark sequences are often linked to happiness and playfulness (Yin and McCowan, 2004; Pongracz et al., 2005, 2006), but excessive barking can indicate other emotions such as anxiety or pain.

3. Practical Advice: To help reduce excessive barking, it's important to identify the underlying cause. This could involve providing more mental and physical stimulation, reducing boredom, managing anxiety-provoking situations, or addressing any health issues that might be causing discomfort.

4. Important Considerations: If your dog's barking seems excessive or is accompanied by other distress signals (such as destructiv

### GPT Assessments

In [54]:
retrieved_documents = rag.vector_db.search(user_query, top_k=30)
doc_context = rag.answer_generator._prepare_document_context(retrieved_documents)
answer_prompt = rag.answer_generator._create_answer_prompt(
    user_query, None, doc_context, None
)
print(answer_prompt)

# Dog Behavior Expert Response

## Your Mission
You are a caring dog behavior expert helping a dog owner with their question. Provide a comprehensive, evidence-based answer that is both scientifically accurate and practically helpful.

## User's Question
**Original Question:** "Why does my dog bark so much when I leave the house?"
None

## Available Research Evidence
**Paper: Where Do We Stand in the Domestic Dog (Canis familiaris) Positive-Emotion Assessment: A State-of-the-Art Review and Future Directions**
Whining and yelping sounds were recorded when the owner returned after separation, suggesting increased positive arousal and motivation to approach the owner (Rehn et al., 2014). Another study found that early onset of whining after short separation from the owner was the most typical vocal response of dogs diagnosed with a separation-related disorder.[...]
Should I whine or should I bark? Qualitative and quantitative differences between the vocalizations of dogs with and without 

In [56]:
doc_context = rag.query_analyzer._prepare_document_context(retrieved_documents)
analysis_prompt = rag.query_analyzer._create_question_prompt(user_query, doc_context)
print(analysis_prompt)

# Dog Behavior Research Query Analysis

## Your Role
You are an expert in dog behavior research with deep knowledge of academic literature analysis. Your task is to analyze a user's research question and relevant document excerpts to generate precise clarifying questions.

**IMPORTANT: The questions you generate will be presented directly to the user to help refine their search. Make them user-friendly and easy to understand.**

## Input Data
**User Query:** "Why does my dog bark so much when I leave the house?"

**Retrieved Research Context:**
```
**Paper: Where Do We Stand in the Domestic Dog (Canis familiaris) Positive-Emotion Assessment: A State-of-the-Art Review and Future Directions**
Whining and yelping sounds were recorded when the owner returned after separation, suggesting increased positive arousal and motivation to approach the owner (Rehn et al., 2014). Another study found that early onset of whining after short separation from the owner was the most typical vocal response

In [60]:
questions_answers = {
  "How long is your dog typically left alone when the barking occurs?": ["30 minutes to 2 hours"],
  "Has your dog shown other signs besides barking when left alone?": ["Yes, destructive behavior"],
  "Did this barking behavior start after a specific change in routine or household?": ["Yes, after a change"],
  "Is your dog from a breed known for high vocal tendencies?": ["Yes"],
  "Does your dog also bark when you leave the room, not just the house?": ["Sometimes"]
}

In [61]:
improvement_prompt = rag.query_analyzer._create_improvement_prompt(user_query, questions_answers)
print(improvement_prompt)

Improve this research query by incorporating the user's answers to clarifying questions.

Original Query: "Why does my dog bark so much when I leave the house?"

User's Clarifications:
How long is your dog typically left alone when the barking occurs?: 30 minutes to 2 hours
Has your dog shown other signs besides barking when left alone?: Yes, destructive behavior
Did this barking behavior start after a specific change in routine or household?: Yes, after a change
Is your dog from a breed known for high vocal tendencies?: Yes
Does your dog also bark when you leave the room, not just the house?: Sometimes

Create an improved, more specific query that incorporates the user's clarifications while maintaining the original intent. The improved query should be natural and search-friendly.

Respond with only the improved query text, no additional formatting or explanation.


In [62]:
improved_query = "Why does my dog, a vocal breed, bark excessively and show destructive behavior when left alone for 30 minutes to 2 hours, especially since this started after a recent routine change and sometimes happens even when I leave the room?"
retrieved_documents = rag.vector_db.search(improved_query, top_k=30)

In [64]:
doc_context = rag.answer_generator._prepare_document_context(retrieved_documents)
answer_prompt = rag.answer_generator._create_answer_prompt(
    user_query, improved_query, doc_context, answers)
print(answer_prompt)

# Dog Behavior Expert Response

## Your Mission
You are a caring dog behavior expert helping a dog owner with their question. Provide a comprehensive, evidence-based answer that is both scientifically accurate and practically helpful.

## User's Question
**Original Question:** "Why does my dog bark so much when I leave the house?"
**Refined Focus:** "Why does my dog, a vocal breed, bark excessively and show destructive behavior when left alone for 30 minutes to 2 hours, especially since this started after a recent routine change and sometimes happens even when I leave the room?"

**Additional Context from User:**
- Are you focusing on wild or domestic canids?: domestic canids
- What specific behavior are you interested in regarding canid vocalizations?: approach-eliciting
- Are you interested in specific calltypes produced by canids?: no
- Are you interested in comparing dog vocalizations to those of wolves?: no
- Are you interested in studying canid vocalizations in response to specif