In [1]:
import os
from sentence_transformers import SentenceTransformer, util
import torch
import json
import re
import random 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def chunk_lines(lines, window_size=5):
    """Group consecutive lines into overlapping sliding-window chunks.

    A window of ``window_size`` lines is slid over ``lines`` one line at a
    time, and each window is joined with single spaces to form one chunk.
    A larger window gives each chunk more surrounding context.

    Args:
        lines: Sequence of text lines (strings).
        window_size: Number of lines per chunk. Must be at least 1.

    Returns:
        A ``(chunks, indices)`` tuple. ``chunks[i]`` is the joined text of
        the i-th window and ``indices[i]`` is its ``(start, end)`` slice
        bounds into ``lines`` (``end`` is exclusive).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    total = len(lines)
    if total == 0:
        return [], []

    # A text shorter than one window would otherwise produce no chunks at all
    # and silently disappear from the index; keep it as one short chunk.
    width = min(window_size, total)

    indices = [(start, start + width) for start in range(total - width + 1)]
    chunks = [" ".join(lines[start:end]) for start, end in indices]
    return chunks, indices

In [3]:
# Read text files and prepare chunks
def read_text_files_from_directory(directory_path, chunk_size=5):
    """Read every ``.txt`` file in a directory and split it into line chunks.

    Each file is treated as one chapter named after the file (extension
    removed). Blank lines are dropped and the remaining lines are stripped
    of surrounding whitespace before chunking.

    Args:
        directory_path: Directory containing the chapter ``.txt`` files.
        chunk_size: Number of lines per chunk, forwarded to ``chunk_lines``
            as ``window_size``.

    Returns:
        A ``(chapters, original_lines)`` tuple. ``chapters`` is a list of
        ``(chapter_name, line_chunks, chunk_indices)`` tuples and
        ``original_lines`` maps each chapter name to its cleaned lines, so
        the original text of a chunk can be looked up later.
    """
    chapters = []
    original_lines = {}  # for retrieving original text

    # os.listdir returns entries in arbitrary order; sorting keeps the chapter
    # order (and therefore tie-breaking between equal scores) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        filepath = os.path.join(directory_path, filename)
        # Skip non-text entries, including directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(filepath):
            continue

        chapter_name = os.path.splitext(filename)[0]

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this retry cannot
            # fail on decoding (non-UTF-8 text may still come out mangled).
            with open(filepath, 'r', encoding='latin-1') as f:
                lines = f.readlines()

        stripped_lines = (line.strip() for line in lines)
        clean_lines = [line for line in stripped_lines if line]

        original_lines[chapter_name] = clean_lines
        line_chunks, chunk_indices = chunk_lines(clean_lines, window_size=chunk_size)
        chapters.append((chapter_name, line_chunks, chunk_indices))

    return chapters, original_lines

In [4]:
# Encode all chapter chunks
def encode_chapters(model, chapters):
    """Compute an embedding for every chunk of every chapter.

    Args:
        model: Object exposing ``encode(texts, convert_to_tensor=True)``,
            e.g. a ``SentenceTransformer``.
        chapters: Iterable of ``(chapter_name, chunks, indices)`` tuples.

    Returns:
        A list of ``(chapter_name, chunks, indices, embeddings)`` tuples, one
        per chapter that has at least one chunk.
    """
    encoded_chapters = []
    for chapter_name, chunks, indices in chapters:
        # A chapter without chunks has nothing to search; encoding an empty
        # list would only leave a row-less embedding to trip up scoring later.
        if not chunks:
            continue
        embeddings = model.encode(chunks, convert_to_tensor=True)
        encoded_chapters.append((chapter_name, chunks, indices, embeddings))
    return encoded_chapters

In [5]:
# Find best matches (expanded to return top k matches)
def find_best_matches(query, encoded_chapters, original_lines, model, top_k=3):
    """Return the ``top_k`` chunks most similar to ``query`` across all chapters.

    Chunks are ranked by cosine similarity between the query embedding and
    the precomputed chunk embeddings. For each returned chunk, the single
    line inside it that is most similar to the query is also identified.

    Args:
        query: The user's question.
        encoded_chapters: Iterable of ``(chapter_name, chunks, indices,
            embeddings)`` tuples as produced by ``encode_chapters``.
        original_lines: Mapping from chapter name to its list of lines;
            ``indices`` entries are slice bounds into these lists.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
        top_k: Maximum number of matches to return.

    Returns:
        A list of dicts, best match first, each with the keys ``'chapter'``,
        ``'context'`` (the chunk's lines), ``'best_line'``,
        ``'best_line_idx'`` (position of the best line within ``'context'``)
        and ``'score'`` (chunk-level cosine similarity).
    """
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Pass 1: rank every chunk using only the embeddings computed up front.
    scored_chunks = []
    for chapter_name, chunks, indices, embeddings in encoded_chapters:
        if not chunks:
            continue  # nothing to score for this chapter
        cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0].tolist()
        for (start, end), score in zip(indices, cosine_scores):
            scored_chunks.append((score, chapter_name, start, end))

    # sorted() is stable, so equally scored chunks keep their encounter order.
    best_chunks = sorted(scored_chunks, key=lambda item: item[0], reverse=True)[:top_k]

    # Pass 2: the line-level search needs a fresh model.encode call per chunk,
    # so run it only for the chunks that are actually returned instead of for
    # every chunk in the corpus.
    top_matches = []
    for score, chapter_name, start, end in best_chunks:
        lines_in_chunk = original_lines[chapter_name][start:end]

        line_embeddings = model.encode(lines_in_chunk, convert_to_tensor=True)
        line_scores = util.pytorch_cos_sim(query_embedding, line_embeddings)[0]
        best_line_idx = torch.argmax(line_scores).item()

        top_matches.append({
            'chapter': chapter_name,
            'context': lines_in_chunk,
            'best_line': lines_in_chunk[best_line_idx],
            'best_line_idx': best_line_idx,
            'score': score
        })

    return top_matches

In [6]:
# Load or create response templates
def _default_response_templates():
    """Return a fresh copy of the built-in response templates."""
    return {
        "freeform": {
            "greeting": [
                "Hello! I'm your Harry Potter Storybook AI. What would you like to know about the first four books?",
                "Welcome to the wizarding world! Ask me anything about Harry Potter's adventures in the first four books!"
            ],
            "character": [
                "In the Harry Potter series, {character} is described as {context}",
                "From what I recall about {character}, {context}",
                "The books portray {character} as {context}"
            ],
            "event": [
                "That event unfolds like this: {context}",
                "Here's what happened: {context}",
                "In the story, this is how it occurred: {context}"
            ],
            "place": [
                "{place} is described in the books as {context}",
                "The books describe {place} as {context}",
                "From the Harry Potter series, {place} is {context}"
            ],
            "general": [
                "Based on the books, {context}",
                "According to the Harry Potter series, {context}",
                "From what I recall from the books, {context}"
            ],
            "not_found": [
                "I don't seem to recall that from the first four Harry Potter books. Could you rephrase your question?",
                "That information might be in later books, or I might not have that detail in my memory. Could you ask something else?",
                "I'm not finding that in my knowledge of the first four Harry Potter books. Would you like to ask about something else?"
            ]
        },
        "structured": {
            "header": "📚 HARRY POTTER BOOK ANALYSIS 📚\n\nQuery: \"{query}\"\n",
            "match": "📖 Match {index}: {chapter}\nRelevance: {score}%\n\nContext:\n{context}\n",
            "analysis": "🔍 ANALYSIS:\n{analysis}\n",
            "not_found": "No relevant information found in the first four Harry Potter books."
        }
    }


def load_response_templates():
    """Load the response templates, creating the template file on first use.

    Templates are read from ``response_templates.json`` in the current
    working directory. If the file does not exist, the built-in defaults are
    written there so they can be customised later.

    Returns:
        A dict with a ``"freeform"`` section (lists of template strings per
        category) and a ``"structured"`` section (one template string per
        part of the report).

    Warns:
        UserWarning: If an existing template file cannot be parsed, or if the
            defaults cannot be saved. The built-in defaults are returned in
            both cases so the chatbot stays usable.
    """
    import warnings  # local import: only needed on the failure paths below

    templates_file = "response_templates.json"

    if os.path.exists(templates_file):
        try:
            # Explicit encoding so the result does not depend on the platform default.
            with open(templates_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except ValueError as exc:
            # Covers json.JSONDecodeError and UnicodeDecodeError. The broken
            # file is left in place so it can be repaired by hand.
            warnings.warn(
                f"Could not parse {templates_file} ({exc}); using built-in templates."
            )
            return _default_response_templates()

    templates = _default_response_templates()

    # Save templates for future use. Persisting is best effort: a read-only
    # working directory must not prevent the defaults from being used.
    try:
        with open(templates_file, 'w', encoding='utf-8') as f:
            json.dump(templates, f, indent=2)
    except OSError as exc:
        warnings.warn(f"Could not save {templates_file} ({exc}).")

    return templates

In [7]:
# Generate analysis for structured response
def generate_analysis(query, matches):
    """Pick the canned analysis paragraph that fits the kind of question asked.

    The lower-cased query is tested against keyword patterns in a fixed
    order (character, then event, then place); the first pattern that hits
    decides the paragraph. A generic paragraph is used when none hit.

    Args:
        query: The user's question.
        matches: Accepted for interface compatibility; not consulted.

    Returns:
        The analysis text as a string.
    """
    lowered = query.lower()

    # (keyword pattern, analysis text) pairs, checked top to bottom.
    keyword_rules = (
        (
            r'\b(who|character)\b',
            "Based on these passages, we can understand this character's role and personality in the story. The text reveals their motivations and relationship to other characters in the Harry Potter universe.",
        ),
        (
            r'\b(what happened|event|scene|when)\b',
            "The sequence of events described here shows how this moment fits into the larger narrative of the Harry Potter series. It demonstrates cause and effect within the story's progression.",
        ),
        (
            r'\b(where|location|place)\b',
            "The description of this location helps establish the setting and atmosphere of the wizarding world. J.K. Rowling's attention to spatial detail contributes to the immersive quality of the Harry Potter universe.",
        ),
    )

    for pattern, analysis in keyword_rules:
        if re.search(pattern, lowered):
            return analysis

    return "These passages from the Harry Potter books provide insight into the themes, characters, and plot elements that make the series so compelling. The text demonstrates how each detail contributes to the rich tapestry of the story."


In [8]:
# Format structured response
def format_structured_response(query, matches, templates):
    """Render the matches as a structured, report-style answer.

    The report consists of a header quoting the query, one section per match
    (with the best-matching line wrapped in ``>>> ... <<<``), and a closing
    analysis paragraph from ``generate_analysis``.

    Args:
        query: The user's question.
        matches: List of match dicts with the keys ``'chapter'``,
            ``'context'``, ``'best_line_idx'`` and ``'score'``.
        templates: Template dict; only its ``"structured"`` section is used.

    Returns:
        The complete report as a single string.
    """
    structured = templates["structured"]
    parts = [structured["header"].format(query=query)]

    for position, match in enumerate(matches, start=1):
        # Mark the line that scored best inside this chunk.
        rendered_lines = []
        for line_no, line in enumerate(match['context']):
            if line_no == match['best_line_idx']:
                rendered_lines.append(">>> " + line + " <<<")
            else:
                rendered_lines.append(line)

        parts.append(structured["match"].format(
            index=position,
            chapter=match['chapter'],
            score=f"{match['score']*100:.1f}",
            context="\n".join(rendered_lines),
        ))

    # Closing analysis section.
    analysis_text = generate_analysis(query, matches)
    parts.append(structured["analysis"].format(analysis=analysis_text))

    return "".join(parts)

In [None]:
# Summarize context for better responses
def summarize_context(context_lines):
    """Flatten context lines into one whitespace-normalised string.

    The lines are joined with spaces, every run of whitespace is collapsed
    to a single space, and leading/trailing whitespace is removed.

    Args:
        context_lines: Iterable of strings.

    Returns:
        The cleaned-up text.
    """
    joined = " ".join(context_lines)
    return re.sub(r'\s+', ' ', joined).strip()


In [10]:
# Format freeform response
def _extract_subject(pattern, query_lower, fallback):
    """Return the stripped third capture group of ``pattern``, or ``fallback``."""
    found = re.search(pattern, query_lower)
    subject = found.group(3).strip() if found else ""
    # The group can match whitespace only (e.g. when punctuation follows the
    # verb), which would leave a hole in the sentence; use the fallback then.
    return subject or fallback


def format_freeform_response(query, matches, templates):
    """Render the best match as a conversational, single-sentence answer.

    The query is classified by keyword as a character, event, place or
    general question; a template for that category is chosen at random and
    filled with the context of the top match.

    Args:
        query: The user's question.
        matches: List of match dicts, best first; only ``matches[0]['context']``
            is used.
        templates: Template dict; only its ``"freeform"`` section is used.

    Returns:
        The formatted answer, or a randomly chosen ``"not_found"`` message
        when ``matches`` is empty.
    """
    freeform = templates["freeform"]

    # Guard against direct calls with no results instead of failing on matches[0].
    if not matches:
        return random.choice(freeform["not_found"])

    query_lower = query.lower()
    context = summarize_context(matches[0]['context'])

    # Determine query category
    if re.search(r'\b(who|character|person)\b', query_lower):
        # Try to extract character name
        character = _extract_subject(
            r'\b(who|what|how|tell me about)\s+(is|was|are|were)?\s*([a-z\s]+)',
            query_lower,
            "this character",
        )
        template = random.choice(freeform["character"])
        return template.format(character=character, context=context)

    if re.search(r'\b(what happened|event|scene|when)\b', query_lower):
        template = random.choice(freeform["event"])
        return template.format(context=context)

    if re.search(r'\b(where|location|place|room|castle|house)\b', query_lower):
        place = _extract_subject(
            r'\b(where|what|tell me about)\s+(is|was|are|were)?\s*([a-z\s]+)',
            query_lower,
            "this place",
        )
        template = random.choice(freeform["place"])
        return template.format(place=place, context=context)

    # General response
    template = random.choice(freeform["general"])
    return template.format(context=context)

In [11]:
# Format responses based on mode
def format_response(query, matches, response_mode, templates):
    """Dispatch to the formatter for the requested response mode.

    Args:
        query: The user's question.
        matches: List of match dicts; may be empty.
        response_mode: ``"freeform"`` selects the conversational format; any
            other value selects the structured format.
        templates: Template dict with ``"freeform"`` and ``"structured"``
            sections.

    Returns:
        The formatted answer, or the mode's "not found" message when there
        are no matches.
    """
    wants_freeform = response_mode == "freeform"

    if matches:
        formatter = format_freeform_response if wants_freeform else format_structured_response
        return formatter(query, matches, templates)

    # Nothing matched: answer with the mode-specific "not found" text.
    if wants_freeform:
        return random.choice(templates["freeform"]["not_found"])
    return templates["structured"]["not_found"]

In [None]:
# Main chatbot class
class StorybookAI:
    """Question-answering chatbot over a pre-encoded set of book chapters.

    The search index (embedding model, encoded chapters and original lines)
    can be injected through the constructor. When nothing is injected, the
    notebook-level ``model``, ``encoded_chapters`` and ``original_lines``
    variables are reused if all three exist; otherwise the index is built
    from ``chapters_dir``.
    """

    def __init__(self, model=None, encoded_chapters=None, original_lines=None,
                 chapters_dir="chapters", model_name="all-MiniLM-L6-v2"):
        """Set up templates and the search index.

        Args:
            model: Embedding model exposing ``encode``; optional.
            encoded_chapters: Output of ``encode_chapters``; optional.
            original_lines: Mapping of chapter name to its lines; optional.
            chapters_dir: Directory to index when no index is available.
            model_name: Model to load when no model is available.
        """
        self.response_mode = "freeform"  # Default mode
        self.templates = load_response_templates()

        # answer() needs these three attributes; previously they were never
        # assigned, so every question failed with AttributeError.
        if model is None and encoded_chapters is None and original_lines is None:
            # Reuse an index already built at notebook level, but only as a
            # complete set so model and embeddings are guaranteed to match.
            shared = globals()
            names = ("model", "encoded_chapters", "original_lines")
            if all(shared.get(name) is not None for name in names):
                model = shared["model"]
                encoded_chapters = shared["encoded_chapters"]
                original_lines = shared["original_lines"]

        if model is None:
            model = SentenceTransformer(model_name)

        if encoded_chapters is None or original_lines is None:
            # Build both from the same read so chunk indices and lines agree.
            chapters, original_lines = read_text_files_from_directory(chapters_dir)
            encoded_chapters = encode_chapters(model, chapters)

        self.model = model
        self.encoded_chapters = encoded_chapters
        self.original_lines = original_lines

    def set_response_mode(self, mode):
        """Switch between ``'freeform'`` and ``'structured'`` output.

        Args:
            mode: Requested mode; case and surrounding whitespace are ignored.

        Returns:
            A confirmation message, or an error message if ``mode`` is not a
            known mode (the current mode is then left unchanged).
        """
        normalized = mode.strip().lower()
        if normalized in ("freeform", "structured"):
            self.response_mode = normalized
            return f"Response mode set to: {self.response_mode}"
        return "Invalid mode. Please choose 'freeform' or 'structured'."

    def answer(self, query):
        """Find the passages best matching ``query`` and format them.

        Args:
            query: The user's question.

        Returns:
            The answer text in the current response mode.
        """
        matches = find_best_matches(query, self.encoded_chapters, self.original_lines, self.model)
        return format_response(query, matches, self.response_mode, self.templates)

In [16]:
# Build the search index once at notebook level so later cells can reuse it:
# read and chunk the chapter files, load the embedding model, then embed
# every chunk. The embedding step is the expensive part of the notebook.
chapters, original_lines = read_text_files_from_directory(directory_path="chapters")
model = SentenceTransformer('all-MiniLM-L6-v2')
encoded_chapters = encode_chapters(model=model, chapters=chapters)

In [21]:
# Main function
def main():
    """Run the interactive question/answer loop on the console.

    Commands:
        ``exit``: leave the loop.
        ``mode freeform`` / ``mode structured``: change the response format.

    Anything else is treated as a question for the chatbot. Blank input is
    ignored, and end-of-input or an interrupt ends the session cleanly.
    """
    print("Initializing Storybook AI for Harry Potter...")
    chatbot = StorybookAI()

    print("\n🧙‍♂️ Welcome to Storybook AI: Harry Potter Edition! 🧙‍♂️")
    print("Ask me anything about the first four Harry Potter books.")
    print("Type 'exit' to quit or 'mode freeform/structured' to change response format.")

    farewell = "Thank you for using Storybook AI. Mischief managed!"

    while True:
        try:
            user_input = input("\nWhat would you like to know? \n").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the kernel was interrupted: leave quietly
            # instead of surfacing a traceback.
            print(farewell)
            break

        # Blank input is not a question; do not send it to the model.
        if not user_input:
            continue

        command = user_input.lower()

        if command == 'exit':
            print(farewell)
            break

        # A bare "mode" is still a command; it gets the invalid-mode hint.
        if command == 'mode' or command.startswith('mode '):
            mode = user_input[len('mode'):].strip()
            print(chatbot.set_response_mode(mode))
            continue

        response = chatbot.answer(user_input)
        print("\n" + response)

# Start the interactive session only when this code is the entry point.
if __name__ == "__main__":
    main()

Initializing Storybook AI for Harry Potter...

🧙‍♂️ Welcome to Storybook AI: Harry Potter Edition! 🧙‍♂️
Ask me anything about the first four Harry Potter books.
Type 'exit' to quit or 'mode freeform/structured' to change response format.

The books portray wand as his head. As he measured, he said, "Every Ollivander wand has a core of a powerful magical substance, Mr. Potter. We use unicorn hairs, phoenix tail feathers, and the heartstrings of dragons. No two Ollivander wands are the same, just as no two unicorns, dragons, or phoenixes are quite the same. And of course, you will never get such good results with
Thank you for using Storybook AI. Mischief managed!
