# LLM Tutor

A tool that takes a technical question and responds with an explanation. I will build upon it in the future.

In [15]:
# imports
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import ollama
import os
from notion_client import Client as NotionClient

In [16]:
# constants and setup
# Load variables from the .env file, letting them override values already set in the environment
load_dotenv(override=True)

# Model identifiers used by the answer cells
MODEL_GPT, MODEL_LLAMA = 'gpt-4o-mini', 'llama3.2'

# Credentials / identifiers looked up from the environment (None when a variable is unset)
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
NOTION_API_KEY = os.environ.get('NOTION_API_KEY')
NOTION_PAGE_ID = os.environ.get('NOTION_PAGE_ID')

# OpenAI client built with no explicit arguments
# NOTE(review): presumably it picks the API key up from the environment - confirm
CLIENT = OpenAI()


In [19]:
# Sanity-check the credential formats. These are shape checks only: they print a hint and
# never stop the notebook, and a value that passes may still be rejected by the service.

# Notion integration tokens: newer tokens start with 'ntn_', older ones with 'secret_'
NOTION_KEY_PREFIXES = ('ntn_', 'secret_')


def _looks_like_notion_id(value):
    """
    Return True if value is shaped like a Notion ID.

    Accepts 32 hexadecimal digits, either dashed as 8-4-4-4-12 or with the
    dashes omitted. Returns False for None or an empty string.
    """
    if not value:
        return False
    hex_digits = set('0123456789abcdefABCDEF')
    if len(value) == 32:
        # undashed form; any '-' in the value fails the hex-digit test
        return set(value) <= hex_digits
    parts = value.split('-')
    return [len(part) for part in parts] == [8, 4, 4, 4, 12] and \
        all(set(part) <= hex_digits for part in parts)


# check OpenAI API key format
if OPENAI_API_KEY and OPENAI_API_KEY.startswith('sk-proj-') and len(OPENAI_API_KEY)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")

# check Notion API key format
if NOTION_API_KEY and NOTION_API_KEY.startswith(NOTION_KEY_PREFIXES) and len(NOTION_API_KEY)>10:
    print("Notion API key looks good so far")
else:
    print("Notion API key not found or invalid format")

# check Notion Page ID format (8-4-4-4-12 hex characters, dashes optional)
if _looks_like_notion_id(NOTION_PAGE_ID):
    print("Notion Page ID looks good so far")
else:
    print("Invalid Notion Page ID format. It should follow the pattern: 8-4-4-4-12")

API key looks good so far
Notion API key not found or invalid format
Notion Page ID looks good so far


In [None]:
# Set up Notion connection
# Best-effort: when Notion is not configured or the request fails, `page` is left as None
# and a message is printed instead of raising, so the rest of the notebook can still run
# and fall back to the prompt that has no reference material.

# Initialize the client
notion = NotionClient(auth=NOTION_API_KEY)

# Read a page
page = None
if NOTION_API_KEY and NOTION_PAGE_ID:
    try:
        page = notion.pages.retrieve(page_id=NOTION_PAGE_ID)
    except Exception as e:
        print(f"❌ Could not retrieve Notion page: {e}")
else:
    print("⚠️ NOTION_API_KEY or NOTION_PAGE_ID is not set; skipping Notion page retrieval")

In [None]:
# Extract content from Notion page
def extract_notion_content(page):
    """
    Extract text content from Notion page blocks

    Lists the direct child blocks of the page, following the pagination cursor
    until the API reports no more results, and renders the supported block
    types (paragraph, heading_1-3, bulleted/numbered list items, code) as
    Markdown-like text. Other block types and blocks whose text is blank are
    skipped. Nested child blocks are not visited.

    Args:
        page: Notion page object; only its 'id' entry is read.

    Returns:
        str: the rendered blocks joined by blank lines ("" when nothing was rendered).
    """
    # Prefix put in front of the text for each plain rich-text block type
    prefixes = {
        'paragraph': "",
        'heading_1': "# ",
        'heading_2': "## ",
        'heading_3': "### ",
        'bulleted_list_item': "• ",
        'numbered_list_item': "1. ",
    }

    page_id = page['id']
    content_parts = []
    cursor = None

    while True:
        # Only send start_cursor once the API has handed one back
        request = {'block_id': page_id}
        if cursor:
            request['start_cursor'] = cursor
        blocks_response = notion.blocks.children.list(**request)

        for block in blocks_response.get('results', []):
            block_type = block.get('type')
            if block_type != 'code' and block_type not in prefixes:
                continue

            # The block's payload lives under a key named after its type
            payload = block.get(block_type) or {}
            text_content = extract_text_from_rich_text(payload.get('rich_text', []))
            if not text_content.strip():
                continue

            if block_type == 'code':
                language = payload.get('language', 'text')
                content_parts.append(f"```{language}\n{text_content}\n```")
            else:
                content_parts.append(f"{prefixes[block_type]}{text_content}")

        # Results are paginated: keep going while more are reported
        cursor = blocks_response.get('next_cursor')
        if not blocks_response.get('has_more') or not cursor:
            break

    return "\n\n".join(content_parts)

def extract_text_from_rich_text(rich_text_array):
    """
    Concatenate the 'plain_text' values of a Notion rich text array.

    Items without a 'plain_text' key contribute an empty string.
    """
    pieces = []
    for segment in rich_text_array:
        pieces.append(segment.get('plain_text', ''))
    return "".join(pieces)

# Run the extraction; on any error report it and continue with empty reference content
try:
    notion_content = extract_notion_content(page)
    char_count = len(notion_content)
    preview = notion_content[:200]
    print(f"✅ Extracted {char_count} characters from Notion page")
    print(f"Preview: {preview}...")
except Exception as err:
    print(f"❌ Error extracting content: {err}")
    notion_content = ""

In [10]:
# Ask the user for the question to be explained.
# input() waits for the user to type a line; the text is stored in `question`,
# which the prompt-building cell appends to the user prompt.
#
# example question:
# Please explain what this code does and why:
# yield from {book.get("author") for book in books if book.get("author")}
question = input("Please enter your question:")

In [None]:
# Build the chat messages; the system prompt embeds the Notion notes when any were extracted
base_system_prompt = """You are a helpful technical tutor who answers questions about python code, software engineering, data science and LLMs.

You have access to additional reference material that may be relevant to the user's questions. Use this information to provide more comprehensive and accurate answers when applicable.

REFERENCE MATERIAL:
"""

# Guidance appended after the reference material
reference_instructions = """
INSTRUCTIONS:
- Use the reference material above when it's relevant to the user's question
- Always prioritize accuracy and clarity in your explanations
- If the reference material contains relevant information, mention that you're drawing from additional resources
- If the question is outside the scope of the reference material, answer based on your general knowledge
"""

has_reference = bool(notion_content)

if not has_reference:
    # No reference material: plain tutor role only
    system_prompt = "You are a helpful technical tutor who answers questions about python code, software engineering, data science and LLMs"
else:
    # Role + reference material + instructions on how to use it
    system_prompt = f"{base_system_prompt}\n{notion_content}\n\n{reference_instructions}"

user_prompt = f"Please give a detailed explanation to the following question: {question}"

messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]

# Debugging aid: report prompt size and whether Notion content was included
print(f"System prompt length: {len(system_prompt)} characters")
print(f"Contains Notion content: {'Yes' if has_reference else 'No'}")

In [17]:
# Get gpt-4o-mini to answer, with streaming
# The reply is accumulated chunk by chunk and the same display area is re-rendered as
# Markdown after every chunk. Code fences inside the answer are kept so code blocks
# render correctly; only a "```markdown" fence wrapping the whole reply is removed.
stream = CLIENT.chat.completions.create(
    model=MODEL_GPT,
    messages=messages,
    stream=True
)

MARKDOWN_WRAPPER = "```markdown"

raw_response = ""   # exact text received from the model
response = ""       # text as displayed (wrapper fence removed, if any)
display_handle = display(Markdown(""), display_id=True)
for chunk in stream:
    # A chunk may carry no choices; there is nothing to append in that case
    if not chunk.choices:
        continue
    raw_response += chunk.choices[0].delta.content or ''

    response = raw_response
    if response.startswith(MARKDOWN_WRAPPER):
        # The whole reply is wrapped in a markdown fence: drop the opening fence and,
        # once it has arrived, the matching closing fence
        response = response[len(MARKDOWN_WRAPPER):]
        if response.rstrip().endswith("```"):
            response = response.rstrip()[:-3]

    update_display(Markdown(response), display_id=display_handle.display_id)

The line of code you've provided uses a Python feature called "generator expressions" combined with the `yield from` statement and a set comprehension. Let's break down the code step-by-step:

### Code Explanation

python
yield from {book.get("author") for book in books if book.get("author")}


1. **Context of `yield` and `yield from`:**
   - The keyword `yield` in Python is used within a function to make that function a generator. When a generator function is called, it does not execute its body immediately; instead, it returns a generator object that can be iterated over.
   - `yield from <iterator>` is used to delegate part of the generator’s operations to another generator or iterable. This means that the items from the iterable (or another generator) are yielded to the caller in a simple manner.

2. **Set Comprehension:**
   - The expression `{book.get("author") for book in books if book.get("author")}` is a **set comprehension**. This creates a new set, which is a collection of unique elements.
   - In this case, `books` is presumed to be an iterable (like a list or a set) containing `book` dictionaries.
   - For each `book` in `books`, `book.get("author")` is called. This method retrieves the value associated with the key `"author"` in the dictionary. If the key does not exist, it returns `None`.
   - The condition `if book.get("author")` filters the books to include only those where the author exists (i.e., not `None` or an empty string).

3. **Overall Functionality:**
   - This line of code creates a set of unique authors from the `books` collection, where the `author` key is present and non-null.
   - The `yield from` part then yields each unique author from this set to the caller of the generator function. 

### Why Use This Code:

1. **Efficiency of Sets:**
   - By using a set comprehension (`{}`), this code ensures that only unique authors are yielded. The nature of sets in Python automatically handles duplicate values, making it an efficient way to filter unique results.

2. **Generator Functionality:**
   - Using `yield from` allows the function to produce values one at a time in a memory-efficient manner. Instead of creating a list of unique authors and returning it all at once, which would consume memory proportional to the size of the list, this approach yields authors one by one as they are requested.

3. **Readable and Concise:**
   - The use of comprehensions (`set` and list comprehensions) makes the code more concise and readable, allowing us to express the logic of extracting authors succinctly.

### Example Scenario

Let’s say you have a list of book dictionaries like below:

python
books = [
    {"title": "Book A", "author": "Author 1"},
    {"title": "Book B", "author": "Author 2"},
    {"title": "Book C"},
    {"title": "Book D", "author": "Author 1"},
]


Using the provided code, the yield would result in an iteration of authors like so:

- "Author 1" (from the first and the fourth book)
- "Author 2" (from the second book)

The author `"Book C"` would not be included since it does not have an author specified.

### Conclusion

This line of code is a concise and efficient way to extract and yield unique authors from a collection of books while utilizing Python's features to maintain memory efficiency and code readability. It is an excellent demonstration of set comprehensions and generator functionality in Python.

In [13]:
# Get Llama 3.2 to answer
# Sends the same `messages` list to the model named by MODEL_LLAMA through ollama.chat,
# reads the reply text from response['message']['content'] and renders it as Markdown.
# NOTE: this rebinds `response`, which the GPT streaming cell used for its answer text.
response = ollama.chat(model=MODEL_LLAMA, messages=messages)
reply = response['message']['content']
display(Markdown(reply))

**Code Explanation**

The given code snippet is written in Python and utilizes several advanced concepts, including generators, iterators, and dictionary comprehensions. Let's break it down:

```python
yield from {book.get("author") for book in books if book.get("author")}
```

Here's a step-by-step explanation:

1. `yield from`: This keyword is used to delegate the execution of the enclosing generator function to the generators contained within it.

2. `{... for ...}`: This is a dictionary comprehension, which creates a new dictionary containing key-value pairs generated by the code inside the comprehension.

3. `book.get("author")`: For each book in the `books` collection, this expression attempts to retrieve the "author" value associated with that book.

4. `for book in books if book.get("author")`: This is a conditional iterator, which only includes books in the iteration where the "author" key exists within the book dictionary.

5. The entire comprehension is wrapped in `yield from`, which means it will yield each value (i.e., the author names) one at a time as the generator is iterated over.

**Why It's Written This Way**

This code snippet is likely used to extract and yield all unique author names from a collection of books, while filtering out any books without an "author" key. 

Using `yield from` has several advantages:

- **Lazy Evaluation**: The expression is only evaluated when its value is actually needed. Since it's wrapped in a generator function, the entire comprehension can be suspended and resumed at will.

- **Efficiency**: Without `yield from`, the entire list of author names would need to be computed all at once. This approach reduces memory usage by processing only the necessary books and delaying computation until each value is required.

Here's an equivalent but more verbose version using a for loop:

```python
def get_authors(books):
    authors = set()
    for book in books:
        if "author" in book:
            authors.add(book["author"])
    yield from authors
```

This can be seen as less efficient due to the need for a set data structure and explicit loops, but it conveys the same intent. The original code snippet is generally more concise and Pythonic because of its use of `yield from` and dictionary comprehension.