
# Database Testing and Tutorial Self-Walkthrough

YT Video Start: https://youtu.be/N1FM-PcVXNA?si=LTwvfIozUsS6clqF&t=185

In [12]:
!pip install -U langchain langgraph cassio langchain_community tiktoken langchain-groq langchainhub chromadb langgraph langchain_huggingface arxiv wikipedia langchain

Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting beautifulsoup4 (from wikipedia)
  Using cached beautifulsoup4-4.13.3-py3-none-any.whl.metadata (3.8 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting soupsieve>1.2 (from beautifulsoup4->wikipedia)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-

### Establishing a token with connections

Note that I am just saving my tokens in a .json file so it can be `.gitignore`'d.

You can get your token and make a database here: https://astra.datastax.com/

In [None]:
import cassio
import json

# Save tokens to file
# tokens = {
#   "ASTRADB_TOKEN": "xyz",
#   "TEST_DB_ID": "abc"
# }
# with open("tokens.json", "w") as f:
#   json.dump(tokens, f)

# Retrieve the tokens from file
with open("tokens.json", "r") as f:
  tokens = json.load(f)

# Pull out the two values needed for the Astra DB connection
ASTRADB_TOKEN = tokens["ASTRADB_TOKEN"]
TEST_DB_ID = tokens["TEST_DB_ID"]

# Initialize with the connection
cassio.init(token=ASTRADB_TOKEN, database_id=TEST_DB_ID)


## Building Index

The following loads the pages listed in `food_urls.json` and splits them into chunks. The result is a list of split documents stored in the variable `docs_split`.

In [39]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
import langchain_core

# Documents to add to index (SHOULD ONLY RUN ONCE)
# food_urls.json is read as JSON; its content is passed on as the list of page URLs to scrape.
with open("food_urls.json", "r") as f:
  urls = json.load(f)

# Echo the loaded URLs so the reader can see which pages will be fetched
print(urls)

def chunkify_text_from_urls(url_list: list[str], chunk_size:int = 500) -> list[langchain_core.documents.base.Document]:
  """
  Load every URL as web documents and split them into chunks.

  Args:
    url_list: URLs to load, one WebBaseLoader per URL
    chunk_size: Chunk size handed to the tiktoken-based splitter (chunks do not overlap)

  Returns:
    A flat list with the split documents of all URLs, in the order of url_list
  """
  # Each loader returns a list of documents; gather them all into one flat list
  pages = []
  for url in url_list:
    pages.extend(WebBaseLoader(url).load())

  # Split the loaded pages into chunks with no overlap between neighbours
  splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=0)
  return splitter.split_documents(pages)


# Fetch and chunk every URL loaded above; chunk_size is passed explicitly (same as the function default)
docs_split = chunkify_text_from_urls(urls, chunk_size=500) 
docs_split # Output the list of split documents


['https://www.thekitchensnob.com/chocolate-lasagna/', 'https://www.modernhoney.com/fettuccine-alfredo/', 'https://momsdish.com/simple-yakisoba-noodles-recipe', 'https://www.onceuponachef.com/recipes/how-to-cook-steak-on-the-stovetop.html', 'https://www.taste.com.au/recipes/easy-fried-rice-3/5c89aa24-1de3-4ddc-9362-e68dfc489dc1?r=quickeasy/biccuul7']


[Document(metadata={'source': 'https://www.thekitchensnob.com/chocolate-lasagna/', 'title': 'Chocolate Lasagna ~ The Kitchen Snob', 'description': 'This chocolate lasagna is a no-baked dessert layered with an oreo crust, cream cheese, chocolate pudding and whipped cream. One of my most popular recipes!', 'language': 'en-US'}, page_content='Chocolate Lasagna ~ The Kitchen Snob\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n \nCloseMobile MenuSearch this website\nHome\nRecipes\n\nAppetizers\nBreakfast\nMain Dishes\nSide Dishes\nVegetarian\nDesserts\n\n\nTips & Tricks\nSugar Free\nLife\nAbout\n\n Menu Skip to right header navigation Skip to primary navigation Skip to main content Skip to primary sidebarMeet The Kitchen Snob\nHome\nRecipes\n\nAppetizers\nBreakfast\nMain Dishes\nSide Dishes\nVegetarian\nDesserts\n\n\nTips & Tricks\nSugar Free\nLife\nAbout\nHome » Recipes » Desserts 

## Converting the split documents into vectors

The following converts the split documents into vectors using an embedding model. An embedding model is a machine-learning model that maps each chunk of text to a list of numbers (a vector), so that chunks with similar meaning end up with similar vectors.

In layman's terms: the documents need to be converted into numeric data that a computer can compare and search. We read text as humans; the embedding model re-interprets the scraped website text (as seen above) into a form a computer can handle more easily.

To get a token from HuggingFace: https://huggingface.co/settings/tokens

In [44]:
from langchain_huggingface import HuggingFaceEmbeddings
import os

with open("tokens.json", "r") as f:
  tokens = json.load(f)

# We must explicitly define the token into the environment variable.
# huggingface_hub looks for the access token in HF_TOKEN.
os.environ["HF_TOKEN"] = tokens["HUGGING_FACE_READ_TOKEN"]
# Also keep the variable name this notebook set originally, in case anything else reads it.
os.environ["HUGGINGFACE_HUB_TOKEN"] = tokens["HUGGING_FACE_READ_TOKEN"]

# Use an open source embedding model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [49]:
from langchain.vectorstores.cassandra import Cassandra

# Connect to the database with the designated embeddings
# NOTE(review): session=None / keyspace=None presumably fall back to the connection set up by cassio.init() - confirm
astra_vector_store = Cassandra(
  embedding=embeddings,
  table_name="food_demo",
  session=None,
  keyspace=None,
)

In [50]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Adding documents to the vector store
# NOTE(review): re-running this cell presumably inserts the same chunks again - confirm before re-executing
astra_vector_store.add_documents(docs_split)
# docs_split holds the chunked web pages, so report them as document chunks
print(f"Inserted {len(docs_split)} document chunks.")

# Wrap the store in the index helper for later use
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 51 headlines.


## Retrieving data

This lets you retrieve data from the vector store (this could be done from a separate program that reads the vectorized database).

In [51]:
# Expose the vector store as a retriever and try it out with a sample question.
# NOTE: the name `retreiver` (sic) is referenced by the `retrieve` graph node further down, so it is kept as-is.
retreiver = astra_vector_store.as_retriever()
retreiver.invoke("What is fettucini Alfredo?")

[Document(id='ed50d4b1f6d34e26a24fd93ed92c4fb9', metadata={'description': 'The best fettuccine alfredo. Fettuccine pasta with a homemade decadent alfredo sauce made with 5 simple ingredients. Quick and easy fettuccine alfredo.', 'language': 'en-US', 'source': 'https://www.modernhoney.com/fettuccine-alfredo/', 'title': 'Fettuccine Alfredo – Modern Honey'}, page_content='Fettuccine Alfredo – Modern Honey\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \nSkip to contentSign up for free daily recipes! →Recipes\n\nDinner\nDessert\nSide Dishes\nAppetizers\nBreakfast\nAll Categories\n\n\nTop 20\nDessert\nSubscribe\nAbout\nSearch\n\nHome  Pasta  Fettuccine AlfredoFettuccine AlfredoPosted by Melissa Stadleron  Aug 12, 2018286 CommentsJump to RecipeRate RecipeThis post may contain affiliate links. Please read our disclosure policy.\n\nFettuccine Alfredo\nFettuccine pasta topped with a homemade rich decadent alfre

## Langgraph Application

Groq provides API access for chatting with a large number of LLMs, such as Llama.

To use groq's API, create an account -> Start Building -> API Keys
And create an API Key: https://console.groq.com/keys

In [None]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Structured-output schema for the router LLM: the answer must be exactly one of the two
# data sources below. The class only describes the shape of the answer; the actual choice
# between vectorstore and wiki search is made by the LLM.
class RouteQuery(BaseModel):
  """Route a user query to the most relevant data source."""

  # "vectorstore" and "wiki_search" are the only accepted values
  datasource: Literal["vectorstore", "wiki_search"] = Field(
    ...,
    description="Given a user question, choose to route it to wikipedia or a vectorstore."
  )

In [56]:
from langchain_groq import ChatGroq
import os

# Accessing groq
with open("tokens.json", "r") as f:
  tokens = json.load(f)

GROQ_API_KEY = tokens["GROQ_API_KEY"]

# List of models: https://console.groq.com/docs/models
llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="llama-3.3-70b-versatile")

# Add structured output RouteQuery, so the LLM answers in the RouteQuery schema
structured_llm_router = llm.with_structured_output(RouteQuery)


In [57]:
# Prompt
# The route names mentioned here must match the values accepted by RouteQuery
# ("vectorstore" / "wiki_search"), because the LLM's answer is returned in that schema.
system = """You are an expert at routing a user question to a vectorstore or wikipedia.
The vectorstore contains documents related to food and only retrieval of their recipes.
Use the vectorstore for questions on this one topic. Otherwise, use wiki_search."""

route_prompt = ChatPromptTemplate.from_messages(
  [
    ("system", system),
    ("human", "{question}")
  ]
)

# Pipe the filled-in prompt into the structured-output LLM
question_router = route_prompt | structured_llm_router

## Testing query with a prompt

In [58]:
# Ask the router where a food question should go and print its decision
print(question_router.invoke({"question": "How do I make fettucini?"}))

datasource='vectorstore'


## Wikipedia search capabilities

In [72]:
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.tools import WikipediaQueryRun

# top_k_results returns the top n results
# doc_content_chars_max caps the number of characters returned. The wrapper applies it as a
# slice bound, so -1 would not disable the cap - it would cut off the last character.
# Use a bound large enough to never truncate a summary instead.
api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=1_000_000)
wiki = WikipediaQueryRun(api_wrapper=api_wrapper)

In [73]:
# Try the Wikipedia tool on its own with a sample question
wiki.run("How do I make fettucini?")

'Page: Fettuccine Alfredo\nSummary: Fettuccine Alfredo (Italian: [fettut\'tʃiːne alˈfreːdo]) is a pasta dish consisting of fettuccine tossed with butter and Parmesan cheese, which melt and emulsify to form a rich cheese sauce coating the pasta. Originating in Rome in the early 20th century, the recipe is now popular in the United States and other countries. Outside of Italy, cream is sometimes used to thicken the sauce, and ingredients such as chicken, shrimp, salmon or broccoli may also be added when it is served as a main course.\nThe dish is named after Alfredo Di Lelio, a Roman restaurateur who is credited with its invention and popularisation. Di Lelio\'s tableside service was an integral part of the recipe\'s success. Fettuccine Alfredo is a variant of standard Italian fettuccine al burro (\'fettuccine with butter\') or pasta burro e parmigiano (\'pasta with butter and Parmesan cheese\'). It is a kind of pasta in bianco, that is, without added sauce. Italian recipes do not includ

## AiGents Application Using Langgraph

In [76]:
from typing import List
from typing_extensions import TypedDict

class GraphState(TypedDict):
  """
  State dictionary that is passed between the nodes of the graph.

  Keys:
    question: the user question
    generation: the LLM generation
    documents: the list of documents collected for the question
  """

  # The user question
  question: str
  # Text generated by the LLM
  generation: str
  # NOTE(review): the nodes in this notebook store Document objects here rather than plain strings - confirm the annotation
  documents: List[str]

In [80]:
from langchain.schema import Document

def retrieve(state):
  """
  Retrieve documents

  Args:
    state (dict): The current graph state; only "question" is read

  Returns:
    state (dict): State update with "documents" (the retrieved documents) and the
      unchanged "question"
  """

  print("---Retrievee---")
  question=state["question"]

  # Retrieval
  documents=retreiver.invoke(question)
  # The key must be "question" (singular) to match the GraphState schema
  return {"documents":documents, "question":question}

In [84]:
def wiki_search(state):
  """
  Wiki search based on the question

  Args:
    state (dict): The current graph state; only "question" is read

  Returns:
    state (dict): State update with "documents" (a one-element list holding the
      Wikipedia result as a Document) and the unchanged "question"
  """
  print("---Wikipedia---")
  question=state["question"]
  print(question)

  # Wiki search: the tool returns the page text as a string
  wiki_text = wiki.invoke({"query": question})
  wiki_results = Document(page_content=wiki_text)

  # GraphState declares "documents" as a list, so wrap the single result in a list
  return {"documents": [wiki_results], "question": question}



## Routing the functions to the LangGraph `question_router` from earlier

In [85]:
def route_question(state):
  """
  Routes the question to wiki search or RAG.

  Args:
    state (dict): The current graph state; only "question" is read

  Returns:
    str: Routing key of the next node to call ("wiki_search" or "vectorstore")

  Raises:
    ValueError: If the router gives no usable decision or names an unknown data source
  """
  print("---Route Question---")
  question = state["question"]
  source = question_router.invoke({"question": question})

  # getattr guards against the router yielding no structured result at all
  data_source = getattr(source, "datasource", None)
  data_sources = ("wiki_search", "vectorstore")

  # Fail loudly here instead of implicitly returning None, which the graph cannot route
  if data_source not in data_sources:
    raise ValueError(
      f"Router returned unsupported data source {data_source!r}; expected one of {data_sources}"
    )

  print(f"Routing to {data_source}")
  return data_source
  



In [86]:
from langgraph.graph import END, StateGraph, START

workflow = StateGraph(GraphState)

# Register both worker nodes under the names used by the edges below
workflow.add_node("wiki_search", wiki_search)
workflow.add_node("retrieve", retrieve)

# Entry point: route_question returns a key, which is mapped to the node to run
workflow.add_conditional_edges(START, route_question, {"wiki_search": "wiki_search", "vectorstore": "retrieve"})

# Either node finishes the run
workflow.add_edge("retrieve", END)
workflow.add_edge("wiki_search", END)

# Compile the graph into a runnable app
app = workflow.compile()


In [None]:
from IPython.display import Image, display

try:
  display(Image(app.get_graph().draw_mermaid_png()))
except Exception as exc:
  # This requires some additional dependencies and is optional, so the notebook keeps
  # running - but report why the picture is missing instead of failing silently.
  print(f"Skipping graph visualization: {exc!r}")