### STREAMING: System Prompting w ChatPromptTemplate

In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import StrOutputParser

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

chat_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template("{system_message}"),
    HumanMessagePromptTemplate.from_template("Hi, what's your name?"),
    AIMessagePromptTemplate.from_template("My name is ArX."),
    HumanMessagePromptTemplate.from_template("{user_message}"),
])

# chat_template.invoke({"system_message": "I'm god", "user_message": "is this real life?"})

# Create a ChatOpenAI model
model = ChatOpenAI(
    verbose = False,
    model = "gpt-3.5-turbo-1106",
    temperature = 0.618,
    max_retries = 2,
    streaming = True,
    max_tokens = 1000,
    # model_kwargs={"stop": ["\n"]}
                #   "output_format": "json"}
)

# Define the chain
chain = (
    chat_template
    | model
    | StrOutputParser()
)

# Example of iterating through the list and using each pair
system_message = "you are elon musk"
user_message = "how would you bulid a cognition inspired AI system?"

prompt = {
"system_message": system_message, 
"user_message": user_message,
}


# Print or process the response as needed
print(f"\n\nSystem Prompt:\n{system_message}\nUser Prompt:\n{user_message}\nResponse:")

for token in chain.stream(prompt):
    print(token, end="") 

# print(chain.invoke(prompt))


### Tools Usage

In [None]:
from langchain.agents import tool


@tool
def M_O_L(word: str) -> int:
    """finds the meaning of life"""
    return 42


tools = [M_O_L]

from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant.",
        ),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

from langchain_core.utils.function_calling import convert_to_openai_function

llm_with_tools = llm.bind(functions=[convert_to_openai_function(t) for t in tools])

from langchain.agents.format_scratchpad import format_to_openai_function_messages
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser

agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_function_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIFunctionsAgentOutputParser()
)


# print("raw = " + llm.invoke("whats the meaning of life?").content)

from langchain.agents import AgentExecutor

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

print("chain = " + agent_executor.invoke({"input": "whats the meaning of life?"})["output"])


### WIKIPEDIA Agent

In [None]:
from operator import itemgetter

from langchain.agents import AgentExecutor, load_tools
from langchain.agents.format_scratchpad import format_to_openai_function_messages
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_core.prompt_values import ChatPromptValue
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

def condense_prompt(prompt: ChatPromptValue) -> ChatPromptValue:
    messages = prompt.to_messages()
    num_tokens = llm.get_num_tokens_from_messages(messages)
    ai_function_messages = messages[2:]
    while num_tokens > 4_000:
        ai_function_messages = ai_function_messages[2:]
        num_tokens = llm.get_num_tokens_from_messages(
            messages[:2] + ai_function_messages
        )
    messages = messages[:2] + ai_function_messages
    return ChatPromptValue(messages=messages)


wiki = WikipediaQueryRun(
    api_wrapper=WikipediaAPIWrapper(top_k_results=5, doc_content_chars_max=10_000)
)
tools = [wiki]

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant"),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)
llm = ChatOpenAI(model="gpt-3.5-turbo")

agent = (
    {
        "input": itemgetter("input"),
        "agent_scratchpad": lambda x: format_to_openai_function_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | condense_prompt
    | llm.bind_functions(tools)
    | OpenAIFunctionsAgentOutputParser()
)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
response= agent_executor.invoke(
    {
        "input": "what the art of driving a car?",
    }
)
print(response["output"])

### MEMORY stuff

---
#### Buffer Window
https://python.langchain.com/docs/modules/memory/types/buffer_window

### BUFFER full Example

In [21]:
from operator import itemgetter

from langchain.memory import ConversationBufferWindowMemory, ConversationTokenBufferMemory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo")
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful chatbot"),
        MessagesPlaceholder(variable_name="HIS"),
        ("human", "{input}"),
    ]
)

# sytem = when it doesnt know from input, takes context from here, + identity
# human = input from user

memory = ConversationTokenBufferMemory(llm=llm, memory_key="HIS", return_messages=True, max_token_limit=2500)
# memory = ConversationBufferMemory(return_messages=True)

chain = (
    RunnablePassthrough.assign(HIS=RunnableLambda(memory.load_memory_variables) | itemgetter("HIS"))
    | prompt
    | llm
)

while True:
    user_input = input("You: ")
    if user_input == "q":
        break

    inputs = {"input": user_input}
    response = chain.invoke(inputs).content
    
    current_memory = memory.load_memory_variables({})
    if current_memory["HIS"]:  # Check if there is existing conversation history
        # Extract the last AI message as the new input
        last_ai_message = current_memory["HIS"][-1].content
        historic = {"input": last_ai_message}
    else:
        historic = inputs
        
    # print(response)
    memory.save_context(historic, {"output": response})
    
    mem = memory.load_memory_variables({})
    print(response)
    print(mem)



Hello! How can I assist you today?
{'HIS': [HumanMessage(content='tes'), AIMessage(content='Hello! How can I assist you today?')]}
How can I help you today?
{'HIS': [HumanMessage(content='tes'), AIMessage(content='Hello! How can I assist you today?'), HumanMessage(content='Hello! How can I assist you today?'), AIMessage(content='How can I help you today?')]}
If you have any questions or need assistance, feel free to ask!
{'HIS': [HumanMessage(content='tes'), AIMessage(content='Hello! How can I assist you today?'), HumanMessage(content='Hello! How can I assist you today?'), AIMessage(content='How can I help you today?'), HumanMessage(content='How can I help you today?'), AIMessage(content='If you have any questions or need assistance, feel free to ask!')]}
"His" is a pronoun typically used to refer to a male person or animal. It is possessive form of "he." For example, "His name is John" means that the person being referred to is named John.
{'HIS': [HumanMessage(content='tes'), AIMessa

### ENTITY

In [None]:
from langchain_openai import OpenAI
from langchain.memory import ConversationEntityMemory
llm = OpenAI(temperature=0)

memory = ConversationEntityMemory(llm=llm)
input = {"input": "Deven & Sam are working on a hackathon project"}
memory.load_memory_variables(input)
memory.save_context(
    input,
    {"output": " That sounds like a great project! What kind of project are they working on?"}
)

memory.load_memory_variables({"input": 'who is Sam'})

In [None]:
from langchain.chains import ConversationChain
from langchain.memory import ConversationEntityMemory
from langchain.memory.prompt import ENTITY_MEMORY_CONVERSATION_TEMPLATE
from pydantic import BaseModel
from typing import List, Dict, Any


conversation = ConversationChain(
    llm=llm,
    verbose=True,
    prompt=ENTITY_MEMORY_CONVERSATION_TEMPLATE,
    memory=ConversationEntityMemory(llm=llm)
)

conversation.predict(input="Deven & Sam are working on a hackathon project")

### MEM Knowledge Graph

In [None]:
from langchain.memory import ConversationKGMemory
from langchain_openai import OpenAI

llm = OpenAI(temperature=0)
memory = ConversationKGMemory(llm=llm)
memory.save_context({"input": "say hi to sam"}, {"output": "who is sam"})
memory.save_context({"input": "sam is a friend"}, {"output": "okay"})

memory.load_memory_variables({"input": "sam"})

# memory.load_memory_variables({"input": "i like flowers"})

In [None]:
llm = OpenAI(temperature=0)
from langchain.chains import ConversationChain
from langchain.prompts.prompt import PromptTemplate

template = """The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know. The AI ONLY uses information contained in the "Relevant Information" section and does not hallucinate.

Relevant Information:

{history}

Conversation:
Human: {input}
AI:"""
prompt = PromptTemplate(input_variables=["history", "input"], template=template)
conversation_with_kg = ConversationChain(
    llm=llm, verbose=True, prompt=prompt, memory=ConversationKGMemory(llm=llm)
)

### MEM Summary - Not useful by itself

In [None]:
from langchain.memory import ConversationSummaryMemory, ChatMessageHistory
from langchain_openai import OpenAI

memory = ConversationSummaryMemory(llm=OpenAI(temperature=0))
memory.save_context({"input": "Im bob"}, {"output": "whats up bob"})
memory.save_context({"input": "what do you like?"}, {"output": "I like cheese"})

memory.load_memory_variables({})

### MEM Buffer SUMMARY - Best to use in practice

In [None]:
from langchain.memory import ConversationSummaryBufferMemory
from langchain_openai import OpenAI

llm = OpenAI()

memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=10)
memory.save_context({"input": "hi"}, {"output": "whats up"})
memory.save_context({"input": "not much you"}, {"output": "not much"})

memory.load_memory_variables({})

#### Useful for passing back to a chat model

In [None]:
memory = ConversationSummaryBufferMemory(
    llm=llm, max_token_limit=10, return_messages=True
)
memory.save_context({"input": "hi"}, {"output": "whats up"})
memory.save_context({"input": "not much you"}, {"output": "not much"})

memory.load_memory_variables({})

In [None]:
messages = memory.chat_memory.messages
previous_summary = ""
memory.predict_new_summary(messages, previous_summary)

In [None]:
from langchain.chains import ConversationChain

conversation_with_summary = ConversationChain(
    llm=llm,
    # We set a very low max_token_limit for the purposes of testing.
    memory=ConversationSummaryBufferMemory(llm=OpenAI(), max_token_limit=40),
    verbose=True,
)
conversation_with_summary.predict(input="Hi, what's up?")
conversation_with_summary.predict(input="Just working on writing some documentation!")
conversation_with_summary.predict(input="For LangChain! Have you heard of it?")
# We can see here that the summary and the buffer are updated
conversation_with_summary.predict(
    input="Haha nope, although a lot of people confuse it for that"
)

### RAG , LOADERS, etc TESTs

YT transcript loader
https://python.langchain.com/docs/integrations/document_loaders/youtube_transcript

In [57]:
import re
def parse_web_text(text):
    # Normalize space and collapse multiple newlines to two, preserving paragraph breaks
    cleaned_text = re.sub(r'\s+', ' ', text)  # Replace one or more whitespace characters with a single space
    cleaned_text = re.sub(r'(\.\s*)', r'\1\n\n', cleaned_text)  # Attempt to preserve paragraph breaks after periods

    # Remove common non-content patterns and phrases
    non_content_patterns = [
        'Privacy Policy', 'Terms of Use', 'Cookie Policy', 'All rights reserved',
        'Follow us on', 'Subscribe', 'Sign up for', 'Menu', 'Navigation',
        'Quick Links', 'Customize', 'Log In', 'Sign Up'
    ]
    
    for pattern in non_content_patterns:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)

    # Further clean-up to remove any residual multiple line breaks and trim whitespace
    cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)  # Collapse multiple newlines to two
    cleaned_text = cleaned_text.strip()

    return cleaned_text

In [72]:
from langchain_community.document_loaders import WebBaseLoader
url="https://www.vogue.co.uk/article/best-books-2024"
loader = WebBaseLoader(url, 
                       requests_per_second=2,
                       continue_on_failure=True,
                        verify_ssl=True
                       )
docs= loader.load()
text= parse_web_text(docs[0].page_content)
print(text)


The Best Books Of 2024 So Far | British VogueSkip to main contentOpen  Story SavedTo revisit this article, visit My Profile, then View saved storiesClose AlertStory SavedTo revisit this article, select My Account, then View saved storiesClose AlertSign InSearchSearchFashionBeautyArts & LifestyleRunwayShoppingNewsVideoVogue ShopVogue ClubOpen  Story SavedTo revisit this article, visit My Profile, then View saved storiesClose AlertStory SavedTo revisit this article, select My Account, then View saved storiesClose AlertSign InBooksThe Best Books Of 2024 So FarBy Chloe Schama, Taylor Antrim, Marley Marius, Lisa Wong Macabasco and Chloe Malle21 January 2024FacebookXPinterestSave StoryLina ScheyniusSave this storySaveSave this storySaveWe like to think of this list of the best books of 2024 as the anti-algorithm, a collection of highly specific, highly individual, and somewhat eclectic books that we just absolutely love. 

At a moment when the very act of curation threatens to be overwhelmed

##### TEXT SPLITTER - Semantic vs Character vs Token

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="percentile") #standard_deviation, percentile, interquartile
texts = text_splitter.create_documents([text])
print(len(texts))
for t in texts:
    print(t.page_content, "\n\n\SPLIT----------------\n\n")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=50)
# all_splits = text_splitter.split_text(text)
all_splits = text_splitter.split_documents([text])

print(len(all_splits))
for s in all_splits:
    print(s, "\n\n\SPLIT----------------\n\n")

In [74]:
# This is a long document we can split up.
from langchain_text_splitters import CharacterTextSplitter
from langchain.docstore.document import Document

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, chunk_overlap=50
)

text_doc = Document(page_content=text)

# texts = text_splitter.split_text(text)

# print(text_doc)
texts = text_splitter.split_documents([text_doc])

print(len(texts))
for t in texts:
    print(t.page_content, "\n\n\SPLIT----------------\n\n")

10
The Best Books Of 2024 So Far | British VogueSkip to main contentOpen  Story SavedTo revisit this article, visit My Profile, then View saved storiesClose AlertStory SavedTo revisit this article, select My Account, then View saved storiesClose AlertSign InSearchSearchFashionBeautyArts & LifestyleRunwayShoppingNewsVideoVogue ShopVogue ClubOpen  Story SavedTo revisit this article, visit My Profile, then View saved storiesClose AlertStory SavedTo revisit this article, select My Account, then View saved storiesClose AlertSign InBooksThe Best Books Of 2024 So FarBy Chloe Schama, Taylor Antrim, Marley Marius, Lisa Wong Macabasco and Chloe Malle21 January 2024FacebookXPinterestSave StoryLina ScheyniusSave this storySaveSave this storySaveWe like to think of this list of the best books of 2024 as the anti-algorithm, a collection of highly specific, highly individual, and somewhat eclectic books that we just absolutely love. 

At a moment when the very act of curation threatens to be overwhel

### Summary with history

In [11]:
from operator import itemgetter
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory, ConversationTokenBufferMemory

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Create a ChatOpenAI model
llm = ChatOpenAI(
    verbose = False,
    model = "gpt-3.5-turbo-1106",
    temperature = 0.0,
    max_retries = 2,
    # streaming = True,
    # max_tokens = 1000,
    # model_kwargs={"stop": ["\n"]}
                #   "output_format": "json"}
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You summarise expertly."),
        MessagesPlaceholder(variable_name="history"),
        ("human", "Ignore info about cookie policies and anything else of the sort. Always capture the essense what the page is about. Always perfectly maintain quotes reviews and citations without changing them. Summarise this text: \n{input}"),
    ]
)

# memory = ConversationBufferMemory(return_messages=True)
memory = ConversationTokenBufferMemory(llm=model, max_token_limit=4000, return_messages=True)
# memory = ConversationSummaryBufferMemory(llm=model, max_token_limit=3000, return_messages=True)
# memory.load_memory_variables({})

chain = (
    RunnablePassthrough.assign(
        history=RunnableLambda(memory.load_memory_variables) | itemgetter("history")
    )
    | prompt
    | model
)

for text in texts:
    response = chain.invoke({"input": text.page_content}).content
    print(response)
    memory.save_context({"input": text.page_content}, {"output": response})


The text is about cookie preferences on Amazon's website, allowing users to choose whether to accept or decline the use of cookies for various purposes. It also includes information about a Bonsai Chinese Elm Tree being sold on Amazon, with details about the product, customer ratings, and related products.
The text provides details about a Bonsai Chinese Elm Tree being sold on Amazon, including information about the product, customer ratings, and related products. It also includes customer reviews and feedback about the product's appearance, delivery, health, and durability. Additionally, it offers options for providing feedback on the price and compares the product to others in the same category.
The text includes a customer review about a Bonsai tree, where the customer shares their experience of purchasing a pot for the tree and the unfortunate incident of their previous tree dying. Additionally, there are two more customer reviews expressing dissatisfaction with the product, one me

### Summary with history + RAG + final RAG

In [93]:

from operator import itemgetter
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory, ConversationTokenBufferMemory

from langchain.prompts.chat import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel

from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_core.output_parsers import StrOutputParser
# Create a ChatOpenAI model
model = ChatOpenAI(
    # verbose = False,
    model = "gpt-3.5-turbo-1106",
    temperature = 0.6,
    max_retries = 3,
    # streaming = True,
    # max_tokens = 1000,
    # model_kwargs={"stop": ["\n"]}
                #   "output_format": "json"}
)

# memory = ConversationBufferMemory(return_messages=True)
memory = ConversationTokenBufferMemory(llm=model, max_token_limit=4000, return_messages=True)
# memory = ConversationSummaryBufferMemory(llm=model, max_token_limit=3000, return_messages=True)
# memory.load_memory_variables({})

# # Split
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# all_splits = text_splitter.split_documents(docs)

# Add to vectorDB
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(),
)
retriever = vectorstore.as_retriever()


system_message = """
You summarise expertly.
Retrieved context:
---
{context}
"""
user_message = """
Following is web scraped data, but dont mention that or 'text is about' or other unnecessary intros, go straight to the task.
Ignore info about cookie policies, and anything else of the sort. 
Always capture the essense what it is about.
Always perfectly maintain quotes reviews and citations without changing them. 
Summarise the following:
---
{input}
"""



prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_message),
    # MessagesPlaceholder(variable_name="history"),
    # HumanMessagePromptTemplate.from_template("Hi, what's your name?"),
    # AIMessagePromptTemplate.from_template("My name is ArX."),
    HumanMessagePromptTemplate.from_template(user_message),
])

# RAG chain history
rag_chain_history = (
    RunnableParallel({"context": retriever, "input": RunnablePassthrough()})
    | RunnablePassthrough.assign(history=RunnableLambda(memory.load_memory_variables) 
    | itemgetter("history")) 
    | prompt
    | model
    | StrOutputParser()
)

# RAG chain
rag_chain = (
    RunnableParallel({"context": retriever, "input": RunnablePassthrough()})
    | prompt
    | model
    | StrOutputParser()
)


In [94]:
final_summary_response = rag_chain.invoke("")
print("\nRAG Summary:\n", final_summary_response)



RAG Summary:
 The previous conversation was about a list of the best books of 2024, featuring specific and eclectic selections that are highly recommended by the authors.


In [95]:
summaries = []
for text in texts:
    string = text.page_content
    response = rag_chain_history.invoke(string)
    print(response)
    summaries.append(response)
    memory.save_context({"input": string}, {"output": response})

# For illustration, let's concatenate and pass to another summary function
final_summary_input = " ".join(summaries)

final_summary_response = rag_chain.invoke(final_summary_input)
print("\n\Summary of Summary:\n", final_summary_response)

The previous conversation was about the best books of 2024 so far, as featured in British Vogue. 

The passage is a reflection on the list of the best books of 2024, describing it as an "anti-algorithm" collection that is highly specific, individual, and eclectic, providing a counterweight to the overwhelming curation influenced by tracking cookies and offering a reminder of beloved authors, introductions to new works, and unexpected detours.
The previous conversation was about a list of the best books of 2024, and the excerpt you provided is a summary of the novel "Come and Get It" by Kiley Reid, which explores class, money, and relationships on a college campus.
The previous conversation was about books and their descriptions. The given text is a summary of a review of the book "Change" by Édouard Louis, which describes the unsettling nature of the book's content and its exploration of poverty, privilege, and self-invention.
The previous conversation was about book reviews and summar

### RAG full

In [6]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
data = UnstructuredMarkdownLoader("../output/undefined_research.md").load() # mode="elements" to keep metadata
print(data[0].page_content)

RESEARCH for OUTLINE:

6 TOTAL SEARCHES for 2 QUERIES:

'History of iaido'
'Techniques and principles of iaido'

SEARCH QUERY: 'History of iaido'

QUICK SEARCH:

Iaido is a captivating martial art that offers a unique blend of physical and mental discipline. By practicing Iaido, you can connect with the rich history and traditions of Japanese swordsmanship, experience personal growth, and join a supportive community. History of Iaido. In the 1500s, Hayashizaki Jinsuke Shigenobu invented the art of drawing swords quickly, leading to what we now call iaido. He started the Shimmei Muso-ryu school. Not much is known about his exact techniques, but his teachings spread through students. Originating in Japan, Iaido is an age-old martial art form that primarily focuses on the swift and smooth drawing of the sword, followed by an attacking action, and finally the replacement of the sword in the scabbard. While it may seem out of sync in our modern, fast-paced world, Iaido retains its relevance

In [None]:
# from langchain_community.document_loaders import WebBaseLoader
# url = """
# https://www.amazon.co.uk/Designing-Creatures-Characters-Portfolio-Animation/dp/1440344094/ref=sr_1_4?crid=2PSBQBJ2WZ007&dib=eyJ2IjoiMSJ9.Uzw3aM7YRW5z8N8aixKcUpgX6WgHs2An7bzkQEunZfwScGruv7Akg7AdgLFaxhcARgVtC-7Q67dR0S4deXYQferXsKxpc0fiOkBdRubRHnrfckYyL9Jk6J9tf5asDzZwprOL7AhwK_hVAGfJ-s3MmLMj1PSoVWLuFAaoyLcXmxdaVRDTqUQfCj0TIFLHGn3PrFvmLw2GuVPjww-O6XqqQ4AKzn-nUuftrXjH6wlOM98.VSVfmMHtzbTRBLHVihtsJRHXpszN-yJogEZK02OxzOM&dib_tag=se&keywords=concept+art+book&qid=1709572524&sprefix=concept+art+book%2Caps%2C89&sr=8-4
# """
# loader = WebBaseLoader(url)
# docs = loader.load()

# from langchain_community.document_loaders import TextLoader
# text = TextLoader("../../BukGPT/data/buk_all.txt").load()

from langchain_community.document_loaders import UnstructuredMarkdownLoader
data = UnstructuredMarkdownLoader("markdown_path").load() # mode="elements" to keep metadata

# from langchain_community.document_loaders import PyPDFLoader
# # Load PDF document
# loader = PyPDFLoader("../../BukGPT/data/CharlesB2.pdf")
# # pages = loader.load_and_split()
# data = loader.load()

# LLM
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Create a ChatOpenAI model
llm = ChatOpenAI(
    verbose = False,
    model = "gpt-3.5-turbo-1106",
    temperature = 0.618,
    max_retries = 2,
    streaming = True,
    max_tokens = 1000,
    # model_kwargs={"stop": ["\n"]}
                #   "output_format": "json"}
)

# # Split
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# all_splits = text_splitter.split_documents(docs)

# Add to vectorDB
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(
    documents=texts,
    embedding=OpenAIEmbeddings(),
)
retriever = vectorstore.as_retriever()

from langchain_core.prompts import ChatPromptTemplate
template = """Answer the question based only on the following context:
{context}

Question: {input}
"""
prompt = ChatPromptTemplate.from_template(template)


# RAG chain
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
r_chain = (
    RunnableParallel({"context": retriever, "input": RunnablePassthrough()})
    | prompt
    | model
    | StrOutputParser()
)

print(r_chain.invoke({'input':"please extract the costumer research from this product, including demographics, and the main findings."}))

### SEARCHES

In [None]:
import os
from dotenv import load_dotenv
from exa_py import Exa

# Load the .env file
load_dotenv()

# Read the API key from the environment variable
oai_api_key = os.getenv("OPENAI_API_KEY")
exa_api_key = os.getenv("EXA_API_KEY")

exa = Exa(api_key=exa_api_key)

In [None]:
query = "concept art book"

searches = exa.search(query,
    num_results=10,
    include_domains=["amazon.co.uk", "audible.co.uk"],
    # exclude_domains=["reddit.com"],
    # start_crawl_date = "2021-06-12",
    # end_crawl_date = "2021-06-12",
    # start_published_date="2023-06-12"
    use_autoprompt=True,
    type = 'neural' # 'keyword' or 'neural
)

for search in searches.results:
    print(search.title, search.url)

In [None]:
query = "research succesful book topics to write about"

searches = exa.search_and_contents(query,
    num_results=5,
    # include_domains=["amazon.co.uk", "audible.co.uk"],
    # exclude_domains=["reddit.com"],
    # start_crawl_date = "2021-06-12",
    # end_crawl_date = "2021-06-12",
    # start_published_date="2023-06-12"
    use_autoprompt=True,
    type = 'neural' # 'keyword' or 'neural
)

from langchain_community.document_loaders import WebBaseLoader

for search in searches.results:
    print(search.title)
    print(search.url)
    print(search.text)
    print("\n\n\-------------------\n\n\n")
    url = search.url
    loader = WebBaseLoader(url)
    docs = loader.load()
    print(docs[0].page_content)
    
    # ignore blocked website. ignore web interface. focus on content. extract most meaningful content to the query.

In [22]:
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults
import json 
import re
wrapper = DuckDuckGoSearchAPIWrapper()

search = DuckDuckGoSearchResults(api_wrapper=wrapper, source="web", num_results=8)

data_str= search.run("concept art book")
pattern = r"\[snippet: (.*?), title: (.*?), link: (.*?)\]"
matches = re.findall(pattern, data_str, re.DOTALL)
articles = [{"title": match[1].strip(), "link": match[2].strip(), "snippet": match[0].strip()} for match in matches]

from langchain_community.document_loaders import WebBaseLoader

for article in articles:
    print(json.dumps(article))
    # print(article["link"])
    # loader = WebBaseLoader(article["link"])
    # docs = loader.load()
    # print(docs[0].page_content)

DuckDuckGoSearchException: _aget_url() https://links.duckduckgo.com/d.js DuckDuckGoSearchException: Ratelimit