In [None]:
!pip install langchain openai tiktoken chromadb

In [None]:
import os
from getpass import getpass

# Keep real credentials out of the notebook: reuse a key that is already set in
# the environment, otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

## 1- Loaders

In [None]:
from langchain.document_loaders import PyPDFLoader

# Location of the lecture handout inside the Kaggle input dataset.
pdf_path = "/kaggle/input/cs224u-contextreps/cs224u-contextualreps-2023-handout.pdf"

# Load the PDF and show how many documents the loader returned.
loader = PyPDFLoader(pdf_path)
pages = loader.load()
len(pages)

In [None]:
# Print the complete text content of the first loaded document.
page = pages[0]
print(page.page_content)

## 2- Document Splitting

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Deliberately tiny chunks so the behaviour of each splitter is easy to compare.
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=20, chunk_overlap=5)

# No separator is passed here, so this splitter uses its default separator.
character_splitter = CharacterTextSplitter(chunk_size=20, chunk_overlap=5)

text = "My name is Ahmed Eldokmak, nice to meet you"

# Result of the recursive splitter on the sample sentence.
recursive_splitter.split_text(text)

In [None]:
# Same sentence, split with the CharacterTextSplitter that was built without an explicit separator.
character_splitter.split_text(text)

In [None]:
# Rebuild the character splitter so that it splits on single spaces,
# then split the same sample sentence again.
character_splitter = CharacterTextSplitter(separator=" ", chunk_size=20, chunk_overlap=5)

character_splitter.split_text(text)

#### The following example is from the "LangChain: Chat with Your Data" short course by DeepLearning.AI

In [None]:
# Sample text from the course; the backslashes join the source lines, so the only
# real line breaks in the string are the explicit "\n\n".
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

# Character splitter: a single separator (one space).
c_splitter = CharacterTextSplitter(separator=" ", chunk_size=450, chunk_overlap=0)

# Recursive splitter: an ordered list of separators, from paragraph breaks down to "".
r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=450,
    chunk_overlap=0,
)

In [None]:
# Chunks produced by the space-separated character splitter.
c_splitter.split_text(some_text)

In [None]:
# Chunks produced by the recursive splitter on the same text, for comparison.
r_splitter.split_text(some_text)

In [None]:
# Smaller chunks plus a sentence-level separator: the look-behind pattern matches
# the position right after ". " so the period stays with its sentence.
# The pattern is a raw string so that `\.` is not an invalid string escape.
# NOTE(review): recent LangChain releases only treat separators as regular
# expressions when `is_separator_regex=True` is passed — confirm against the
# installed version.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

### Loading + splitting our own data

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Load the handout again so this cell does not depend on the "Loaders" section.
loader = PyPDFLoader("/kaggle/input/cs224u-contextreps/cs224u-contextualreps-2023-handout.pdf")
pages = loader.load()

# Split on line breaks into chunks of up to 1000 characters (measured with `len`),
# with 150 characters of overlap between consecutive chunks.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
)
docs = splitter.split_documents(pages)

In [None]:
# Number of documents returned by the PDF loader.
len(pages)

In [None]:
# Number of chunks produced by splitting those documents.
len(docs)

In [None]:
# Inspect the first chunk produced by the splitter.
docs[0]

In [None]:
# Metadata attached to the first chunk.
docs[0].metadata

In [None]:
# Preview the first five chunks.
docs[:5]

### Token splitting

In [None]:
from langchain.text_splitter import TokenTextSplitter

# One token per chunk and no overlap, so the output shows how the text is tokenized.
splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
text1 = "fasd fdas err"

splitter.split_text(text1)

### Context Aware splitting
#### The following example is also from the "LangChain: Chat with Your Data" short course by DeepLearning.AI

In [None]:
# NOTE(review): NotionDirectoryLoader is imported but not used in this cell.
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Small markdown sample with three heading levels (#, ##, ###).
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

# Pairs of (markdown heading prefix, name given to that heading level).
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Build a splitter from the heading configuration above.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on = headers_to_split_on
)

# Split the sample markdown using that configuration.
md_header_splits = markdown_splitter.split_text(markdown_document)

In [None]:
# Inspect the first element returned by the header-based split.
md_header_splits[0]

## 3- Text Embedding






In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()
sentence = "My name is Ahmed Eldokmak, nice to meet you"

# Embed the sentence as a single query.
embedding1 = embedding.embed_query(sentence)

# Show the vector's length and a short preview instead of dumping every value.
len(embedding1), embedding1[:5]

Embeddings convert text into numeric vectors that capture its meaning, so texts with similar meanings end up with similar vectors.

In [None]:
import numpy as np

sentence1 = "I like NLP"
sentence2 = "I like machine learning"
sentence3 = "I hate lying"

# Sentences 1 and 2 are related in meaning, sentence 3 is not.
# Embed them in order, one query per sentence.
embedding1, embedding2, embedding3 = (
    embedding.embed_query(s) for s in (sentence1, sentence2, sentence3)
)

# Compare the related pair (1, 2) against the unrelated pair (2, 3).
result1 = np.dot(embedding1, embedding2)
result2 = np.dot(embedding2, embedding3)

print(
    f"dot product of sentence1 and sentence2 = {result1}\n"
    f"dot product of sentence2 and sentence3 = {result2}"
)

## 4- Vector stores + Similarity search

In [None]:
from langchain.vectorstores import Chroma

# Index the chunks produced in the "Loading + splitting our own data" section,
# embedding each one with the OpenAI embedding model created earlier.
db = Chroma.from_documents(docs, embedding)

In [None]:
question = "What is the name of lecturer?"

# Fetch the 3 most relevant chunks. They go into their own variable so the
# chunk list `docs` is not overwritten, which keeps the store-building cell
# safe to re-run.
relevant_docs = db.similarity_search(question, k=3)
relevant_docs

## 5- Retrieval

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

# Store pointed at a directory on disk; this cell does not add anything to it.
persist_directory = "/kaggle/working/"
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory)

# Three short passages about the same mushroom; the first two overlap heavily.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

# Second store, built directly from the passages (no persist directory is passed).
db = Chroma.from_texts(texts, embedding=embedding)

question = "Tell me about all-white mushrooms with large fruiting bodies"

### Accessing the data in the vector store
#### 1- similarity search

In [None]:
# Similarity search: ask the store built from `texts` for 2 results for the question.
db.similarity_search(question, k=2)

#### 2- Maximum marginal relevance

In [None]:
# Maximum marginal relevance search on the same store:
# consider 3 candidates (fetch_k) and return 2 of them (k).
db.max_marginal_relevance_search(question, k=2, fetch_k=3)

#### 3- metadata

In [None]:
# Metadata filtering: restrict the search to entries whose "source" metadata
# equals the given value (replace the placeholder with a real source).
metadata_filter = {"source": "Enter meta data"}
docs = vectordb.similarity_search(question, k=3, filter=metadata_filter)

### LLM aided
#### SelfQuery

In [None]:
# This example is from the "LangChain: Chat with Your Data" course by DeepLearning.AI.
# It is shown for explanation only: it expects `vectordb` to contain the course's
# lecture chunks, which this notebook does not load.
# To run it, build your own vector store as shown in the vector store section.

from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Describe the metadata fields the retriever may filter on.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

document_content_description = "Lecture notes"

# temperature=0 is passed to the LLM that SelfQueryRetriever is built from.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [None]:
# Example question that mentions a specific lecture ("the third lecture").
# NOTE(review): no visible cell passes this question to `retriever`.
question = "what did they say about regression in the third lecture?"

## Compression

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


def pretty_print_docs(docs):
    """Print the text of each document, numbered and separated by a dashed line.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.
    """
    separator = "\n" + "-" * 100 + "\n"
    print(
        separator.join(
            f"Document {i + 1}:\n\n{d.page_content}" for i, d in enumerate(docs)
        )
    )


# The compressor is an LLMChainExtractor built from the LLM; it is applied to
# the documents returned by the base retriever.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

# NOTE(review): no visible cell adds documents to `vectordb`, so this query only
# returns results if the persisted store already holds the lecture chunks.
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

### IN THE FOLLOWING NOTEBOOK I AM GOING TO WORK THROUGH SOME EXAMPLES THAT BRING EVERYTHING TOGETHER