# LangChain Text Splitter

**Imports**

In [27]:
import openai
import os
from dotenv import load_dotenv, find_dotenv
import numpy as np
from IPython.display import display, Markdown, HTML

import urllib
from PyPDF2 import PdfReader
from langchain.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader, PyPDFLoader

from langchain.text_splitter import CharacterTextSplitter

from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

from langchain.docstore.document import Document
from langchain.vectorstores import DocArrayInMemorySearch, FAISS, Chroma
from langchain.indexes import VectorstoreIndexCreator

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI 
from langchain.chains import RetrievalQA

from langchain.callbacks import get_openai_callback

import spacy
# Load the spaCy pipeline named "en_core_web_sm" once, up front
nlp = spacy.load("en_core_web_sm")

In [10]:
# Read variables from the nearest .env file into the process environment,
# then hand the OpenAI key to the openai client.
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ["OPENAI_API_KEY"]

# Name of the chat model to use
model_name = "gpt-3.5-turbo"

## Load the Document

**Read the webpage as a LangChain Document**

In [3]:
# # Download the page
# webpage = 'https://en.wikipedia.org/wiki/Artificial_intelligence'
# webpage_path = 'data/artificial-intelligence.html'
# urllib.request.urlretrieve(webpage, webpage_path)

# # Create a loader for the page
# loader = BSHTMLLoader(webpage_path, open_encoding='utf-8')
# docs = loader.load()
# docs

In [4]:
# file_path = 'data/Survey of Success Factors in Data Science Project.pdf'

# def extract_text_from_pdf(file_path):
#     with open(file_path, 'rb') as file:
#         pdf = PdfReader(file)
#         text = " ".join(page.extract_text() for page in pdf.pages)
#     return text

# # Extract the text from the PDF as a single string
# text = extract_text_from_pdf(file_path)
# text

## Create a QA chain

In [5]:
# with get_openai_callback() as cb:
#     # Shorthand for the index creator
#     index = VectorstoreIndexCreator(
#         embedding=OpenAIEmbeddings(),
#         vectorstore_cls=Chroma,
#     ).from_documents(final_docs)

#     query = "What is this document about?"
#     response = index.query(
#         llm=ChatOpenAI(),
#         question=query, 
#         chain_type="stuff",
#     )

#     print(cb)

# print(response)

### Stuff

In [56]:
# ------- Load the PDF ------- #
file_path = "data/Attention Is All You Need.pdf"
loader = PyPDFLoader(file_path)
data = loader.load()

# Break the loaded documents into chunks: split on spaces, chunk_size 1000,
# with a 50-unit overlap between neighbouring chunks.
splitter = CharacterTextSplitter(separator=" ", chunk_size=1000, chunk_overlap=50)
docs = splitter.split_documents(data)


# ------- Vector Store ------- #

# Embedding model (alternative checkpoint: "BAAI/bge-small-en-v1.5").
# normalize_embeddings=True so that similarity behaves like cosine similarity.
# NOTE(review): `query_instruction` looks like it was carried over from a
# BGE/instruct-embedding example; confirm that HuggingFaceEmbeddings accepts it
# in the installed LangChain version.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True},
    query_instruction="Generate a representation for this sentence that can be used to retrieve related articles：",
)

# Embed every chunk and index it in a Chroma vector store
db = Chroma.from_documents(documents=docs, embedding=embedding_model)


# ------- Create Retriever ------- #
# search_type="similarity" ranks chunks purely by similarity to the query;
# "mmr" additionally favours diversity, so the first source document should
# differ from the second.
# search_kwargs={"k": 2} keeps the 2 chunks most similar to the query.
retriever = db.as_retriever(
    search_kwargs={"k": 2},
    search_type="similarity",
)


# ------- Create QA chain ------- #
# The "stuff" chain type is used here, and the retrieved source documents
# are returned together with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
    return_source_documents=True,
)


# ------- Run the chain ------- #
query = "What is this document about?"
response = qa_chain({"query": query})

# Display the full response dict (query, result, source documents)
response



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'What is this document about?',
 'result': ' This document is about the Transformer, a transduction model relying entirely on self-attention to compute representations of its input and output, and the advantages it has over other models.',
 'source_documents': [Document(page_content=', 2015.\n[25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated\ncorpus of english: The penn treebank. Computational linguistics , 19(2):313–330, 1993.\n[26] David McClosky, Eugene Charniak, and Mark Johnson. Effective self-training for parsing. In\nProceedings of the Human Language Technology Conference of the NAACL, Main Conference ,\npages 152–159. ACL, June 2006.\n11', metadata={'page': 10, 'source': 'data/Attention Is All You Need.pdf'}),
  Document(page_content='summarization,\ntextual entailment and learning task-independent sentence representations [4, 27, 28, 22].\nEnd-to-end memory networks are based on a recurrent attention mechanism instead 

In [67]:
# Ask a follow-up question, carrying the previous exchange along.
# RetrievalQA stores its answer under the "result" key (the previous cell's
# output has the keys 'query', 'result' and 'source_documents'), so indexing
# response["answer"] raises KeyError on a fresh Restart & Run All.
chat_history = [(query, response["result"])]
query = "Your answer wasn't clear, provide a more detailed one"

# NOTE(review): RetrievalQA appears to consume only "query"; "chat_history" is
# echoed back in the output dict but is presumably not inserted into the prompt.
# If real conversational memory is wanted, consider ConversationalRetrievalChain
# -- confirm against the LangChain docs.
qa_chain({"query": query, "chat_history": chat_history})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': "Your answer wasn't clear, provide a more detailed one",
 'chat_history': [('What is this document about?',
   ' This document is about the Transformer, a transduction model relying entirely on self-attention to compute representations of its input and output, and the advantages it has over other models.')],
 'result': ' The Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequence-aligned RNNs or convolution. It is based on a recurrent attention mechanism and has been shown to perform well on simple-language question answering and language modeling tasks. It is an encoder-decoder structure which maps an input sequence of symbol representations to a sequence of continuous representations, and then generates an output sequence from the representations.',
 'source_documents': [Document(page_content='summarization,\ntextual entailment and learning task-independent sentence representati

### Map Reduce

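The "map_reduce" chain type first runs the question against each retrieved chunk on its own (map) and then combines those partial answers into a single final answer (reduce).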

In [100]:
# ------- Create Retriever ------- #
# search_type="similarity" ranks chunks purely by similarity to the query;
# "mmr" additionally favours diversity, so the first source document should
# differ from the second.
# search_kwargs={"k": 3} asks the retriever for 3 chunks.
retriever = db.as_retriever(
    search_kwargs={"k": 3},
    search_type="mmr",
)


# ------- Create QA chain ------- #
# Same setup as the "stuff" example, but with the "map_reduce" chain type.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    retriever=retriever,
    chain_type="map_reduce",
    verbose=True,
    return_source_documents=True,
)


# ------- Run the chain ------- #
query = "What is a transformer?"
response = qa_chain({"query": query})

# Render only the answer text
answer_html = HTML(response["result"])
display(answer_html)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


### Refine

While "map_reduce" is fast, it tends to give brief answers. The "refine" chain type instead processes the retrieved documents sequentially, passing the answer built from the previous documents on to the next one, which yields more consistent and complete answers.

In [98]:
# ------- Create QA chain ------- #
# Reuses the `retriever` defined in the previous cell; only the chain type
# changes to "refine".
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    retriever=retriever,
    chain_type="refine",
    verbose=True,
    return_source_documents=True,
)


# ------- Run the chain ------- #
query = "What is a transformer?"
response = qa_chain({"query": query})

# Render only the answer text
answer_html = HTML(response["result"])
display(answer_html)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
