### Preparing dataset

In [3]:
# Path to the plain-text copy of the book used throughout the notebook.
filename = "data/the_great_gatsby.txt"

# Count whitespace-separated words, streaming the file line by line.
# The encoding is given explicitly: the text contains non-ASCII characters
# (curly quotes, ellipses), so relying on the platform default encoding can
# fail or mis-decode on systems whose default is not UTF-8.
with open(filename, 'r', encoding='utf-8') as file:
    word_count = sum(len(line.split()) for line in file)

print(f"Total number of words in {filename.split('/')[-1]} is: {word_count}")
        

Total number of words in the_great_gatsby.txt is: 51257


### Document Loaders

In [4]:
from langchain.document_loaders import TextLoader
# Load the whole book as LangChain documents.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default encoding (the text contains non-ASCII punctuation).
# NOTE(review): the file starts with a byte-order mark that ends up in the
# first chunk's text; "utf-8-sig" would strip it, at the cost of shifting
# every recorded start_index by one character.
loader = TextLoader(filename, encoding="utf-8")
gatsby_book = loader.load()

In [11]:
# Inspect the type of the first object returned by the loader.
type(gatsby_book[0])

langchain_core.documents.base.Document

See the [official documentation](https://python.langchain.com/docs/integrations/document_loaders/) for more document loaders.

### Text Splitters

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: a chunk size of 1000 and an overlap of 50, both
# measured with len; add_start_index stores each chunk's starting offset
# in its metadata.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 50,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [13]:
# Split the loaded book into chunks with the configured text splitter.
texts = text_splitter.split_documents(gatsby_book)

In [17]:
# Print the first two chunks to eyeball what the splitter produced.
for chunk_index in (0, 1):
    print(texts[chunk_index])

page_content='﻿The Project Gutenberg eBook of The Great Gatsby
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Great Gatsby

Author: F. Scott Fitzgerald

Release date: January 17, 2021 [eBook #64317]
                Most recently updated: February 2, 2024

Language: English

Credits: Produced by Alex Cabal for the Standard Ebooks project, based on a transcription produced for Project Gutenberg Australia.


*** START OF THE PROJECT GUTENBERG EBOOK THE GREAT GATSBY ***




                           The Great Gatsby
                                  by
                          F. Sc

In [19]:
# Number of chunks produced by the splitter.
len(texts)

362

### Text Embeddings

In [24]:
from langchain_ollama import OllamaEmbeddings
# Embedding model served by the Ollama instance at localhost:11434.
nomic_embedding = OllamaEmbeddings(
    base_url='http://localhost:11434',
    model='nomic-embed-text',
)

### Vector Store

In [25]:
from langchain.vectorstores import FAISS
# Build a FAISS vector store from the chunks using the nomic embedding model.
vectorstore = FAISS.from_documents(texts, nomic_embedding)

In [26]:
# Natural-language question used to probe the vector store.
query = "How does nick meets gatsby?"

# Run a similarity search for the question against the stored chunks.
vectorstore.similarity_search(query)

[Document(metadata={'source': 'data/the_great_gatsby.txt', 'start_index': 153027}, page_content='He was profoundly affected by the fact that Tom was there. But he\nwould be uneasy anyhow until he had given them something, realizing in\na vague way that that was all they came for. Mr. Sloane wanted\nnothing. A lemonade? No, thanks. A little champagne? Nothing at all,\nthanks … I’m sorry—\n\n“Did you have a nice ride?”\n\n“Very good roads around here.”\n\n“I suppose the automobiles—”\n\n“Yeah.”\n\nMoved by an irresistible impulse, Gatsby turned to Tom, who had\naccepted the introduction as a stranger.\n\n“I believe we’ve met somewhere before, Mr. Buchanan.”\n\n“Oh, yes,” said Tom, gruffly polite, but obviously not remembering.\n“So we did. I remember very well.”\n\n“About two weeks ago.”\n\n“That’s right. You were with Nick here.”\n\n“I know your wife,” continued Gatsby, almost aggressively.\n\n“That so?”\n\nTom turned to me.\n\n“You live near here, Nick?”\n\n“Next door.”\n\n“That so?”')

### Generate

In [49]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_ollama import ChatOllama

# Prompt given to the LLM: retrieved chunks fill {context}, the user's
# question fills {question}.
template = """
Use the following pieces of context to answer the question at the end.
If you dont know the answer, just say 'Ah! looks like that's out of context!'.
Use three sentences maximum and keep answer as concise as possible. Use active voice and speak directly to the reader.
Do not give any extra output than answer.
context: {context}
Question: {question}
Helpful Answer: 
"""

# Correctly spelled name for the prompt. The original misspelled name is kept
# as an alias because other cells in this notebook reference it.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
QA_CHAIN_PROMPMT = QA_CHAIN_PROMPT  # backward-compatible alias

# temperature=0 keeps answers as repeatable as possible across re-runs, so the
# outputs recorded in the notebook stay representative.
llama_llm = ChatOllama(
    model='llama3.2:1b',
    base_url='http://localhost:11434',
    temperature=0,
)

# Retrieval-augmented QA chain: the retriever supplies the context chunks and
# the custom prompt above is used to build the final LLM call.
qa_chain = RetrievalQA.from_chain_type(
    llama_llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

query = "What was Jay Gatsby’s real name?"

result = qa_chain.invoke({"query": query})

# The chain returns a dict; the generated answer is under the 'result' key.
result['result']

"Jay Gatsby's real name was James Gatz."

### Using LCEL for retrieval

In [59]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            (the documents returned by the retriever).

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Run retrieval and question pass-through side by side. The retriever output is
# piped through format_docs so the prompt's {context} slot receives plain text;
# without it the prompt would be filled with the repr of a list of Document
# objects (metadata and escaped newlines included).
setup_and_retrieval = RunnableParallel(
    {"context": vectorstore.as_retriever() | format_docs,
     'question': RunnablePassthrough()}
)

# Converts the chat model's message output into a plain string.
output_parser = StrOutputParser()

# retrieve + pass question -> fill prompt -> call LLM -> extract text
chain = setup_and_retrieval | QA_CHAIN_PROMPMT | llama_llm | output_parser

query = "What was Jay Gatsby’s real name?"

chain.invoke(query)

"Jay Gatsby's real name was James Gatz."