In [10]:
!pip install nest_asyncio langchain_community langchain playwright html2text sentence-transformers faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [4]:
!playwright install

Downloading Chromium 121.0.6167.57 (playwright build v1097)[2m from https://playwright.azureedge.net/builds/chromium/1097/chromium-mac-arm64.zip[22m
Chromium 121.0.6167.57 (playwright build v1097) downloaded to /Users/aboniasojasingarayar/Library/Caches/ms-playwright/chromium-1097
Downloading FFMPEG playwright build v1009[2m from https://playwright.azureedge.net/builds/ffmpeg/1009/ffmpeg-mac-arm64.zip[22m
FFMPEG playwright build v1009 downloaded to /Users/aboniasojasingarayar/Library/Caches/ms-playwright/ffmpeg-1009
Downloading Firefox 121.0 (playwright build v1438)[2m from https://playwright.azureedge.net/builds/firefox/1438/firefox-mac-13-arm64.zip[22m
Firefox 121.0 (playwright build v1438) downloaded to /Users/aboniasojasingarayar/Library/Caches/ms-playwright/firefox-1438
Downloading Webkit 17.4 (playwright build v1967)[2m from https://playwright.azureedge.net/builds/webkit/1967/webkit-mac-13-arm64.zip[22m
Webkit 17.4 (playwright build v1967) downloaded to /Users/aboniasojas

In [5]:
import nest_asyncio
from langchain_community.document_loaders import AsyncChromiumLoader

# Patch asyncio to allow nested event loops.
# NOTE(review): presumably required because the loader runs async code while
# the notebook kernel's own event loop is already running - confirm.
nest_asyncio.apply()

# Medium posts that will be scraped and indexed
articles = [
    "https://medium.com/@abonia/bertscore-explained-in-5-minutes-0b98553bfb71",
    "https://medium.com/@abonia/document-based-llm-powered-chatbot-bb316009de93/",
]

# Fetch every article through the Chromium-based loader; `docs` holds the
# loaded documents consumed by the indexing cell.
loader = AsyncChromiumLoader(articles)
docs = loader.load()


In [11]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Turn the scraped HTML documents into plain text
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

# Split the text into chunks with a 100-character target and no overlap.
# NOTE(review): the recorded output warns about chunks far longer than 100
# characters, so the splitter is evidently not enforcing this limit on every
# piece - confirm whether paragraph-sized chunks are the intended granularity.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
chunked_documents = text_splitter.split_documents(docs_transformed)

# Embed the chunks with a sentence-transformers model and build the FAISS index
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
db = FAISS.from_documents(chunked_documents, embedding_model)

# Retriever view over the index, handed to the QA chain
retriever = db.as_retriever()


Created a chunk of size 442, which is longer than the specified 100
Created a chunk of size 375, which is longer than the specified 100
Created a chunk of size 490, which is longer than the specified 100
Created a chunk of size 102, which is longer than the specified 100
Created a chunk of size 281, which is longer than the specified 100
Created a chunk of size 233, which is longer than the specified 100
Created a chunk of size 989, which is longer than the specified 100
Created a chunk of size 104, which is longer than the specified 100
Created a chunk of size 280, which is longer than the specified 100
Created a chunk of size 489, which is longer than the specified 100
Created a chunk of size 221, which is longer than the specified 100
Created a chunk of size 155, which is longer than the specified 100
Created a chunk of size 287, which is longer than the specified 100
Created a chunk of size 214, which is longer than the specified 100
Created a chunk of size 243, which is longer tha

In [22]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama

# Instruction-style prompt with two placeholders: {context} and {question}.
# NOTE(review): the "stuff" chain is expected to fill {context} with the
# retrieved documents and {question} with the user query - confirm against
# the RetrievalQA documentation.
prompt_template = """
### [INST] Instruction: Answer the question based on the medium article knowledge. Here is context to help:

{context}

### QUESTION:
{question} [/INST]
 """

# Wrap the raw template string in a PromptTemplate object
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# LLM backend: the "mistral" model through Ollama.
# NOTE(review): presumably needs a running Ollama server with that model pulled.
llm = Ollama(model="mistral")

# Retrieval-augmented QA chain that also returns the source documents.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    # Pass the PromptTemplate object, not the raw template string: the
    # original handed over `prompt_template` and left `prompt` unused.
    chain_type_kwargs={"prompt": prompt},
)

answer = qa.invoke("What is cosine similarity?")
# Display the full result dict (query, result, source_documents) as cell output
answer


{'query': 'What is cosine similarity?',
 'result': ' Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. It is computed as the cosine of the angle between them, which indicates how similar they are in direction. The result ranges from -1 to 1, with 1 indicating perfect similarity and 0 indicating orthogonal (perpendicular) vectors.',
 'source_documents': [Document(page_content='The formula for cosine similarity is:\n\n> similarity(A, B) = (A . B) / (||A|| ||B||)', metadata={'source': 'https://medium.com/@abonia/document-based-llm-powered-chatbot-bb316009de93/'}),
  Document(page_content='Cosine similarity — This method measures the cosine of the angle between two\nvectors, which indicates how similar they are in direction. Cosine similarity\nranges from -1 to 1, with 1 indicating perfect similarity.', metadata={'source': 'https://medium.com/@abonia/document-based-llm-powered-chatbot-bb316009de93/'}),
  Document(page_content='Cosine Simil

In [29]:
# Ask the same question again, this time printing only the generated answer
# text rather than the whole result dict.
answer = qa.invoke({"query": "What is cosine similarity?"})
print(answer["result"])

 Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. It calculates the cosine of the angle between them, which indicates how similar they are in direction. The result ranges from -1 to 1, with 1 indicating perfect similarity and 0 indicating orthogonal (perpendicular) vectors.
