# Install dependencies

In [None]:
pip install langchain
pip install pinecone-client
pip install openai
pip install tiktoken
pip install nest_asyncio

# Set up OpenAI API Key

In [6]:
import os
from getpass import getpass

# Never paste a real key into the notebook: it would be saved with the file
# and leak when the notebook is shared or committed.
# If OPENAI_API_KEY is already set in the environment it is used as-is;
# otherwise the key is requested interactively without being echoed.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


# Set up Pinecone API keys

In [11]:
import os
from getpass import getpass

import pinecone

# Initialize the Pinecone client.
# The credentials required here are the *Pinecone* API key and environment
# name (not a LangChain key). They are read from the PINECONE_API_KEY and
# PINECONE_ENVIRONMENT environment variables when set, and requested
# interactively otherwise, so no secret is stored in the notebook.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),
    environment=os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment name: "),
)

# Index
Load data from 'https://ind.nl'
SitemapLoader extends WebBaseLoader: it loads a sitemap from a given URL, then scrapes and loads all the pages in the sitemap, returning each page as a document.

This scraping is done concurrently, using WebBaseLoader. There are reasonable limits to concurrent requests, defaulting to 2 per second.


In [13]:

import nest_asyncio

# Patch asyncio before loading: the loader fetches pages concurrently, and
# this is applied first so that it can run inside the notebook.
nest_asyncio.apply()

from langchain.document_loaders.sitemap import SitemapLoader

# Sitemap to crawl, and the URL filter restricting the crawl to the
# English-language pages of the site.
sitemap_url = "https://ind.nl/sitemap.xml"
english_pages = ['https://ind.nl/en']

loader = SitemapLoader(sitemap_url, filter_urls=english_pages)

# One document per scraped page.
docs = loader.load()

Fetching pages: 100%|#########################| 509/509 [03:49<00:00,  2.22it/s]


## Text split
Split the text from the docs into smaller chunks.
There are many ways to split the text. We are using the text splitter that is recommended for generic text.

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: chunk length is measured with `len` (i.e. in
# characters), with up to 1200 per chunk and 200 shared between neighbours.
splitter_settings = {
    "chunk_size": 1200,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split every loaded page into chunks.
docs_chunks = text_splitter.split_documents(docs)

## Create Embeddings

In [18]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model used to vectorize the document chunks and the queries.
# Constructed with default settings; presumably it picks up the OpenAI key
# from the OPENAI_API_KEY environment variable set earlier -- TODO confirm.
embeddings = OpenAIEmbeddings()

## Creating a vectorstore
A vectorstore stores Documents and their associated embeddings, and provides fast ways to look up relevant
Documents by embeddings. There are many ways to create a vectorstore. We are going to use Pinecone. First create a new index in Pinecone, then enter its name in `index_name`.



In [25]:
from langchain.vectorstores import Pinecone

# Name of the index you created in the Pinecone console.
index_name = "ind"

# Embed the chunks and store them in that index. The index itself is
# expected to exist already in Pinecone (see the text above).
docsearch = Pinecone.from_documents(
    documents=docs_chunks,
    embedding=embeddings,
    index_name=index_name,
)

# If the index is already populated, connect to it instead of re-uploading:
# docsearch = Pinecone.from_existing_index(index_name, embeddings)

## Vectorstore is ready
Let's try querying docsearch with a similarity search.

In [27]:
query = "How to get a visa for my partner if I have a highly skilled visa."

# Keep the search hits under their own name. Rebinding `docs` here would
# overwrite the pages loaded from the sitemap, so re-running the text-split
# cell afterwards would silently chunk the search results instead.
similar_docs = docsearch.similarity_search(query)

# Number of matches returned, followed by the best match.
print(len(similar_docs))
print(similar_docs[0])

4


## Making a question answering chain 
The question-answering chain will enable us to generate the answer based on the relevant context chunks.
Additionally, we can return the source documents used to answer the question by specifying an optional parameter when constructing the chain.

In [30]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Language model that writes the final answer; created with default settings.
llm = OpenAI()

In [None]:
# Retrieval-based QA chain on top of the vectorstore. With
# return_source_documents=True the result also carries the retrieved
# documents under the 'source_documents' key.
retriever = docsearch.as_retriever()
qa_with_response = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

query = "How to get a visa for my parents if I have a highly skilled visa"
result = qa_with_response({"query": query})

# Output the text answer that was found for the query
result["result"]

## Output source documents that were found for the query

In [None]:
# Display the documents stored under the 'source_documents' key of the chain
# result (the chain was built with return_source_documents=True).
result['source_documents']