In [1]:
from dotenv import load_dotenv
import os 
import warnings
# Configuration: load variables from a .env file (if one is found), silence
# warnings for the session, then read the credentials the notebook needs.
load_dotenv()
warnings.filterwarnings("ignore")
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

In [10]:
# Configurations
# nest_asyncio is imported locally because this cell calls it while the
# notebook's main import cell sits further down; without this import a fresh
# top-to-bottom run stops here with a NameError.
import nest_asyncio

load_dotenv()
gemini_api_key = os.getenv("GOOGLE_API_KEY")
warnings.filterwarnings("ignore")
nest_asyncio.apply()
qdrant_api_key = os.getenv("QDRANT_API_KEY")
qdrant_url = os.getenv("QDRANT_URL")

In [2]:
import os
import getpass
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Qdrant
import qdrant_client
import nest_asyncio
import json
import warnings
import re
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv


In [4]:
# Travel-guide pages to scrape, one per European city.
EU_tourism_URLs = [
    "https://www.planetware.com/tourist-attractions-/madrid-e-mad-mad.htm",
    "https://www.planetware.com/tourist-attractions-/barcelona-e-cat-bar.htm",
    "https://www.planetware.com/tourist-attractions-/milan-i-lo-m.htm",
    "https://www.planetware.com/tourist-attractions-/monaco-mc-mc-mon.htm",
    "https://www.trawell.in/blog/stunning-places-to-visit-in-berlin",
    "https://www.trawell.in/blog/popular-tourist-places-in-london",
    "https://www.trawell.in/blog/most-beautiful-places-to-visit-in-paris",
    "https://www.planetware.com/tourist-attractions-/prague-cz-pr-p.htm",
]

In [12]:
# Gemini embedding model used to vectorise the scraped text chunks.
# No key is passed explicitly here, so the client must locate its own
# credentials (presumably the GOOGLE_API_KEY environment variable).
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Gemini chat model. The keyword must be `google_api_key`:
# ChatGoogleGenerativeAI has no `gemini_api_key` parameter, so a key passed
# under that name never reaches the client.
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=gemini_api_key)

In [6]:
# Web scraping loader
# Patch asyncio so an event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop and the Chromium loader below drives its own — confirm.
nest_asyncio.apply()

def webscrapper(
    urls: list,
    tags_to_extract=("p", "h2", "span"),
    chunk_size: int = 3000,
    chunk_overlap: int = 0,
):
    """Download web pages, keep the text of selected HTML tags, and chunk it.

    Args:
        urls: Page URLs to load with ``AsyncChromiumLoader``.
        tags_to_extract: HTML tag names whose content is kept by the
            BeautifulSoup transformer. Defaults to paragraphs, second-level
            headings and spans.
        chunk_size: Chunk size handed to the tiktoken-based text splitter.
        chunk_overlap: Overlap between consecutive chunks, in the same unit
            as ``chunk_size``.

    Returns:
        The list of chunked documents produced by the splitter, or an empty
        list when ``urls`` is empty.
    """
    # Nothing to fetch: skip the loader and splitter entirely.
    if not urls:
        return []

    # Load the raw HTML of every URL.
    loader = AsyncChromiumLoader(urls)
    raw_docs = loader.load()

    # Reduce each page to the text found inside the requested tags.
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        raw_docs, tags_to_extract=list(tags_to_extract)
    )

    # Split the cleaned text into chunks sized by token count.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    print('\n>Splitting documents into chunks')
    return splitter.split_documents(docs_transformed)

In [8]:
# Create Qdrant Collection to store vector embeddings
def qdrant_collection(text_chunks, embedding_model, collection_name, vector_size=768):
    """(Re)create a Qdrant collection and fill it with embedded text chunks.

    The collection is dropped and recreated on every call so the function can
    be run repeatedly; any data previously stored under ``collection_name``
    is lost.

    Args:
        text_chunks: Documents to embed and store (for example the chunks
            returned by ``webscrapper``).
        embedding_model: Embedding model passed to ``Qdrant.from_documents``.
        collection_name: Name of the Qdrant collection to recreate and fill.
        vector_size: Dimensionality of the stored vectors. Must match the
            output size of ``embedding_model``; defaults to 768.

    Returns:
        The ``Qdrant`` vector store built from ``text_chunks``.

    Raises:
        ValueError: If ``text_chunks`` is empty. This is checked before the
            collection is recreated, so an empty scrape cannot wipe existing
            data.
    """
    if not text_chunks:
        raise ValueError("text_chunks is empty; nothing to store in Qdrant")

    print("> Creating QdrantDB connection")
    # Create a Qdrant client; URL and API key come from the notebook config.
    client = qdrant_client.QdrantClient(
        qdrant_url,
        api_key=qdrant_api_key
    )
    print("> Qdrant connection established.")

    # Vector layout of the collection: embedding dimensionality + metric.
    vectors_config = qdrant_client.http.models.VectorParams(
        size=vector_size,
        distance=qdrant_client.http.models.Distance.COSINE
    )

    # Recreate (rather than create) so the cell can be run multiple times.
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=vectors_config
    )

    # Embed the chunks and save them in Qdrant.
    qdrant = Qdrant.from_documents(
        text_chunks,
        embedding_model,
        url=qdrant_url,
        api_key=qdrant_api_key,
        prefer_grpc=True,
        collection_name=collection_name
    )
    print("> Chunk of text saved in Qdrant DB")
    return qdrant

In [13]:
# Build the knowledge base: scrape the listed pages into chunks, then embed
# the chunks and store them in the 'europe-tour' Qdrant collection.
print("WEB SCRAPPING AND VECTOR EMBEDDING PROCESS BEGINS")
# Despite its name, `docs` holds the split chunks returned by webscrapper.
docs = webscrapper(EU_tourism_URLs)
qdrant_collection(docs, embeddings, collection_name='europe-tour')

USER_AGENT environment variable not set, consider setting it to identify your requests.


WEB SCRAPPING AND VECTOR EMBEDDING PROCESS BEGINS


I0000 00:00:1727807410.509360 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1727807412.753926 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1727807414.418227 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1727807415.683293 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1727807416.982601 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1727807421.287915 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1727807425.900262 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1727807429.612950 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, s


>Splitting documents into chunks
> Creating QdrantDB connection
>
Qdrant connection established.
> Chunk of text saved in Qdrant DB


In [16]:
from langchain_community.vectorstores import Qdrant
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os
import textwrap
import qdrant_client
import streamlit as st
import random
from PIL import Image

# Configuration: load variables from a .env file (if one is found), then read
# the Qdrant endpoint and the API credentials from the environment.
load_dotenv()
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
gemini_api_key = os.environ.get("GOOGLE_API_KEY")

In [18]:
# System prompt: fixed behavioural instructions placed ahead of every query.
sys_prompt = """
    You are helpful tourism search engine. Give information about that places by given query. Always answer as helpful and as relevant
    as possible. While being informative. Keep answer length about 100-200 words. 
    If you don't know the answer to a question, please don't share false information.    
"""

# Per-query template filled with the retrieved context and the user question.
# Uses real newline escapes (`\n`); a forward slash (`/n`) would be sent to
# the model as the literal characters "/n".
instruction = """CONTEXT:\n\n {context}\n
Query: {question}
"""

In [24]:
def get_prompt(sys_prompt, instruction):
    """Build the full prompt template: ``sys_prompt`` followed by ``instruction``.

    Argument order matters: the first argument ends up at the start of the
    returned template.
    """
    return sys_prompt + instruction

In [25]:
# Get vector store in action
def get_vector_store(collection_name='europe-tour',
                     embedding_model="models/embedding-001"):
    """Open an existing Qdrant collection as a LangChain vector store.

    Args:
        collection_name: Qdrant collection to read from. Defaults to
            'europe-tour', the collection filled by the ingestion step.
        embedding_model: Name of the Google Generative AI embedding model
            used to embed queries. It should be the same model the
            collection was populated with.

    Returns:
        A ``Qdrant`` vector store bound to ``collection_name``.
    """
    # Connect to Qdrant using the URL and API key read from the environment.
    client = qdrant_client.QdrantClient(
        qdrant_url,
        api_key=qdrant_api_key
    )

    # Embedding model used to vectorise incoming queries.
    embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)

    # Vector store used for retrieval.
    return Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=embeddings
    )

In [26]:
def wrap_text_preserve_newlines(text, width=110):
    """Re-flow ``text`` to at most ``width`` columns without merging its lines.

    Every newline-separated line is wrapped on its own with ``textwrap.fill``
    and the pieces are joined back with newlines, so the original line breaks
    (blank lines included) survive the wrapping.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

In [27]:
def process_llm_response(llm_response):
    """Split a RetrievalQA response into display text and unique source URLs.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            list whose items expose ``metadata['source']``.

    Returns:
        A ``(text, sources)`` tuple: the answer wrapped for display, and the
        de-duplicated sources in the order the documents were returned.
    """
    # Wrap the answer text for display.
    text = wrap_text_preserve_newlines(llm_response['result'])

    # De-duplicate with dict.fromkeys rather than set() so the order is stable
    # between runs and follows the order of the returned documents.
    unique_sources = dict.fromkeys(
        doc.metadata['source'] for doc in llm_response["source_documents"]
    )
    return text, list(unique_sources)

In [28]:
if __name__ == '__main__':
    # Define vector store
    vector_store = get_vector_store()

    # Using Gemini-Pro. The key must be passed as `google_api_key`;
    # ChatGoogleGenerativeAI has no `gemini_api_key` parameter.
    llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=gemini_api_key,
                                 temperature=0.3,
                                 max_tokens=1024,
                                 convert_system_message_to_human=True)

    # Generate Prompt Template. get_prompt concatenates its arguments in
    # order, so the system prompt goes first and the context/query
    # instruction second.
    prompt_template = get_prompt(sys_prompt, instruction)
    QA_prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    # Create Retrieval Chain: fetch the 3 closest chunks and stuff them into
    # the prompt's {context} slot.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,  # Get source
        chain_type_kwargs={"prompt": QA_prompt}
    )

    # Set Streamlit UI
    st.set_page_config(page_title="AI Tour Assistant")

    st.markdown("# AI Europe-Tour Assistant")

    image = Image.open('europe_banner.jpg')
    st.image(image, caption='by Karan Shingde', use_column_width=True)

    st.header("Tell us about your dream Europe destination?")

    # Create text box so user can write query
    user_question = st.text_input("What place would you love to explore?")
    if user_question:
        llm_res = qa_chain.invoke(user_question)  # Generate response
        # Wrap the answer text and collect the unique source URLs
        response, sources = process_llm_response(llm_res)
        st.write()
        st.write()
        st.markdown("### Based on your search:")
        st.write(f"{response}")
        st.markdown("##### Source: ")
        for source in sources:  # Display source URLs
            # Plain Markdown link syntax; raw HTML rendering is not needed
            # and stays disabled because the value comes from stored metadata.
            st.markdown(f"[{source}]({source})")

2024-10-01 14:34:54.308 
  command:

    streamlit run /Users/chaitanya/Documents/ML/github/AI-Travel-Assistant/env/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-10-01 14:34:54.478 Session state does not function when running a script without `streamlit run`


In [32]:
pip install langchain-qdrant


I0000 00:00:1727808989.811103 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


Note: you may need to restart the kernel to use updated packages.


In [37]:
pip show langchain-qdrant


I0000 00:00:1727809116.970857 2388306 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


Name: langchain-qdrant
Version: 0.1.4
Summary: An integration package connecting Qdrant and LangChain
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/chaitanya/Documents/ML/github/AI-Travel-Assistant/env/lib/python3.12/site-packages
Requires: langchain-core, pydantic, qdrant-client
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain_qdrant import Qdrant
