## Importing the libraries

In [1]:
from supabase.client import Client, create_client
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import AsyncChromiumLoader
from langchain.vectorstores import SupabaseVectorStore
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer
import requests
import json
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import ast
from dotenv import load_dotenv
import os
from typing import List

## Defining the prompt and the environment variables

In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Credentials are read from the environment so no key is hard-coded in the notebook.
# os.environ.get returns None when a variable is unset.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")

# Prompt template with {context} and {question} placeholders.
# The trailing backslashes join the physical lines, so the template is one line of text.
# The model's reply is parsed with json.loads, so a missing value must be requested
# as JSON `null`; a Python-style `None` is not valid JSON.
prompt = """You are a highly advanced website scraper bot that understands phone numbers, emails and URLs.\
          Using the context below, perform the following task and return the output in JSON format with the keys: phone_number, email\
          Do not hallucinate anything or invent any response. Strictly follow the JSON format. If you don't find the value to the key, return null.\
          Context Information:\
          {context}\
          Task Instruction:\
        {question}"""



## Initializing the Supabase client and vector store

In [3]:
def initialize_supabase_db():
    """Create a Supabase client from ``SUPABASE_URL`` and ``SUPABASE_KEY``.

    Returns:
        The object returned by ``create_client``; a new one is built on every call.
    """
    return create_client(SUPABASE_URL, SUPABASE_KEY)


In [5]:
def create_supabase_vectorstore(embeddings: OpenAIEmbeddings, docs):
    """Index ``docs`` in Supabase through ``SupabaseVectorStore.from_documents``.

    Args:
        embeddings: Embedding model handed to the vector store.
        docs: Documents to index.

    Returns:
        The vector store returned by ``from_documents``, bound to the
        ``documents`` table and the ``match_documents`` query.
    """
    store_options = {
        "embedding": embeddings,
        "client": initialize_supabase_db(),
        "table_name": "documents",
        "query_name": "match_documents",
    }
    return SupabaseVectorStore.from_documents(docs, **store_options)

## Initializing the HTML page loader and document transformer

In [4]:
def scraper_with_AsyncHTMLLoader(url):
    """Load web pages, keep the text of selected tags, and split it into chunks.

    Args:
        url: Passed unchanged to ``AsyncHtmlLoader``; ``scrape`` supplies a
            one-element list of URLs.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    content_tags = ["header", "p", "li", "div", "a", "footer"]

    raw_pages = AsyncHtmlLoader(url, default_parser="html5lib").load()
    cleaned_pages = BeautifulSoupTransformer().transform_documents(
        raw_pages, tags_to_extract=content_tags
    )

    # Chunk size is measured by the tiktoken encoder; chunks do not overlap.
    chunker = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0
    )
    return chunker.split_documents(cleaned_pages)

## Defining the LLM and retriever, and extracting the contact information using RAG

In [6]:
def contact_info_scraper_using_llm(prompt: str, docs):
    """Run a retrieval-augmented query asking for the contact details in ``docs``.

    Args:
        prompt: Template text with ``{context}`` and ``{question}`` placeholders.
        docs: Documents to index and retrieve from.

    Returns:
        Whatever the ``RetrievalQA`` chain returns for the query; ``scrape``
        reads its ``'result'`` entry.
    """
    api_key = OPENAI_API_KEY
    chat_model = ChatOpenAI(temperature=0, openai_api_key=api_key)
    embedder = OpenAIEmbeddings(openai_api_key=api_key)

    # The documents are indexed first so the chain has something to retrieve from.
    vector_store = create_supabase_vectorstore(embedder, docs)

    extraction_prompt = PromptTemplate(
        template=prompt, input_variables=["question", "context"]
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        chain_type_kwargs={"prompt": extraction_prompt},
    )
    return qa_chain({"query": "Extract the contact number and email address"})

## Querying Google search results through the Serper API and extracting the URLs

In [7]:
def get_google_search_results(query, filters, num_results):
    """Search Google through the Serper API and return the organic result URLs.

    Args:
        query: Search phrase.
        filters: Terms to exclude; each is appended to the query as a
            ``-inurl:<term>`` operator.
        num_results: Number of results to request (sent as ``num``).

    Returns:
        The ``link`` of every entry under ``"organic"`` in the response, in
        response order. An empty list if the response has no ``"organic"`` key.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        ValueError: If the response body is not valid JSON.
    """
    exclusions = "".join(f" -inurl:{term}" for term in filters)
    payload = json.dumps({"q": query + exclusions, "num": num_results})
    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json",
    }

    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.post(
        "https://google.serper.dev/search",
        headers=headers,
        data=payload,
        timeout=30,
    )
    # Fail on bad keys / quota errors instead of treating them as "no results".
    response.raise_for_status()

    # The body is JSON, not a Python literal: true/false/null must be decoded
    # by a JSON parser.
    data = response.json()

    return [item["link"] for item in data.get("organic", [])]

## Scrape function to take the user query and extract the contact information

In [8]:
def _parse_llm_json(raw: str) -> dict:
    """Turn the model's reply into a dict, tolerating common formatting slips."""
    text = raw.strip()
    # Keep only the outermost {...} so Markdown fences or prose around it are ignored.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    try:
        return json.loads(text)
    except json.JSONDecodeError as json_error:
        # Models sometimes answer with a Python literal (None, single quotes).
        # literal_eval only accepts literals, so it is safe on untrusted text.
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError):
            raise json_error from None


def scrape(query: str, exceptionFilters: List[str], num_results: int):
    """Search for ``query`` and extract contact details from every result page.

    Args:
        query: Search phrase.
        exceptionFilters: Terms to exclude from the search results.
        num_results: Number of search results to request.

    Returns:
        One dict per result URL: the keys parsed from the model's reply plus
        ``'url'``.

    Raises:
        json.JSONDecodeError: If a reply cannot be parsed as JSON or as a
            Python literal.
    """
    client = initialize_supabase_db()
    urls = get_google_search_results(query, exceptionFilters, num_results)
    print("URL: ", urls)
    data = []
    for url in urls:
        docs = scraper_with_AsyncHTMLLoader([url])
        try:
            results = contact_info_scraper_using_llm(prompt, docs)
        finally:
            # Trick to delete all records from the documents table. It runs even
            # when the LLM call fails, so one page's chunks never leak into the
            # next page's retrieval.
            client.table('documents').delete().neq('id', '00000000-0000-0000-0000-000000000000').execute()
        res = _parse_llm_json(results['result'])
        res['url'] = url
        data.append(res)
    return data

In [9]:
# Search phrase, the terms to exclude from result URLs, and the number of results to request.
query = "Villas in Santa Teresa Costa Rica"
exceptionFilters = [
    'vrbo',
    'trivago',
    'airbnb',
    'booking.com',
    'hotels.com',
    'expedia',
    'tripadvisor',
]
query_num = 10

# Run the full pipeline and show the collected records.
data = scrape(query=query, exceptionFilters=exceptionFilters, num_results=query_num)
print(data)

URL:  ['https://www.santateresaluxury.com/', 'https://www.luxuryvillasincostarica.com/mal-pais-santa-teresa/', 'https://villassantateresa.com/', 'https://casateresacr.com/', 'https://www.joyavillascostarica.com/', 'https://www.i-escape.com/santa-teresa-luxury-villas', 'https://calavacationhomes.com/', 'https://www.vacationscostarica.com/santa-teresa/rentals/', 'https://www.seataya.com/', 'https://www.selvaresort.com/']


Fetching pages: 100%|##########| 1/1 [00:04<00:00,  4.49s/it]
2023-12-19 13:50:16,414:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-19 13:50:18,373:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-19 13:50:18,907:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-19 13:50:19,053:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-19 13:50:20,962:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-19 13:50:21,221:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"
Fetching pages: 100%|##########| 1/1 [00:02<00:00,  2.93s/it]
2023-12-19 13:50:27,922:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[{'phone_number': '+506 8683 1230', 'email': 'info@santateresaluxury.com', 'url': 'https://www.santateresaluxury.com/'}, {'phone_number': '+1 (754) 223 0031', 'email': None, 'url': 'https://www.luxuryvillasincostarica.com/mal-pais-santa-teresa/'}, {'phone_number': '+506 8494 5273', 'email': 'santateresavillas@gmail.com', 'url': 'https://villassantateresa.com/'}, {'phone_number': '+50688319711', 'email': 'inquiries@casateresacr.com', 'url': 'https://casateresacr.com/'}, {'phone_number': '+1 310 489 6504', 'email': None, 'url': 'https://www.joyavillascostarica.com/'}, {'phone_number': '+44 (0) 117 946 7072', 'email': 'help@i-escape.com', 'url': 'https://www.i-escape.com/santa-teresa-luxury-villas'}, {'phone_number': '+506 8854 5858', 'email': 'info@calavacationhomes.com', 'url': 'https://calavacationhomes.com/'}, {'phone_number': '(800) 262-1578', 'email': None, 'url': 'https://www.vacationscostarica.com/santa-teresa/rentals/'}, {'phone_number': None, 'email': None, 'url': 'https://www.s

In [10]:
# Display the collected contact records as this cell's output.
data

[{'phone_number': '+506 8683 1230',
  'email': 'info@santateresaluxury.com',
  'url': 'https://www.santateresaluxury.com/'},
 {'phone_number': '+1 (754) 223 0031',
  'email': None,
  'url': 'https://www.luxuryvillasincostarica.com/mal-pais-santa-teresa/'},
 {'phone_number': '+506 8494 5273',
  'email': 'santateresavillas@gmail.com',
  'url': 'https://villassantateresa.com/'},
 {'phone_number': '+50688319711',
  'email': 'inquiries@casateresacr.com',
  'url': 'https://casateresacr.com/'},
 {'phone_number': '+1 310 489 6504',
  'email': None,
  'url': 'https://www.joyavillascostarica.com/'},
 {'phone_number': '+44 (0) 117 946 7072',
  'email': 'help@i-escape.com',
  'url': 'https://www.i-escape.com/santa-teresa-luxury-villas'},
 {'phone_number': '+506 8854 5858',
  'email': 'info@calavacationhomes.com',
  'url': 'https://calavacationhomes.com/'},
 {'phone_number': '(800) 262-1578',
  'email': None,
  'url': 'https://www.vacationscostarica.com/santa-teresa/rentals/'},
 {'phone_number': No