In [32]:
from supabase.client import Client, create_client
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import AsyncChromiumLoader
from langchain.vectorstores import SupabaseVectorStore
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer
import requests
import json
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import ast
from dotenv import load_dotenv
import os
from typing import List

In [33]:
load_dotenv()

SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")

prompt = """You are a highly advanced website scraper bot that understands phone numbers, emails and URLs.\
          Using the context below, perform the following task and return the output in JSON format with the keys: phone_number, email\
          Do not hallucinate anything or invent any response. Strictly follow the JSON format. If you don't find the value to the key, return None.\
          Context Information:\
          {context}\
          Task Instruction:\
        {question}"""



In [34]:
def initialize_supabase_db():
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
    return supabase


In [35]:
def scraper_with_playwright(url):
    # loader = AsyncChromiumLoader(url)
    loader = AsyncHtmlLoader(url, default_parser="html5lib")
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs, tags_to_extract=["header", "p", "li", "div", "a", "footer"]
    )
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0
    )
    docs = splitter.split_documents(docs_transformed)
    return docs

In [36]:
def create_supabase_vectorstore(embeddings: OpenAIEmbeddings, docs):
    vector_store = SupabaseVectorStore.from_documents(
        docs,
        embedding = embeddings,
        client=initialize_supabase_db(),
        table_name="documents",
        query_name="match_documents",
    )
    return vector_store

In [37]:
def contact_info_scraper_using_llm(prompt: str, docs):
    llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    vector_retriever = create_supabase_vectorstore(embeddings, docs)

    prompt_template = PromptTemplate(template=prompt,
                                     input_variables=["question", "context"])
    question = "Extract the contact number and email address"
    qa = RetrievalQA.from_chain_type(
        llm = llm,
        chain_type = 'stuff',
        retriever = vector_retriever.as_retriever(),
        chain_type_kwargs = {'prompt': prompt_template}
    )
    #fetch the query instruction from the prompts file
    res = qa({'query': question})
    return res

In [38]:
def get_google_search_results(query, filters, num_results):
    # Make a GET request to the Google Custom JSON Search API
    for filter in filters:
        query = query + " -inurl:" + filter
    url = "https://google.serper.dev/search"
    payload = json.dumps({
    "q": query,
    "num": num_results
    })
    headers = {
    'X-API-KEY': SERPER_API_KEY,
    'Content-Type': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    # Parse the JSON response
    data = ast.literal_eval(response.content.decode())

    # Extract the URLs of the search results
    urls = []
    for item in data["organic"]:
        url = item["link"]
        urls.append(url)

    return urls

In [39]:
def scrape(query: str, exceptionFilters: List[str], num_results: int):
    client = initialize_supabase_db()
    urls = get_google_search_results(query, exceptionFilters, num_results)
    print("URL: ", urls)
    data = []
    for url in urls:
        docs = scraper_with_playwright([url])
        results = contact_info_scraper_using_llm(prompt, docs)
        client.table('documents').delete().neq('id', '00000000-0000-0000-0000-000000000000').execute()
        res = json.loads(results['result'])
        res['url'] = url
        data.append(res)
    return data

In [40]:
query = "Villas in Santa Teresa Costa Rica"
exceptionFilters = ['vrbo', 'trivago', 'airbnb', 'booking.com', 'hotels.com', 'expedia', 'tripadvisor']
query_num = 10

data = scrape(query, exceptionFilters, query_num)
print(data)

URL:  ['https://www.santateresaluxury.com/', 'https://www.luxuryvillasincostarica.com/mal-pais-santa-teresa/', 'https://villassantateresa.com/', 'https://www.tripadvisor.com/Hotel_Review-g635538-d1977802-Reviews-Villas_Santa_Teresa-Santa_Teresa_Province_of_Puntarenas.html', 'https://casateresacr.com/', 'https://www.joyavillascostarica.com/', 'https://www.i-escape.com/santa-teresa-luxury-villas', 'https://calavacationhomes.com/', 'https://www.tripadvisor.com/Hotels-g635538-c3-zff22-Santa_Teresa_Province_of_Puntarenas-Hotels.html', 'https://www.vacationscostarica.com/santa-teresa/rentals/']


Fetching pages: 100%|##########| 1/1 [00:02<00:00,  2.14s/it]
2023-12-18 13:59:44,327:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 13:59:45,383:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 13:59:45,821:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 13:59:45,999:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 13:59:48,418:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "+506 8683 1230",\n  "email": "info@santateresaluxury.com"\n}'}


2023-12-18 13:59:48,823:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.80s/it]
2023-12-18 13:59:53,846:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 13:59:54,665:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 13:59:55,173:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 13:59:55,291:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 13:59:57,018:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-18 13:59:57,222:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 20

{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "+1 (754) 223 0031",\n  "email": null\n}'}


Fetching pages: 100%|##########| 1/1 [00:03<00:00,  3.47s/it]
2023-12-18 14:00:05,229:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:05,532:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:06,029:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:06,175:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:08,182:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "+506 8494 5273",\n  "email": "santateresavillas@gmail.com"\n}'}


2023-12-18 14:00:08,589:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.89s/it]
2023-12-18 14:00:12,892:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:13,837:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:14,311:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:14,444:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:16,241:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": null,\n  "email": null\n}'}


2023-12-18 14:00:16,453:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.61s/it]
2023-12-18 14:00:21,011:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:21,695:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:22,145:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:22,289:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:24,259:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "+50688319711",\n  "email": "inquiries@casateresacr.com"\n}'}


2023-12-18 14:00:24,470:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.19it/s]
2023-12-18 14:00:26,921:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:27,255:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:27,600:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:27,756:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:29,344:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-18 14:00:29,443:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 20

{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "+1 310 489 6504",\n  "email": null\n}'}


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.10s/it]
2023-12-18 14:00:32,656:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:33,180:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:35,113:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:35,244:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:37,593:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "+44 (0) 117 946 7072",\n  "email": "help@i-escape.com"\n}'}


2023-12-18 14:00:37,819:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.84s/it]
2023-12-18 14:00:42,589:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:43,035:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:43,611:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:43,719:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:46,415:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "+506 8854 5858",\n  "email": "info@calavacationhomes.com"\n}'}


2023-12-18 14:00:46,661:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.35s/it]
2023-12-18 14:00:50,281:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:51,151:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:51,702:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:51,835:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:53,442:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-18 14:00:53,634:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 20

{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": null,\n  "email": null\n}'}


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.63it/s]
2023-12-18 14:00:55,797:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:56,047:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents "HTTP/1.1 201 Created"
2023-12-18 14:00:56,404:INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2023-12-18 14:00:56,512:INFO - HTTP Request: POST https://weflohyplshcasjyzpqa.supabase.co/rest/v1/rpc/match_documents?limit=4 "HTTP/1.1 200 OK"
2023-12-18 14:00:58,233:INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-12-18 14:00:58,330:INFO - HTTP Request: DELETE https://weflohyplshcasjyzpqa.supabase.co/rest/v1/documents?id=neq.00000000-0000-0000-0000-000000000000 "HTTP/1.1 200 OK"


{'query': 'Extract the contact number and email address', 'result': '{\n  "phone_number": "(800) 262-1578",\n  "email": null\n}'}
[{'phone_number': '+506 8683 1230', 'email': 'info@santateresaluxury.com', 'url': 'https://www.santateresaluxury.com/'}, {'phone_number': '+1 (754) 223 0031', 'email': None, 'url': 'https://www.luxuryvillasincostarica.com/mal-pais-santa-teresa/'}, {'phone_number': '+506 8494 5273', 'email': 'santateresavillas@gmail.com', 'url': 'https://villassantateresa.com/'}, {'phone_number': None, 'email': None, 'url': 'https://www.tripadvisor.com/Hotel_Review-g635538-d1977802-Reviews-Villas_Santa_Teresa-Santa_Teresa_Province_of_Puntarenas.html'}, {'phone_number': '+50688319711', 'email': 'inquiries@casateresacr.com', 'url': 'https://casateresacr.com/'}, {'phone_number': '+1 310 489 6504', 'email': None, 'url': 'https://www.joyavillascostarica.com/'}, {'phone_number': '+44 (0) 117 946 7072', 'email': 'help@i-escape.com', 'url': 'https://www.i-escape.com/santa-teresa-luxu

In [41]:
data

[{'phone_number': '+506 8683 1230',
  'email': 'info@santateresaluxury.com',
  'url': 'https://www.santateresaluxury.com/'},
 {'phone_number': '+1 (754) 223 0031',
  'email': None,
  'url': 'https://www.luxuryvillasincostarica.com/mal-pais-santa-teresa/'},
 {'phone_number': '+506 8494 5273',
  'email': 'santateresavillas@gmail.com',
  'url': 'https://villassantateresa.com/'},
 {'phone_number': None,
  'email': None,
  'url': 'https://www.tripadvisor.com/Hotel_Review-g635538-d1977802-Reviews-Villas_Santa_Teresa-Santa_Teresa_Province_of_Puntarenas.html'},
 {'phone_number': '+50688319711',
  'email': 'inquiries@casateresacr.com',
  'url': 'https://casateresacr.com/'},
 {'phone_number': '+1 310 489 6504',
  'email': None,
  'url': 'https://www.joyavillascostarica.com/'},
 {'phone_number': '+44 (0) 117 946 7072',
  'email': 'help@i-escape.com',
  'url': 'https://www.i-escape.com/santa-teresa-luxury-villas'},
 {'phone_number': '+506 8854 5858',
  'email': 'info@calavacationhomes.com',
  'url