In [91]:
from bs4 import BeautifulSoup
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import WebBaseLoader
import nest_asyncio

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [97]:
# Patch asyncio to allow nested event loops. Presumably needed so the async
# page fetch (loader.aload(), later in this notebook) can run inside Jupyter's
# already-running loop — TODO confirm.
nest_asyncio.apply()

In [3]:
# Small hand-written HTML sample used to demonstrate the BeautifulSoup calls in
# the next few cells. <body> and <html> are left unclosed in this string; the
# prettify() output further down shows the closing tags being emitted anyway.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [5]:
# Parse the sample markup with the 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:
# print() so the indented markup renders line by line; a bare expression would
# show the string's repr with escaped newlines instead.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [11]:
# soup.title yields the document's <title> tag (see the output below).
t = soup.title
t

<title>The Dormouse's story</title>

In [13]:
# get_text() drops the surrounding markup and returns only the tag's text.
t.get_text()

"The Dormouse's story"

In [15]:
# Collect every <a> tag in the sample document and display the result.
links = soup.find_all('a')
links

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [17]:
# Show the visible text of each anchor, one per line.
for link in links:
    print(link.get_text())

Elsie
Lacie
Tillie


In [21]:
# Target URL (href attribute) of each anchor, in document order.
addr = [link.get('href') for link in links]

# Requests

In [24]:
import requests

In [26]:
# Wikipedia page whose "votingmembers" table is scraped in the cells below.
url = "https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives"

In [28]:
# Fetch the page. The timeout keeps this cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns a 4xx/5xx reply into an
# immediate error rather than letting an error page be parsed further down.
html = requests.get(url, timeout=30)
html.raise_for_status()

In [30]:
# Display the Response object; its repr includes the HTTP status code.
html

<Response [200]>

In [34]:
# Body of the response, handed to BeautifulSoup in the next cell.
html_docs = html.content

In [36]:
# NOTE: this rebinds `soup`, replacing the Dormouse sample parsed earlier; from
# here on `soup` refers to the downloaded Wikipedia page.
soup = BeautifulSoup(html_docs, 'html.parser')

In [40]:
# Preview only the start of the document: printing the whole prettified page
# floods the cell output and bloats the saved notebook.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-enabled vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of current members of the United States House of Representatives - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinn

In [53]:
# Locate the members table by its element id.
table = soup.find(id="votingmembers")
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('No element with id="votingmembers" found; the page layout may have changed')
# Preview only the beginning; the full table has hundreds of rows.
print(table.prettify()[:2000])

<table class="wikitable sortable sticky-header" id="votingmembers">
 <tbody>
  <tr style="vertical-align:bottom">
   <th scope="col">
    District
   </th>
   <th scope="col">
    Member
   </th>
   <th colspan="2" scope="col">
    Party
   </th>
   <th scope="col">
    Prior experience
   </th>
   <th scope="col">
    Education
   </th>
   <th scope="col">
    Assumed office
   </th>
   <th scope="col">
    Residence
   </th>
   <th scope="col">
    Born
    <sup class="reference" id="cite_ref-born_7-0">
     <a href="#cite_note-born-7">
      <span class="cite-bracket">
       [
      </span>
      7
      <span class="cite-bracket">
       ]
      </span>
     </a>
    </sup>
   </th>
  </tr>
  <tr>
   <th scope="row">
    <span data-sort-value="Alabama01 !">
     <a href="/wiki/Alabama%27s_1st_congressional_district" title="Alabama's 1st congressional district">
      Alabama 1
     </a>
    </span>
   </th>
   <td data-sort-value="Carl">
    <span typeof="mw:File">
     <a class="

In [101]:
# Site origin, prepended to the relative hrefs ("/wiki/...") found in the table.
pref = "https://en.wikipedia.org"

In [103]:
# All rows of the table body; [1:] skips the first row, which holds the column
# headers (District, Member, Party, ...).
trs = table.find("tbody").find_all("tr")[1:]

In [105]:
# Build the absolute URL of the page linked from the first <b> element of each
# row. Presumably the bold link is the member's name and rows without one are
# vacant seats — TODO confirm against the page.
replinks = []
for row in trs:
    bold = row.find("b")
    if bold is None:
        continue
    anchor = bold.find("a")
    # Skip bold text that is not a link (or a link without an href) rather than
    # crashing on None.
    if anchor is None or not anchor.get("href"):
        continue
    replinks.append(pref + anchor.get("href"))

In [107]:
# Sanity check: how many links were collected.
len(replinks)

432

In [119]:
# Only the first two collected links are loaded here (replinks[:2]), which keeps
# this cell quick; widen the slice to fetch more pages.
loader = WebBaseLoader(replinks[:2])
# Rate limit for the fetch — presumably caps outgoing requests per second so
# the site is not hammered; verify against the WebBaseLoader docs.
loader.requests_per_second = 2
# NOTE(review): despite the name, the result is used directly (not awaited) and
# is later passed to split_documents(), so aload() appears to return the
# loaded documents synchronously — confirm for the installed version.
docs = loader.aload()


Fetching pages: 100%|#############################| 2/2 [00:00<00:00, 18.15it/s][A


In [123]:
from tqdm.autonotebook import tqdm, trange

In [127]:
# Splitter configured for chunks of at most 1500 units with a 150-unit overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Embedding model identified by name.
embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Break the loaded pages into chunks.
splits = text_splitter.split_documents(docs)