In [2]:
!python -m venv .venv

In [7]:
!source .venv/bin/activate

In [8]:
!pip install bs4 PyPDFium2

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1256 sha256=7dec7e11376df5fe3194e5f602bcda8d72b233bbf7fc0dde64cf441a9b6ea979
  Stored in directory: /tmp/pip-ephem-wheel-cache-wiy61v4j/wheels/d4/c8/5b/b5be9c20e5e4503d04a6eac8a3cd5c2393505c29f02bea0960
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [20]:
import requests
from bs4 import BeautifulSoup

def get_url_paths(url, ext='', params={}):
    response = requests.get(url, params=params)
    if response.ok:
        response_text = response.text
    else:
        return response.raise_for_status()
    soup = BeautifulSoup(response_text, 'html.parser')
    parent = [url + node.get('href') for node in soup.find_all('a') if node.get('href').endswith(ext)]
    return parent


In [21]:
url = 'http://xxx.xxx.xxx.xxx/pdf/demo'
ext = 'pdf'
result = get_url_paths(url, ext)
print(result)

['http://xxx.xxx.xxx.xxx/pdf/demoH19198-sql-po3-dg.pdf', 'http://xxx.xxx.xxx.xxx/pdf/demoH19353-si-sql-wp.pdf', 'http://xxx.xxx.xxx.xxx/pdf/demoH19653-sql-azure-wp.pdf', 'http://xxx.xxx.xxx.xxx/pdf/demoh17857-sql-containers-linux-wp.pdf', 'http://xxx.xxx.xxx.xxx/pdf/demoh18350-sql-big-vxrail-wp.pdf', 'http://xxx.xxx.xxx.xxx/pdf/demoh19332-sql-bp22-wp.pdf', 'http://xxx.xxx.xxx.xxx/pdf/demoh19462-sqlserver-objectstorage-dellstack-dg.pdf', 'http://xxx.xxx.xxx.xxx/pdf/demovxflex_sql_000049.pdf']


# Working with Embeddings and redis

In [28]:
redis_url = "redis://default:mydocpass@my-doc-headless.redisdb.svc.cluster.local:17073"
index_name = "pdfdemodocs"
url = 'http://xxx.xxx.xxx.xxx/pdf/demo'
ext = 'pdf'

In [29]:
#from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFium2Loader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.redis import Redis

In [30]:
from bs4 import BeautifulSoup
import requests

def listFD(url, ext=''):
    page = requests.get(url).text
    #print(page)
    soup = BeautifulSoup(page, 'html.parser')
    return [url + '/' + node.get('href') for node in soup.find_all('a') if node.get('href').endswith(ext)]


In [31]:
for file in listFD(url, ext):
    print(file)

http://xxx.xxx.xxx.xxx/pdf/demo/H19198-sql-po3-dg.pdf
http://xxx.xxx.xxx.xxx/pdf/demo/H19353-si-sql-wp.pdf
http://xxx.xxx.xxx.xxx/pdf/demo/H19653-sql-azure-wp.pdf
http://xxx.xxx.xxx.xxx/pdf/demo/h17857-sql-containers-linux-wp.pdf
http://xxx.xxx.xxx.xxx/pdf/demo/h18350-sql-big-vxrail-wp.pdf
http://xxx.xxx.xxx.xxx/pdf/demo/h19332-sql-bp22-wp.pdf
http://xxx.xxx.xxx.xxx/pdf/demo/h19462-sqlserver-objectstorage-dellstack-dg.pdf
http://xxx.xxx.xxx.xxx/pdf/demo/vxflex_sql_000049.pdf


In [32]:
from langchain.document_loaders import PyPDFium2Loader

In [34]:
for file in listFD(url, ext):
    loader = PyPDFium2Loader(file)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=40)
    all_splits = text_splitter.split_documents(data)
    embeddings = HuggingFaceEmbeddings()
    rds = Redis.from_documents(all_splits,
                               embeddings,
                               redis_url=redis_url,
                               index_name=index_name)
    #rds.write_schema("pdfdemodocs_redis_schema.yaml")
    

In [35]:
rds.write_schema("pdfdemodocs_redis_schema.yaml")

# Querying a Redis index

In [36]:
redis_url = "redis://default:mydocpass@my-doc-headless.redisdb.svc.cluster.local:17073"
index_name = "pdfdemodocs"
schema_name = "pdfdemodocs_redis_schema.yaml"

In [37]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.redis import Redis

In [38]:
embeddings = HuggingFaceEmbeddings()
read_rds = Redis.from_existing_index(embeddings,
                                    redis_url=redis_url,
                                    index_name=index_name,
                                    schema=schema_name)

In [39]:
query="what is polybase?"
results =read_rds.similarity_search(query, k=4, return_metadata=True)
for result in results:
    print(result.metadata['source'])

/tmp/tmpmpnnn09y/tmp.pdf
/tmp/tmpyh4dwpnz/tmp.pdf
/tmp/tmpaw61f27z/tmp.pdf
/tmp/tmpyh4dwpnz/tmp.pdf


### Work with a retriever

In [40]:
retriever = read_rds.as_retriever(search_type="similarity_distance_threshold", search_kwargs={"k": 4, "distance_threshold": 2})

In [41]:
docs = retriever.get_relevant_documents(query)
docs

[Document(page_content='meaning that each shard contains a single copy of some data. Sharding improves \r\nperformance by enabling multiple data reads in parallel across multiple database \r\ninstances, thus reducing the time that is required to complete a query. \r\nSQL Server 2019 uses data virtualization to connect disparate data sources, enabling \r\nreporting and analytics without the need for an ETL process to assemble the data in a \r\ncommon data warehouse schema. Microsoft has integrated PolyBase with Big Data \r\nCluster, enabling organizations to unify structured and unstructured data sources. With \r\nPolyBase, organizations can access data from Azure SQL Database, Azure SQL Data \r\nWarehouse, Oracle, Teradata, MongoDB, Azure Cosmos DB, and HDFS.\r\nA key benefit of PolyBase for developers and data scientists is having one consistent \r\nuser interface for accessing multiple data sources. T-SQL is used to access external \r\ntable data, simplifying the creation of applicat