# Simple Base RAG Pipeline 

### Testing the basic capabilities and configurations. 
### Load, Transform and Embed data before storing in a Vector Store

Import Libraries

In [1]:
import os 
import sys
import platform
import pkg_resources

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

from langchain_community.document_loaders import TextLoader, WebBaseLoader, PyPDFLoader
from dotenv import load_dotenv
import bs4 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

In [2]:
# Print Python, OS, and package versions
print(f"Python version: {platform.python_version()}")
print(f"OS: {platform.system()} {platform.release()}")


Python version: 3.10.5
OS: Darwin 23.4.0


Load API keys

In [3]:
load_dotenv()

os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY')

#### 1. Data ingestion

From static text file

In [4]:
loader = TextLoader('../data/sample_congressional_hearing.txt')
text   = loader.load()

From web based html file 

In [5]:
# Load, index and chunk 
loader = WebBaseLoader(web_paths  = ('https://docs.house.gov/Committee/Calendar/ByEvent.aspx?EventID=110883',),
                       bs_kwargs  = dict(parse_only = bs4.SoupStrainer(
                           class_ = ('well', 'meeting-date', 'witnessPanel'))))

In [6]:
text   = loader.load()
text

[Document(page_content='\n\r\n                Hearing:\r\n               Online Platforms and Market Power, Part 6: Examining the Dominance of Amazon, Apple, Facebook, and Google\nSubcommittee on Antitrust, Commercial, and Administrative Law (Committee on the Judiciary)\n\n\n\r\n    Wednesday, July 29, 2020 (12:00 PM)\r\n                  Meeting was rescheduled to the time and date listed above.\r\n                \nMr. Jeff Bezos Chief Executive Officer, Amazon.com, Inc.\r\n        Added\r\n        07/28/2020 at 07:32 PM\nJeff Bezos Statement\r\n      [PDF]\r\n    \r\n        Added\r\n        07/28/2020 at 07:32 PM\nJeff Bezos Truth in Testimony\r\n      [PDF]\r\n    \r\n        Added\r\n        07/30/2020 at 01:24 PM\nJeff Bezos Bio\r\n      [PDF]\r\n    \r\n        Added\r\n        07/30/2020 at 01:24 PM\n\n\nMr. Sundar Pichai Chief Executive Officer, Alphabet Inc.\r\n        Added\r\n        07/28/2020 at 07:32 PM\nSundar Pichai Statement\r\n      [PDF]\r\n    \r\n        Added\r\

In [7]:
for doc in text:
    cleaned_content = doc.page_content.replace('\r', '').replace('\n', ' ').strip()
    print(cleaned_content)

Hearing:                Online Platforms and Market Power, Part 6: Examining the Dominance of Amazon, Apple, Facebook, and Google Subcommittee on Antitrust, Commercial, and Administrative Law (Committee on the Judiciary)        Wednesday, July 29, 2020 (12:00 PM)                   Meeting was rescheduled to the time and date listed above.                  Mr. Jeff Bezos Chief Executive Officer, Amazon.com, Inc.         Added         07/28/2020 at 07:32 PM Jeff Bezos Statement       [PDF]              Added         07/28/2020 at 07:32 PM Jeff Bezos Truth in Testimony       [PDF]              Added         07/30/2020 at 01:24 PM Jeff Bezos Bio       [PDF]              Added         07/30/2020 at 01:24 PM   Mr. Sundar Pichai Chief Executive Officer, Alphabet Inc.         Added         07/28/2020 at 07:32 PM Sundar Pichai Statement       [PDF]              Added         07/28/2020 at 07:32 PM Sundar Pichai Truth in Testimony       [PDF]              Added         07/30/2020 at 01:24 PM Sun

From static PDF

In [8]:
loader = PyPDFLoader('../data/sample_congressional_hearing.pdf')
text   = loader.load()

In [9]:
# Define the number of documents and the number of characters to preview
num_docs_to_preview = 3  # Number of documents to preview
preview_length = 500     # Number of characters to preview per document

# Print a preview of the loaded text content
for i, doc in enumerate(text):
    if i >= num_docs_to_preview:
        break
    preview_content = doc.page_content[:preview_length]
    print(f"Document {i+1} Preview:\n{preview_content}")
    print("\n--- End of Preview ---\n")

Document 1 Preview:
ONLINE PLATFORMS AND MARKET POWER, 
PART 6: EXAMINING THE DOMINANCE OF 
AMAZON, APPLE, FACEBOOK, AND GOOGLE 
HEARING 
BEFORE THE  
SUBCOMMITTEE ON ANTITRUST, COMMERCIAL AND 
ADMINISTRATIVE LAW 
OF THE  
COMMITTEE ON THE JUDICIARY 
HOUSE OF REPRESENTATIVES 
ONE HUNDRED SIXTEENTH CONGRESS 
SECOND SESSION 
JULY 29, 2020 
Serial No. 116–94 
Printed for the use of the Committee on the Judiciary 
( 
Available http://judiciary.house.gov or www.govinfo.gov 
VerDate Sep 11 2014 23:14 Mar 24, 2021 Jkt 041

--- End of Preview ---

Document 2 Preview:
ONLINE PLATFORMS AND MARKET POWER, PART 6: EXAMINING THE DOMINANCE OF AMAZON, 
APPLE, FACEBOOK, AND GOOGLE 
VerDate Sep 11 2014 23:14 Mar 24, 2021 Jkt 041317 PO 00000 Frm 00002 Fmt 6019 Sfmt 6019 E:\HR\OC\A317.XXX A317khammond on DSKJM1Z7X2PROD with HEARING

--- End of Preview ---

Document 3 Preview:
U.S. GOVERNMENT PUBLISHING OFFICE
WASHINGTON : 41–317 2020 ONLINE PLATFORMS AND MARKET POWER, 
PART 6: EXAMINING THE DOMINANCE OF 


#### 2. Transform

In [10]:
# Chunk the document using recursive text splitter
text_spiltter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
split_doc     = text_spiltter.split_documents(text)

print("Number of chunks:", len(text), "\n")
print("Preview below:\n")
print(split_doc[:3])

Number of chunks: 758 

Preview below:

[Document(page_content='ONLINE PLATFORMS AND MARKET POWER, \nPART 6: EXAMINING THE DOMINANCE OF \nAMAZON, APPLE, FACEBOOK, AND GOOGLE \nHEARING \nBEFORE THE  \nSUBCOMMITTEE ON ANTITRUST, COMMERCIAL AND \nADMINISTRATIVE LAW \nOF THE  \nCOMMITTEE ON THE JUDICIARY \nHOUSE OF REPRESENTATIVES \nONE HUNDRED SIXTEENTH CONGRESS \nSECOND SESSION \nJULY 29, 2020 \nSerial No. 116–94 \nPrinted for the use of the Committee on the Judiciary \n( \nAvailable http://judiciary.house.gov or www.govinfo.gov \nVerDate Sep 11 2014 23:14 Mar 24, 2021 Jkt 041317 PO 00000 Frm 00001 Fmt 6011 Sfmt 6011 E:\\HR\\OC\\A317.XXX A317khammond on DSKJM1Z7X2PROD with HEARING', metadata={'source': '../data/sample_congressional_hearing.pdf', 'page': 0}), Document(page_content='ONLINE PLATFORMS AND MARKET POWER, PART 6: EXAMINING THE DOMINANCE OF AMAZON, \nAPPLE, FACEBOOK, AND GOOGLE \nVerDate Sep 11 2014 23:14 Mar 24, 2021 Jkt 041317 PO 00000 Frm 00002 Fmt 6019 Sfmt 6019 E:\\HR\\OC\\

#### 3. Embed

In [11]:
# Convert to vector embeddings using OpenAI
# Store vector embeddings in vectore database (vector store) 

db = Chroma.from_documents((split_doc[:20]), OpenAIEmbeddings())

In [12]:
# Query the vector database

query = "What is this text about?"
db.similarity_search(query)[0].page_content

'LETTERS, STATEMENTS, ETC., SUBMITTED FOR THE HEARING \nExhibits Used at Hearing ....................................................................................... 42 ‘‘Police Requests for Google Users’ Location Histories Face New Scrutiny,’’ \na Wall Street Journal article for the record from the Honorable Kelly Armstrong, Member, Subcommittee on Antitrust, Commercial and Adminis-trative Law ........................................................................................................... 89 \nLetter to Apple by Representatives Greg Walden and Cathy McMorris Rod-\ngers from the Honorable Kelly Armstrong, Member, Subcommittee on Anti-trust, Commercial and Administrative Law ...................................................... 95 \nLetter to Google by Representatives Greg Walden and Cathy McMorris Rod-\ngers from the Honorable Kelly Armstrong, Member, Subcommittee on Anti-trust, Commercial and Administrative Law ...................................................... 98'