In [1]:
!pip install requests beautifulsoup4
!pip install langchain
!pip install unstructured
!pip install faiss-cpu
!pip install sentence-transformers
!pip install -U langchain-community faiss-cpu langchain-openai tiktoken
!pip install -U sentence-transformers rank_bm25



In [14]:
import os
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import DirectoryLoader

def extract_info_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        info_text = '\n'.join(p.text for p in paragraphs)
        return info_text
    else:
        print("Failed to retrieve data from URL:", url)
        return None

def save_to_text_file(text, filename, folder='extracted_info'):
    if not os.path.exists(folder):
        os.makedirs(folder)
    file_path = os.path.join(folder, filename)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

if __name__ == "__main__":
    list_of_URLS = [
        'https://catalogue.uci.edu/allcourses/compsci/',
        'https://catalogue.uci.edu/donaldbrenschoolofinformationandcomputersciences/departmentofcomputerscience/computerscience_bs/#sampleprogramtext',
        'https://catalogue.uci.edu/donaldbrenschoolofinformationandcomputersciences/departmentofcomputerscience/computerscience_bs/#requirementstext'
    ]
    for url in list_of_URLS: 
        info_text = extract_info_from_url(url)
        print("Extracted info from {}".format(url))
        if info_text:
            file_name = 'extracted_info_{}.txt'.format(url.split('/')[-2])
            save_to_text_file(info_text, file_name)
            print("Information extracted and saved to '{}'".format(file_name))
    loader = DirectoryLoader('extracted_info', '*.txt')
    


Extracted info from https://catalogue.uci.edu/allcourses/compsci/
Information extracted and saved to 'extracted_info_compsci.txt'
Extracted info from https://catalogue.uci.edu/donaldbrenschoolofinformationandcomputersciences/departmentofcomputerscience/computerscience_bs/#sampleprogramtext
Information extracted and saved to 'extracted_info_computerscience_bs.txt'
Extracted info from https://catalogue.uci.edu/donaldbrenschoolofinformationandcomputersciences/departmentofcomputerscience/computerscience_bs/#requirementstext
Information extracted and saved to 'extracted_info_computerscience_bs.txt'


In [5]:
import getpass
hugging_face_api_key = getpass.getpass(prompt="Enter your Hugging Face API key: ")

Enter your Hugging Face API key:  ········


In [6]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from langchain.storage import InMemoryStore
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever

def returnSplits():
    loader=DirectoryLoader('extracted_info', show_progress=True, use_multithreading=True)
    documents=loader.load()
    return documents


def createVectorStore():
    embeddings = HuggingFaceEmbeddings(model_name='WhereIsAI/UAE-Large-V1')
    store = InMemoryStore()
    vectorstore = FAISS.from_documents(returnSplits(), embedding=embeddings)
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=512)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=256)
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter
    )
    retriever.add_documents(returnSplits())
    final_path='faissDB'
    vectorstore.save_local(final_path)
    print("==== CREATED VECTOR DATABASE ===")
    return vectorstore
vectorstore = createVectorStore()

 67%|██████████████████████████████               | 2/3 [00:01<00:00,  1.04it/s]
 67%|██████████████████████████████               | 2/3 [00:00<00:00, 18.26it/s]


==== CREATED VECTOR DATABASE ===


In [12]:
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
    print("CUDA available, using GPU.")
else:
    device = torch.device('cpu')
    print("CUDA not available, using CPU.")

# Load the bi-encoder model
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)
bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens
top_k = 32  # Number of passages we want to retrieve with the bi-encoder

# Encoding passages using the bi-encoder
corpus_embeddings_bi = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

# Your existing code for semantic search
def search(query):
    print("Input question:", query)
    # Semantic Search
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)  # Encode the query
    hits = util.semantic_search(question_embedding, corpus_embeddings_bi, top_k=top_k)  # Perform semantic search
    hits = hits[0]  # Get the hits for the first query


CUDA not available, using CPU.


NameError: name 'passages' is not defined

In [9]:
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

def loadVectorStore():
    final_path='faissDB'
    embeddings=HuggingFaceEmbeddings(model_name='WhereIsAI/UAE-Large-V1')
    vectorstore = FAISS.load_local(final_path,embeddings,allow_dangerous_deserialization=True)
    return vectorstore
vectorstore = loadVectorStore()
query = "What are the different courses at UCI?"
docs = vectorstore.similarity_search(query)
print(docs)

[Document(page_content='programing (python, java, C++), additional courses as specified by the major, and completion of lower-division writing. Students are encouraged to complete as many of the lower-division degree requirements as possible prior to transfer. Visit the UCI', metadata={'source': 'extracted_info/extracted_info_computerscience_bs.txt', 'doc_id': '8980d07d-b773-4227-a94c-8de26e7d773d'}), Document(page_content='as specified by the major, and completion of lower-division writing. Students are encouraged to complete as many of the lower-division degree requirements as possible prior to transfer. Visit the UCI Office of Admissions website', metadata={'source': 'extracted_info/extracted_info_computerscience_bs.txt', 'doc_id': '8980d07d-b773-4227-a94c-8de26e7d773d'}), Document(page_content='completion of lower-division writing. Students are encouraged to complete as many of the lower-division degree requirements as possible prior to transfer. Visit the UCI Office of Admissions 