In [1]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
!pip install pypdf langchain_community
!pip install -U langchain-community
!pip install -U langchain transformers

In [2]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [3]:
from langchain.schema import Document
import json

# Location of the cleaned, pre-chunked JSON corpus, relative to the notebook's
# working directory ('../data/...': one level up, then into the data folder).
json_file_path = '../data/data_json/chunked_data_all_folders_cleaned.json'

# Read the corpus. UTF-8 is requested explicitly so decoding does not depend on
# the platform's default locale encoding.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    chunked_data = json.load(json_file)

# Wrap every JSON entry in a Document: the entry's 'content' becomes the text,
# and its folder/file names are carried along as metadata. A missing key raises
# KeyError, exactly as direct indexing always does.
documents = [
    Document(
        page_content=entry['content'],
        metadata={
            'folder_name': entry['folder_name'],
            'file_name': entry['file_name'],
        },
    )
    for entry in chunked_data
]

# Sanity check: number of documents loaded
print(len(documents))

Mounted at /content/drive
4881


In [2]:
# Hugging Face identifier of the sentence-transformers checkpoint used for embedding
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Model-loading options: run the model on the CPU
model_kwargs = dict(device='cpu')

# Encoding options: leave the embedding vectors un-normalized
encode_kwargs = dict(normalize_embeddings=False)

# Build the embedding wrapper from the three settings above
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [5]:
# Embed every document and store the resulting vectors in a FAISS index
db = FAISS.from_documents(documents=documents, embedding=embeddings)

In [6]:
# Query the index for the documents most similar to the question
question = "What are the policies for maintaining a safe and healthy school environment??"
searchDocs = db.similarity_search(question)

# Print each hit's text, numbered from 1 and followed by a divider line
separator = '-' * 40
for rank, hit in enumerate(searchDocs, start=1):
    print(f"Document {rank}:\n{hit.page_content}\n{separator}")


Document 1:
Committee or school -based Wellness Council, shall comply with 
existing city ordinances and District policies related to promoting 
and managing healthy school environments. Examples of 
relevant and existing healthy school environment policies, for 
which school -based Wellness Councils and school staff must 
comply, are referenced below :
----------------------------------------
Document 2:
Facilities Management Department, in partnership with school 
leadership, will take action to mitigate critical issues such as 
unhealthy indoor air quality, s igns of pests, leaks, clutter, mold, 
unsatisfactory chemical management, and critical health and 
safety repairs. In addition, the audit results, along with best 
practices in the Healthy School Environment Resource Toolkit , 
shall be used by school principals/heads of school and school -
based Wellness Councils to develop annual environmental health 
priorities and goals as part of the school’s Wellness Action Plan.  
Distri