In [7]:
import os
import platform

from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader


In [9]:
# Resolve the root folder that holds the RAG source documents.
# If the RAG_BASE_PATH environment variable is set (and non-empty) it wins, so the
# notebook can run on another machine without editing this cell; otherwise the
# per-OS defaults below are used.
system = platform.system()
if system == "Windows":
    default_base_path = "C:/Users/Austin/RAG/data"
else:
    # Linux and every other platform share the same default location.
    default_base_path = "/home/austin/RAG/data"
BASE_PATH = os.environ.get("RAG_BASE_PATH") or default_base_path

In [10]:
def _directory_loader(subdir, pattern):
    """Build a ``DirectoryLoader`` for ``BASE_PATH/<subdir>`` restricted to ``pattern``.

    Args:
        subdir: Corpus folder, relative to ``BASE_PATH`` (e.g. ``"ug/syllabus"``).
        pattern: Glob handed to ``DirectoryLoader`` to select the files to read.

    Returns:
        The configured loader; nothing is read until ``.load()`` is called.
    """
    return DirectoryLoader(f"{BASE_PATH}/{subdir}", glob=pattern)


# NOTE(review): no ``loader_cls`` is passed, so DirectoryLoader's default file
# loader handles both the .txt and the .pdf files; the imported TextLoader and
# PyPDFLoader are not used in this cell -- confirm that is intended.

# General information: plain-text files.
loader_info_txt = _directory_loader("info", "**/*.txt")
doc_info = loader_info_txt.load()

# Postgraduate material: PDFs.
loader_pg_pdf = _directory_loader("pg", "**/*.pdf")
doc_pg = loader_pg_pdf.load()

# Undergraduate material is kept in two folders (syllabus and prospectus).
loader_ug_syllabus = _directory_loader("ug/syllabus", "**/*.pdf")
doc_ug_syllabus = loader_ug_syllabus.load()

loader_ug_prospect = _directory_loader("ug/prospect", "**/*.pdf")
doc_ug_prospect = loader_ug_prospect.load()

# Single UG corpus: prospectus documents first, then syllabus documents.
doc_ug = doc_ug_prospect + doc_ug_syllabus




In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [12]:
# Splitter shared by every corpus. Sizes are measured with ``len`` (characters):
# chunks of at most 400 with a 50 overlap, breaking on the listed separators in
# order of preference.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", " ", ""],
    length_function=len,
    chunk_overlap=50,
    chunk_size=400,
)

# Split each corpus separately so the per-corpus chunk lists stay available.
chunks_ug, chunks_pg, chunks_info = (
    text_splitter.split_documents(corpus) for corpus in (doc_ug, doc_pg, doc_info)
)

# (label, chunks) pairs used for the summary printout below.
chunk_groups = list(
    zip(
        ("UG Documents", "PG Documents", "Info Documents"),
        (chunks_ug, chunks_pg, chunks_info),
    )
)

for name, chunks in chunk_groups:
    print(f"{name}: Split into {len(chunks)} chunks.")


UG Documents: Split into 418 chunks.
PG Documents: Split into 237 chunks.
Info Documents: Split into 12 chunks.


In [13]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings


In [14]:
# BGE small English embedding model, run on the CPU, with normalized
# embeddings requested from the encoder.
bge = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

In [18]:
from langchain_community.vectorstores import FAISS

# One flat list of every chunk to index: PG first, then UG, then info.
all_chunks = [*chunks_pg, *chunks_ug, *chunks_info]

In [19]:
# Embed every chunk with the BGE model and build the FAISS index from them.
vectorstore = FAISS.from_documents(documents=all_chunks, embedding=bge)

In [20]:
# Persist the index to the "rag_vector_db" folder (relative to the working directory).
vectorstore.save_local(folder_path="rag_vector_db")


In [21]:
# Reload the index from disk, using the same embedding model for queries.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing the
# stored index; only point this at a folder this notebook wrote itself, never at
# an index obtained from an untrusted source.
vectorstore = FAISS.load_local(
    folder_path="rag_vector_db",
    embeddings=bge,
    allow_dangerous_deserialization=True,
)

In [22]:
# Retriever over the FAISS index that fetches the top 3 relevant chunks per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face checkpoint used as the generator.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Let the library decide where to place the weights.
    device_map="auto",
    # transformers reports `torch_dtype` as deprecated and asks for `dtype`
    # instead; half precision is kept as before.
    dtype=torch.float16,
)

def hf_llm(prompt, max_new_tokens=300, return_full_text=True):
    """Generate text for ``prompt`` with the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text passed to the tokenizer and then to ``model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 300, the value previously hard-coded).
        return_full_text: When True (default, the original behaviour) the whole
            output sequence is decoded, which includes the prompt. When False
            only the tokens after the prompt are decoded, which is usually what
            a RAG answer step wants.

    Returns:
        The decoded string with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    sequence = outputs[0]
    if not return_full_text:
        # Drop the leading prompt tokens so only the continuation is returned.
        prompt_length = inputs["input_ids"].shape[-1]
        sequence = sequence[prompt_length:]
    return tokenizer.decode(sequence, skip_special_tokens=True)

# Smoke test: confirm the model loads and can generate.
print(hf_llm("Hello!"))


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: 