In [None]:
pip install pypdf



In [None]:
import os
from pypdf import PdfReader

In [None]:
# fld  : folder that is scanned for source PDF files.
# cfld : folder that receives one cleaned .txt file per PDF.
fld, cfld = '/content/rag_raw_data', '/content/clean_raw_data'

In [None]:
# Create the cleaned-text output folder; a no-op when it already exists.
os.makedirs(name=cfld, exist_ok=True)

In [None]:
import re

In [None]:
# Bracketed numeric markers such as "[12]".
_NUMERIC_BRACKET_RE = re.compile(r'\[\d+\]')
# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def c_text(text):
    """Normalise extracted text.

    Removes bracketed numeric markers like ``[3]``, collapses every run of
    whitespace into a single space, and trims leading/trailing whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    without_markers = _NUMERIC_BRACKET_RE.sub('', text)
    single_spaced = _WHITESPACE_RUN_RE.sub(' ', without_markers)
    return single_spaced.strip()

In [None]:
# Convert every PDF in `fld` into one cleaned UTF-8 text file in `cfld`.
for filename in os.listdir(fld):
    if not filename.endswith('.pdf'):
        continue

    rdr = PdfReader(os.path.join(fld, filename))

    # Pages for which extract_text() returns nothing are skipped; each kept
    # page is followed by a newline before cleaning.
    page_texts = (page.extract_text() for page in rdr.pages)
    atxt = ''.join(txt + '\n' for txt in page_texts if txt)
    ctxt = c_text(atxt)

    # Same base name as the PDF, with a .txt extension.
    tpth = os.path.join(cfld, os.path.splitext(filename)[0] + '.txt')
    with open(tpth, 'w', encoding='utf-8') as w:
        w.write(ctxt)

In [None]:
from transformers import AutoTokenizer

In [None]:
 # Folder (relative to the working directory) that receives one file per chunk.
 chfld = 'chunk_data'
 os.makedirs(name=chfld, exist_ok=True)

In [None]:
# Tokenizer of the same checkpoint that is later used for embeddings, so the
# chunk sizes are measured in that model's tokens.
tokenizer = AutoTokenizer.from_pretrained(
    'sentence-transformers/all-MiniLM-L6-v2'
)

In [None]:
def chunks(text, max_tokens=150, overlap=40, tok=None):
    """Split ``text`` into overlapping windows of at most ``max_tokens`` tokens.

    The text is tokenised once, sliced into windows that each start
    ``max_tokens - overlap`` tokens after the previous one, and every window
    is decoded back into a string.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk. Must be positive.
        overlap: Number of tokens shared by consecutive chunks. Must satisfy
            ``0 <= overlap < max_tokens``; otherwise the window could never
            advance.
        tok: Object exposing ``encode(text, add_special_tokens=False)`` and
            ``decode(tokens, skip_special_tokens=True)``. Defaults to the
            notebook-level ``tokenizer``.

    Returns:
        A list of chunk strings; empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    if tok is None:
        tok = tokenizer

    # Encoding a whole document can make the tokenizer warn that the sequence
    # is longer than the model maximum; only encode/decode are used here, the
    # ids are never fed to a model in this function.
    tokens = tok.encode(text, add_special_tokens=False)

    step = max_tokens - overlap
    pieces = []
    for start in range(0, len(tokens), step):
        end = start + max_tokens
        pieces.append(tok.decode(tokens[start:end], skip_special_tokens=True))
        # Once a window reaches the end of the token list, any further window
        # would contain only tokens already present in this one.
        if end >= len(tokens):
            break
    return pieces

In [None]:
# Split every cleaned text file into chunks and store each chunk as its own
# file named "<source stem>_chunk<index>.txt" inside `chfld`.
for filename in os.listdir(cfld):
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join(cfld, filename), 'r', encoding='utf-8') as r:
        ctxt = r.read()

    stem = os.path.splitext(filename)[0]
    chks = chunks(ctxt)
    for i, chk in enumerate(chks):
        chunk_path = os.path.join(chfld, f"{stem}_chunk{i}.txt")
        with open(chunk_path, "w", encoding="utf-8") as cf:
            cf.write(chk)

Token indices sequence length is longer than the specified maximum sequence length for this model (55310 > 512). Running this sequence through the model will result in indexing errors


In [None]:
pip install chromadb



In [None]:
from chromadb import Client
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

In [None]:
# Use an on-disk client so the index is actually stored under
# /content/chroma_db. `Client(Settings(persist_directory=...))` alone builds
# an in-memory client on current Chroma releases (persistence is not enabled
# by that setting), so the stored vectors would be lost with the kernel.
# NOTE(review): PersistentClient requires chromadb >= 0.4 — confirm the
# installed version.
from chromadb import PersistentClient

clt = PersistentClient(path='/content/chroma_db')

In [None]:
# Open the collection, creating it on first use. get_or_create_collection
# replaces the manual "list, then get or create" sequence, which relied on
# list_collections() returning objects with a `.name` attribute — some Chroma
# releases return plain name strings instead.
collection_name = "chandrayaan_mission_rag"
collection = clt.get_or_create_collection(name=collection_name)

In [None]:
# Sentence-embedding model used for both the stored chunks and the queries.
embedder = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2"
)

In [None]:
# Embed every chunk file and store it in the collection.
#
# * Files are processed in sorted order so runs are deterministic.
# * Chunks are encoded in batches instead of one encode() call per file.
# * upsert() is used instead of add() so re-running the cell overwrites the
#   entries with the same ids rather than colliding with them.
EMBED_BATCH_SIZE = 256  # number of chunks encoded and written per round trip

chunk_files = sorted(f for f in os.listdir(chfld) if f.endswith(".txt"))

for batch_start in range(0, len(chunk_files), EMBED_BATCH_SIZE):
    batch_names = chunk_files[batch_start:batch_start + EMBED_BATCH_SIZE]

    batch_texts = []
    for filename in batch_names:
        with open(os.path.join(chfld, filename), "r", encoding="utf-8") as r:
            batch_texts.append(r.read())

    # encode() on a list returns one vector per input text.
    batch_embeddings = embedder.encode(batch_texts).tolist()

    collection.upsert(
        documents=batch_texts,
        metadatas=[{"source": filename} for filename in batch_names],
        ids=batch_names,  # the chunk file name doubles as the record id
        embeddings=batch_embeddings,
    )

In [None]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM,pipeline
import torch

In [None]:
# Tokenizer for the answer-generation model.
tokeni = AutoTokenizer.from_pretrained(
    'google/flan-t5-base'
)

In [None]:
# Sequence-to-sequence model that writes the final answer.
mdl = AutoModelForSeq2SeqLM.from_pretrained(
    'google/flan-t5-base'
)

In [None]:
# Text-to-text generation pipeline built from the model and tokenizer loaded
# in the previous cells.
generate = pipeline(
    'text2text-generation',
    model=mdl,
    tokenizer=tokeni,
)

Device set to use cpu


In [None]:
def rag_query(q,top_k=3,max_lenth=289):
  """Answer a question using chunks retrieved from the vector collection.

  The question is embedded with ``embedder``, the ``top_k`` closest chunks are
  fetched from ``collection``, and they are placed in a prompt that is passed
  to the ``generate`` pipeline.

  Args:
    q: The question text.
    top_k: Number of chunks to retrieve as context.
    max_lenth: Upper bound on the number of tokens generated for the answer.
      (The parameter keeps its original spelling so existing keyword calls
      continue to work.)

  Returns:
    The generated answer string.
  """
  q_emb=embedder.encode(q).tolist()
  top=collection.query(query_embeddings=[q_emb],n_results=top_k)
  # One query was sent, so the matching documents are the first result list.
  top_chks=top['documents'][0]

  context='\n\n'.join(top_chks)

  pmt =f''' You are an expert on the Chandrayaan-1 and Chandrayaan-2 mission and its scientific payloads.
Use the following retrieved context to answer the question as accurately as possible.
If the answer is not found in the context, state that the information is not available in the provided document.
Context:
{context}
Question: {q}
Answer in a complete sentence:'''
  # Pass the limit as max_new_tokens: when max_length is given, the model's
  # preset max_new_tokens takes precedence (see the warning the pipeline
  # printed), so the caller's limit was silently ignored.
  ans=generate(pmt,max_new_tokens=max_lenth,min_length=5,do_sample=False)[0]['generated_text']
  return ans

In [None]:
quest='What was the scientific objective of the Terrain Mapping Camera (TMC)?'
rag_query(quest)

Both `max_new_tokens` (=256) and `max_length`(=289) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'to map the lunar surface in the panchromatic spectral band ( 0. 5 - 0. 8 microns ) with a high spatial resolution of 5 m and a swath of 20 km from 100 km lunar polar orbit'

In [None]:
quest="According to Chandrayaan 2, what two minor elements did the CLASS instrument detect remotely for the first time in the near-side Mare?"
rag_query(quest)

Both `max_new_tokens` (=256) and `max_length`(=289) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'cr and mn'