#Step 1 : indexing

#text loading

In [3]:
from langchain.document_loaders import PyMuPDFLoader

# Replace with your own PDF file path
file_path = "/content/drive/MyDrive/ask my pdf/defect detection in fabric.pdf"

# Load and split PDF into Document objects (each page)
loader = PyMuPDFLoader(file_path)
documents = loader.load()

# Preview the first page of text
print(documents[0].page_content[:1000])  # Show first 1000 characters


Towards partial fulfillment for Postgraduate Degree Level
Programme
Master of Technology in Computer Science and Engineering
with Specialization in Computer Science and Engineering
A Mini-Project Report on:
Defect Detection in Fabric
Prepared by:
Dhruv Hemal Shah
Admission No.: P24CS008
M.TECH. I (Computer Science and Engineering)
2nd Semester Year: 2024-2025
Guided by:
Dr. Ritu Tiwari
DEPARTMENT OF COMPUTER SCIENCE AND
ENGINEERING
SARDAR VALLABHBHAI NATIONAL INSTITUTE OF
TECHNOLOGY,
SURAT - 395007 (GUJARAT, INDIA)


#chunking

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Set chunking parameters
chunk_size = 1000     # number of characters in each chunk
chunk_overlap = 200      # number of overlapping characters

# Step 2: Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Step 3: Split the loaded documents (pages) into smaller chunks
chunks = text_splitter.split_documents(documents)

# Step 4: Preview the first chunk
print(f"Total chunks created: {len(chunks)}")
print("-----------------------------------------------------------")
print(chunks[0].page_content[:500])


Total chunks created: 41
-----------------------------------------------------------
Towards partial fulfillment for Postgraduate Degree Level
Programme
Master of Technology in Computer Science and Engineering
with Specialization in Computer Science and Engineering
A Mini-Project Report on:
Defect Detection in Fabric
Prepared by:
Dhruv Hemal Shah
Admission No.: P24CS008
M.TECH. I (Computer Science and Engineering)
2nd Semester Year: 2024-2025
Guided by:
Dr. Ritu Tiwari
DEPARTMENT OF COMPUTER SCIENCE AND
ENGINEERING
SARDAR VALLABHBHAI NATIONAL INSTITUTE OF
TECHNOLOGY,
SURAT - 395


In [17]:
# Remove unnecessary newlines and merge text within each chunk
for i in range(len(chunks)):
    cleaned_content = chunks[i].page_content.replace('\n', ' ').replace('  ', ' ').strip()
    chunks[i].page_content = cleaned_content

# Preview a cleaned chunk
print(f"Total cleaned chunks: {len(chunks)}")
print("-----------------------------------------------------------")
print(chunks[0].page_content[:500])


Total cleaned chunks: 41
-----------------------------------------------------------
Towards partial fulfillment for Postgraduate Degree Level Programme Master of Technology in Computer Science and Engineering with Specialization in Computer Science and Engineering A Mini-Project Report on: Defect Detection in Fabric Prepared by: Dhruv Hemal Shah Admission No.: P24CS008 M.TECH. I (Computer Science and Engineering) 2nd Semester Year: 2024-2025 Guided by: Dr. Ritu Tiwari DEPARTMENT OF COMPUTER SCIENCE AND ENGINEERING SARDAR VALLABHBHAI NATIONAL INSTITUTE OF TECHNOLOGY, SURAT - 395


#Embeddings

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

#  Load the model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


#vectorization

In [18]:
from langchain.vectorstores import FAISS

# Step 1: Create the FAISS vector store from the chunks and the embedding model
vectorstore = FAISS.from_documents(chunks, embedding_model)

# Step 2: (Optional but recommended) Save it to disk for later use
vectorstore.save_local("/content/drive/MyDrive/ask my pdf/vectorstore")


In [19]:
vectorstore.index_to_docstore_id

{0: '190b7566-26bc-4cad-9cd0-264e52b1951c',
 1: '5ac0733f-9487-43a1-aab1-7c2885553cbb',
 2: '21c32b7a-d35d-43c9-bb2b-a8889ab3bb76',
 3: '8ad88976-b562-4c18-aeea-0d8e998e9765',
 4: '92f874b4-4c8d-4e28-b483-97c02a951506',
 5: '6232970d-5bee-4df4-b619-67d763e9ddce',
 6: '06e94d3f-f9ee-40e4-9533-fbe1fb02892c',
 7: '79fdceb6-1cbf-441d-b29e-5396fccee361',
 8: 'e74fda65-ab8e-40cd-9c19-dd48f9c31b65',
 9: '78cd2f46-c70a-4fdb-9933-f8bfda1ddd49',
 10: '4039f49e-9e00-4d80-87ff-5630c0394084',
 11: 'd5074299-d74d-486a-bc96-1911b44685a9',
 12: '04e4e2d0-bc60-4bac-8155-26c51d521fa1',
 13: '23d4b817-50f4-453e-98a4-4c943eb32c77',
 14: '3b084c5d-9fb4-4b63-ba97-6b0141185c4a',
 15: '973e62be-d0ad-4c00-9da1-e486e5747c14',
 16: '8cb343ec-7ad0-4fcb-8edc-44b0d8eada7c',
 17: '34fd0ce4-b914-4cfa-9c4e-dc097dca4837',
 18: 'ba9d0dca-a9d3-47ab-a830-9413d1a8decc',
 19: 'e8b0689e-161b-4fdb-a3b0-92a53e569da4',
 20: '92a7c431-a8aa-4ef3-9a29-adaa84a3567e',
 21: 'b214e4f0-962d-4c6b-8b3e-c7f569899e7e',
 22: '983b1b60-5aa5-

In [21]:
vectorstore.get_by_ids(['190b7566-26bc-4cad-9cd0-264e52b1951c'])

[Document(id='190b7566-26bc-4cad-9cd0-264e52b1951c', metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'TeX', 'creationdate': '2025-05-01T12:05:21+00:00', 'source': '/content/drive/MyDrive/ask my pdf/defect detection in fabric.pdf', 'file_path': '/content/drive/MyDrive/ask my pdf/defect detection in fabric.pdf', 'total_pages': 20, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-05-01T12:05:21+00:00', 'trapped': '', 'modDate': 'D:20250501120521Z', 'creationDate': 'D:20250501120521Z', 'page': 0}, page_content='Towards partial fulfillment for Postgraduate Degree Level Programme Master of Technology in Computer Science and Engineering with Specialization in Computer Science and Engineering A Mini-Project Report on: Defect Detection in Fabric Prepared by: Dhruv Hemal Shah Admission No.: P24CS008 M.TECH. I (Computer Science and Engineering) 2nd Semester Year: 2024-2025 Guided by: Dr. Ritu Tiwari DEPARTMENT OF COMPUTER SCIENCE AND ENGINEERING S

#step 2 : retriver and prompt

#creating a retriver

In [22]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [23]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x790ec02d8f50>, search_kwargs={'k': 4})

#prompt templete

In [25]:
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful AI assistant. Use the context below to answer the user's question.
If the answer is not contained in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:"""
)


# here the question is given and the retrived_docs is created

In [91]:
question = "what is defect detection in fabric?"
retrieved_docs    = retriever.invoke(question)


  return forward_call(*args, **kwargs)


In [92]:
retrieved_docs

[Document(id='21c32b7a-d35d-43c9-bb2b-a8889ab3bb76', metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'TeX', 'creationdate': '2025-05-01T12:05:21+00:00', 'source': '/content/drive/MyDrive/ask my pdf/defect detection in fabric.pdf', 'file_path': '/content/drive/MyDrive/ask my pdf/defect detection in fabric.pdf', 'total_pages': 20, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-05-01T12:05:21+00:00', 'trapped': '', 'modDate': 'D:20250501120521Z', 'creationDate': 'D:20250501120521Z', 'page': 2}, page_content='Certificate This is to certify that the mini project report entitled Defect Detection in Fabric is prepared and presented by Dhruv Hemal Shah Admission No.: P24CS008 of MTech First Year in Computer Science and Engineering with Specialization in Computer Science and Engineering and his work is satisfactory. SIGNATURE: Supervisor/s JURY HEAD OF DEPARTMENT'),
 Document(id='190b7566-26bc-4cad-9cd0-264e52b1951c', metadata={'producer': 'pd

# see the retrieved docs has a lot of unnecessary text in it, so we trim all that now and keep only the necessary one

In [93]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

'Certificate This is to certify that the mini project report entitled Defect Detection in Fabric is prepared and presented by Dhruv Hemal Shah Admission No.: P24CS008 of MTech First Year in Computer Science and Engineering with Specialization in Computer Science and Engineering and his work is satisfactory. SIGNATURE: Supervisor/s JURY HEAD OF DEPARTMENT\n\nTowards partial fulfillment for Postgraduate Degree Level Programme Master of Technology in Computer Science and Engineering with Specialization in Computer Science and Engineering A Mini-Project Report on: Defect Detection in Fabric Prepared by: Dhruv Hemal Shah Admission No.: P24CS008 M.TECH. I (Computer Science and Engineering) 2nd Semester Year: 2024-2025 Guided by: Dr. Ritu Tiwari DEPARTMENT OF COMPUTER SCIENCE AND ENGINEERING SARDAR VALLABHBHAI NATIONAL INSTITUTE OF TECHNOLOGY, SURAT - 395007 (GUJARAT, INDIA)\n\n[15] C. Liu, Y. Tao, J. Liang, K. Li, and Y. Chen, “Object detection based on yolo net- work,” in 2018 IEEE 4th info

# Augumentation

In [94]:
final_prompt = prompt_template.format(
    context=context_text,
    question= question
)

#Load HuggingFace LLM

In [78]:
  from langchain.llms import HuggingFacePipeline
  from transformers import pipeline

  # You can use a light model like flan-t5-small or flan-t5-base
  llm_pipeline = pipeline(
      "text2text-generation",
      model="google/flan-t5-base",
      tokenizer="google/flan-t5-base",
      max_length=512,
      temperature=0
  )

  llm = HuggingFacePipeline(pipeline=llm_pipeline)


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


#Use It in the RetrievalQA Chain

In [95]:
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# 5. Run the model manually
response = llm.invoke(final_prompt)

# 6. Print result
print("Answer:", response)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer: ranging from traditional image pro- cessing to advanced deep learning approaches.


# so this is how the working is done and we ge the answers

# Now the complete code creation for the full model working in proper chain

In [7]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

# Step 1: Ask user for PDF path
pdf_path = input("Enter the full path to your PDF file: ").strip()

# Step 2: Load and check number of pages
if not os.path.isfile(pdf_path):
    print("❌ File not found! Please check the path.")
    exit()

loader = PyPDFLoader(pdf_path)
all_pages = loader.load()

if len(all_pages) > 20:
    print("❌ PDF is too long! Please upload a PDF with 20 pages or fewer.")
    exit()

# Step 3: Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(all_pages)

# Step 4: Create sentence embeddings
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embedding_model)

# Step 5: Create retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Step 6: Load LLM (Flan-T5)
llm_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    max_length=512,
    temperature=0
)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# Step 7: Prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Answer the question based on the context below.
If the answer isn't in the context, say you don't know.

Context: {context}

Question: {question}

Answer:
"""
)

# Step 8: Question answering function
def ask_question(question):
    retrieved_docs = retriever.invoke(question)
    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
    final_prompt = prompt_template.format(context=context_text, question=question)
    response = llm.invoke(final_prompt)
    print("\n📌 Answer:", response)

# Step 9: User interaction loop
if __name__ == "__main__":
    print("\n✅ PDF loaded successfully! You can now ask questions about it.")
    while True:
        user_input = input("\n📝 Ask a question (or type 'exit' to quit): ")
        if user_input.lower() == "exit":
            print("👋 Exiting...")
            break
        ask_question(user_input)


Enter the full path to your PDF file: /content/drive/MyDrive/ask my pdf/defect detection in fabric.pdf


  embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  llm = HuggingFacePipeline(pipeline=llm_pipeline)



✅ PDF loaded successfully! You can now ask questions about it.

📝 Ask a question (or type 'exit' to quit): what is the project report name?


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: Defect Detection in Fabric

📝 Ask a question (or type 'exit' to quit): who created it


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: Dhruv Hemal Shah

📝 Ask a question (or type 'exit' to quit): what is the guide name


  return forward_call(*args, **kwargs)
Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: Dr. Ritu Tiwari

📝 Ask a question (or type 'exit' to quit): what is the method used


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: iterative transfer learning

📝 Ask a question (or type 'exit' to quit): can you explain the iteraetive transfer learning


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: yes

📝 Ask a question (or type 'exit' to quit): please explain the iterative transfer learning


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: Iterative transfer learning involves gradually in- troducing new classes of defects to the model in a sequential manner. This strategy enables the model to adapt incrementally without forgetting previously learned classes—a challenge known as catastrophic forgetting in deep learning.

📝 Ask a question (or type 'exit' to quit): what are the defects detected in fabric


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: Stains, Tear Cut, Cut, Holes and overall performance

📝 Ask a question (or type 'exit' to quit): quit


  return forward_call(*args, **kwargs)
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Answer: not in the context

📝 Ask a question (or type 'exit' to quit): exit
👋 Exiting...
