In [1]:
%pip install -U langchain-community pypdf
%pip install -U langchain-community pypdf
!pip install unstructured
!pip install langchain
!pip install -q --upgrade \
  google-generativeai \
  gradio \
  langchain \
  langchain-community \
  faiss-cpu \
  pypdf \
  sentence-transformers \
  unstructured[pptx]

In [None]:
import zipfile
import os

zip_file_path = "/content/data_base_pdf.zip"
extract_path = "/content/extracted_data"

# Create the extraction directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Extract the zip file
try:
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print(f"Files extracted to: {extract_path}")
except zipfile.BadZipFile:
    print(f"Error: The file at {zip_file_path} is not a valid zip file.")
except FileNotFoundError:
    print(f"Error: The file at {zip_file_path} was not found.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Files extracted to: /content/extracted_data


In [None]:
#load pdf documents
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
data_path="/content/extracted_data"
#function to load pdfs from the subjects floader
def load_pdf_files(data_path):
    loader=DirectoryLoader(data_path,glob="*pdf",loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents
#loading those files from floder
def load_subject(data_path):
    subject_data={}
    for subject in os.listdir(data_path):
      subject_path=os.path.join(data_path,subject)
      if os.path.isdir(subject_path):
        subject_documents=load_pdf_files(subject_path)
        subject_data[subject]=subject_documents
    return subject_data

subject_data=load_subject(data_path)

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPowerPointLoader

# ✅ Folder that contains subject folders
data_base_path = "/content/extracted_data/data_base_pdf"

# ✅ This will hold all subject documents
subject_documents = {}

# ✅ Loop through each subject folder
for subject_name in os.listdir(data_base_path):
    subject_path = os.path.join(data_base_path, subject_name)

    # Make sure it's a folder
    if os.path.isdir(subject_path):
        print(f"\n📚 Loading subject: {subject_name}")
        docs = []

        # ✅ Loop through each file inside the subject folder
        for file_name in os.listdir(subject_path):
            file_path = os.path.join(subject_path, file_name)

            try:
                # ✅ If it's a PDF file
                if file_name.endswith(".pdf"):
                    loader = PyPDFLoader(file_path)
                    file_docs = loader.load()
                    docs.extend(file_docs)
                    print(f"  📄 Loaded PDF: {file_name} ({len(file_docs)} pages)")

                # ✅ If it's a PowerPoint file
                elif file_name.endswith(".pptx") or file_name.endswith(".ppt"):
                    loader = UnstructuredPowerPointLoader(file_path)
                    ppt_docs = loader.load()
                    docs.extend(ppt_docs)
                    print(f"  🖼️ Loaded PPT: {file_name} ({len(ppt_docs)} slides)")

            except Exception as e:
                print(f"  ❌ Failed to load {file_name}: {str(e)}")

        # ✅ Store all loaded documents for this subject
        subject_documents[subject_name] = docs

# ✅ Final check
print("\n✅ All subjects loaded!\n")
for subject, docs in subject_documents.items():
    print(f"{subject}: {len(docs)} documents")



📚 Loading subject: big_data
  🖼️ Loaded PPT: Chapter 5.pptx (1 slides)




  🖼️ Loaded PPT: Chapter 4.pptx (1 slides)




  📄 Loaded PDF: UNIT-1 BDA R21 A7902 ABP.pdf (17 pages)
  📄 Loaded PDF: Unit-2 Hadoop BDA R21 A7902 ABP.pdf (18 pages)
  🖼️ Loaded PPT: Chapter 2.pptx (1 slides)
  🖼️ Loaded PPT: Chapter 1.pptx (1 slides)
  🖼️ Loaded PPT: Chapter 3.pptx (1 slides)

📚 Loading subject: pr
  📄 Loaded PDF: PYTHON PROGRAMMING NOTES.pdf (142 pages)
  📄 Loaded PDF: UNIT-3 (PR).pdf (14 pages)
  📄 Loaded PDF: Pattern Classification by Richard O. Duda, David G. Stork, Peter E.Hart .pdf (738 pages)
  📄 Loaded PDF: 273_PATTERN RECOGNITION.pdf (139 pages)
  📄 Loaded PDF: UNIT-4.pdf (22 pages)

📚 Loading subject: ccv
  📄 Loaded PDF: CCV UNIT-I.pdf (26 pages)
  📄 Loaded PDF: UNIT-V Cloud Computing and its Applications.pdf (11 pages)
  📄 Loaded PDF: CCV UNIT-II.pdf (13 pages)
  📄 Loaded PDF: UNIT-IV Secure distributed data storage in Cloud.pdf (26 pages)
  📄 Loaded PDF: UNIT-III Virtualization.pdf (32 pages)

✅ All subjects loaded!

big_data: 40 documents
pr: 1055 documents
ccv: 108 documents


In [None]:
#creating chunks with overlap
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

def create_chunks(documents):
     text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)
     text_chunks = []
     for doc in documents:
       # Ensure we are splitting the string content of the document
       if isinstance(doc, Document):
         text_chunks.extend(text_splitter.split_text(doc.page_content))
       elif isinstance(doc, str):
         text_chunks.extend(text_splitter.split_text(doc))
     return text_chunks

for subject,documents in subject_documents.items():
  # create chunks for each subject's documents
  subject_documents[subject] = create_chunks(documents)

# Optional: Print information about the created chunks
print("\nCreated chunks for each subject:")
for subject, chunks in subject_documents.items():
  print(f"  Subject: {subject}, Number of chunks: {len(chunks)}")


Created chunks for each subject:
  Subject: big_data, Number of chunks: 224
  Subject: pr, Number of chunks: 4582
  Subject: ccv, Number of chunks: 599


In [None]:
#load the chunks into faiss
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
e_m=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
for subject,chunks in subject_documents.items():
  vector_store=FAISS.from_texts(chunks,e_m)
  vector_store.save_local(subject)

In [None]:
#after retriving connect to the gemenai api key and refine the docs
from langchain.chains import RetrievalQA
import google.generativeai as genai

# Paste your API key here
genai.configure(api_key="AIzaSyDQS3Eh07DJoI4yBiCPInLBtIfyg8UDiRk")

# Initialize Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")




In [None]:
import os
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Initialize embedding model (must match the one used during saving)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Set base directory for FAISS indexes
data_path = "/content"
# /content/big_data/index.faiss

# Function to load the FAISS index for a given subject
def load_faiss_database(subject):
    index_dir = os.path.join(data_path, f"{subject}")  # no faiss_store_ prefix
    if os.path.exists(os.path.join(index_dir, "index.faiss")) and os.path.exists(os.path.join(index_dir, "index.pkl")):
        faiss_db = FAISS.load_local(index_dir, embedding_model, allow_dangerous_deserialization=True)
        return faiss_db
    else:
        return None


# Function to perform similarity search
def query_faiss(query, faiss_db):
    similar_docs = faiss_db.similarity_search(query, k=3)
    return similar_docs

# Ask user to select a subject
subject_choice = input("Select subject: ").strip().lower()

# Load corresponding FAISS database
faiss_db = load_faiss_database(subject_choice)

if faiss_db:
    print(f"FAISS vector store loaded for subject: {subject_choice}\n")
    while True:
        user_query = input("Ask a question (or type 'exit' to quit): ").strip()
        if user_query.lower() == "exit":
            print("Exiting.")
            break
        similar_docs = query_faiss(user_query, faiss_db)
        # Step 3: Combine context
        context = "\n\n".join([doc.page_content for doc in similar_docs]) # Extract page_content from Document objects

        # Step 4: Ask Gemini to answer using context
        prompt = f"""
        Based on the following reference material, answer the user's question.
        if it not contain provide your answer

        Reference:
        {context}

        Question: {user_query}
        """
        response = model.generate_content(prompt)
        print(response.text)
else:
    print(f"No FAISS index found for subject '{subject_choice}'.")

Select subject: pr
FAISS vector store loaded for subject: pr

Ask a question (or type 'exit' to quit): what is pattern recognize
The reference material does not explicitly define "pattern recognition." However, it implies that pattern recognition involves:

1.  **Achieving a "good" representation** of patterns through careful feature selection and extraction. This representation should reveal structural relationships and allow for the expression of the underlying model of the patterns.
2.  **Classification:** Identifying the class to which a given pattern belongs, often using training data with known class labels to classify test patterns.

Ask a question (or type 'exit' to quit): exit
Exiting.
