In [None]:
pip install langchain langchain-community pdfplumber pymupdf faiss-cpu sentence-transformers



In [None]:
import os
import fitz  # PyMuPDF for PDF parsing
import pdfplumber
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of a document, trying PyMuPDF first, then pdfplumber.

    PyMuPDF (``fitz``) is tried first; the text of every page is appended,
    each followed by a newline. If PyMuPDF raises, the error is printed and
    pdfplumber is used instead: the non-empty page texts are joined with
    newlines.

    Args:
        pdf_path: Path of the document to read.

    Returns:
        The extracted text as a string. Errors are printed, never raised.
        The result is an empty string when nothing could be read. If PyMuPDF
        failed part-way through and pdfplumber then fails as well, the pages
        PyMuPDF had already read are returned.
    """
    text = ""
    try:
        # Using PyMuPDF (fitz)
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text("text") + "\n"
    except Exception as fitz_error:
        print("Error with PyMuPDF, trying pdfplumber:", fitz_error)
        try:
            # Using pdfplumber as an alternative
            with pdfplumber.open(pdf_path) as pdf:
                # Parse every page exactly once: extract_text() redoes the
                # layout analysis on each call, so calling it both in the
                # filter and for the value would double the work.
                page_texts = (page.extract_text() for page in pdf.pages)
                # Pages without extractable text yield None/"" and are skipped.
                text = "\n".join(page_text for page_text in page_texts if page_text)
        except Exception as plumber_error:
            print("Error extracting text from PDF:", plumber_error)
    return text

In [None]:
# Function to store text in FAISS vector database
def store_in_vector_db(
    text,
    index_path="faiss_index",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    chunk_size=500,
    chunk_overlap=50,
):
    """Chunk ``text``, embed the chunks and save them as a FAISS index on disk.

    Args:
        text: The document text to index.
        index_path: Folder the FAISS index is written to.
        model_name: Name of the HuggingFace embedding model to load.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        None. The index is saved under ``index_path`` and a confirmation
        message is printed.

    Raises:
        ValueError: If ``text`` is empty/whitespace-only or splitting it
            produces no chunks. Raised up front instead of letting the index
            build fail on an empty chunk list.
    """
    if not text or not text.strip():
        raise ValueError("Cannot build a vector database from empty text.")

    # Split text into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = text_splitter.split_text(text)
    if not texts:
        raise ValueError("Text splitting produced no chunks to index.")

    # Load embeddings model
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # Store text in FAISS vector store
    vector_store = FAISS.from_texts(texts, embedding_model)

    # Save FAISS index for later use
    vector_store.save_local(index_path)
    print("Vector database saved!")


In [None]:
# Main function
def main(docx_path=None):
    """Run the pipeline: extract the document's text and store it in the vector DB.

    Args:
        docx_path: Path of the document to index. When omitted, the notebook's
            original Google Drive location is used.

    Returns:
        None. Progress and failures are reported with ``print``; when the path
        does not exist the function returns without doing any work.
    """
    if docx_path is None:
        # Default kept from the original notebook (Colab-mounted Google Drive).
        docx_path = "/content/drive/MyDrive/Practice_session/Bhavani GEN AI Updated Resume.docx"

    if not os.path.exists(docx_path):
        print("File not found!")
        return

    print("Extracting text from PDF...")
    text = extract_text_from_pdf(docx_path)

    # An empty string means extraction failed, so there is nothing to index.
    if text:
        print("Text extracted successfully!")
        print("Storing in vector database...")
        store_in_vector_db(text)
    else:
        print("Failed to extract text.")

if __name__ == "__main__":
    main()


Extracting text from PDF...
Text extracted successfully!
Storing in vector database...
Vector database saved!


In [None]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Reload the FAISS index from the "faiss_index" folder, using the same
# embedding model name that store_in_vector_db used to build it.
# NOTE(review): allow_dangerous_deserialization=True is NOT a "safe" mode. It
# opts in to pickle-based loading, which can run arbitrary code from the index
# files. Only load folders created locally or obtained from a trusted source.
vector_store = FAISS.load_local(
    "faiss_index",
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    allow_dangerous_deserialization=True  # explicit opt-in to pickle loading of the saved index
)

# Test query: retrieve the 4 most similar chunks and print the raw Document list
query = "what are the projects?"
results = vector_store.similarity_search(query, k=4)
print(results)

[Document(id='9d1a7bb9-175e-4f92-a7d1-09ef572fa2dc', metadata={}, page_content='Languages: Telugu, Hindi, English\nInterests: Craft Making, Listening Music.\nStrength: Leadership, Problem Solving Skills, Work Ethic, Team\nPlayer'), Document(id='c149a110-75ed-4607-a1b3-0b842805f1f5', metadata={}, page_content='BHAVANI AAGISETTI\n+91 7396836139 | bhavaniagisetti235@gmail.com\nwww.linkedin.com/in/bhavani-aagisetti-216436254|\nEducation\nANR Degree College\nB.Sc. Computers; GPA: 7.69% 2018 - 2021\nSri Chaitanya Junior College\nIntermediate; Grade: 72.70% 2016 - 2018\nZPH school\nSecondary Education; GPA: 9.5/10 2015 - 2016\nSKILLS SUMMARY\nProgramming Languages: C, Python\nDeep Learning Architectures: Neural Networks\nData Processing: Pandas, NumPy, SQL\nMachine Learning Frameworks: TensorFlow, Py Torch, Kera’s'), Document(id='c4a69589-4f5f-4aa3-a6fc-5e985d5aa2cc', metadata={}, page_content='WORK EXPERIENCE\nValue Soft Tech Solutions July 2021 – Present\nGenerative AI Engineer\nOverall exp