In [11]:
# from google.colab import drive
# drive.mount('/content/drive')

In [12]:
pip install sentence-transformers faiss-cpu PyMuPDF




In [13]:
import os
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import PyPDF2
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from transformers import pipeline

# Step 1: Extract Text from PDF File
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Path to the PDF file; it is opened in binary mode.

    Returns:
        str: The page texts in document order, separated by newlines. Pages
        that yield no text are skipped.
    """
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` is a defensive guard in case a page yields None rather than a string.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page, which would corrupt sentence splitting later.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Step 2: Clean and Preprocess the Text
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Note that the pattern matches any whitespace (spaces, tabs, newlines),
    not only literal space characters.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Step 3: Split Text into Sentences
def split_text(text):
    """Break `text` into a list of sentences using NLTK's `sent_tokenize`."""
    sentences = sent_tokenize(text)
    return sentences

# Step 4: Create Embeddings for each sentence/paragraph
def create_embeddings(text_chunks, model):
    """Encode each text chunk with `model` and return the result.

    Args:
        text_chunks: The pieces of text to embed.
        model: An object exposing `encode(texts, convert_to_numpy=...)`.

    Returns:
        Whatever `model.encode` returns when asked for NumPy output.
    """
    return model.encode(text_chunks, convert_to_numpy=True)


In [15]:
# Step 5: Build FAISS Index with Cosine Similarity
def build_faiss_index(embeddings):
    """Build an inner-product FAISS index over L2-normalized embeddings.

    With unit-length vectors, the inner product equals cosine similarity.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension).

    Returns:
        A `faiss.IndexFlatIP` containing the normalized vectors.

    Raises:
        ValueError: If `embeddings` is not 2-D (for example, when no text
            chunks were embedded).
    """
    # Work on a private float32, C-contiguous copy: normalize_L2 rewrites its
    # argument in place, and the caller's array should be left untouched.
    vectors = np.array(embeddings, dtype=np.float32, order='C')
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim); got shape {vectors.shape}"
        )

    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product over unit vectors == cosine similarity
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index

In [26]:
# Step 6: Query and Retrieve Relevant Sentences
def retrieve_similar_sentences(query, model, index, text_chunks, threshold=0.3, top_k=3):
    """Return the text chunks most similar to `query`.

    Args:
        query: The user's question.
        model: Object exposing `encode(texts, convert_to_numpy=...)`.
        index: Search index exposing `search(vectors, k)`.
        text_chunks: Chunks in the same order they were added to `index`.
        threshold: Minimum similarity score a hit must exceed to be kept.
        top_k: Number of nearest neighbours to request from the index.

    Returns:
        list[str]: Matching chunks, best first, or the single-item list
        ["No relevant sentences found."] when nothing passes the threshold.

    Raises:
        ValueError: If `top_k` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    query_embedding = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)  # Unit length so inner product == cosine similarity
    scores, indices = index.search(query_embedding, top_k)

    # FAISS pads missing neighbours with label -1 when the index holds fewer
    # than top_k vectors; without the guard, -1 would select the last chunk.
    results = [
        text_chunks[i]
        for i, score in zip(indices[0], scores[0])
        if i >= 0 and score > threshold
    ]
    return results if results else ["No relevant sentences found."]


In [27]:
# Step 7: Curate Response Based on System Message
def curate_response(similar_sentences, system_message):
    """Format the retrieved sentences according to the requested mode.

    Args:
        similar_sentences: List of sentence strings to present.
        system_message: Response mode. "summarize" runs the sentences through
            a summarization pipeline; "insights only" keeps only sentences
            that start with '-'; any other value (including
            "detailed response") returns the sentences separated by blank lines.

    Returns:
        str: The formatted response text.
    """
    if system_message == "summarize":
        summarizer = _get_summarizer()
        summarized_text = summarizer(
            ' '.join(similar_sentences), max_length=130, min_length=30, do_sample=False
        )
        return summarized_text[0]['summary_text']

    if system_message == "insights only":
        insights = [sentence for sentence in similar_sentences if sentence.startswith('-')]  # Bullet points
        return '\n'.join(insights) if insights else "No key insights found."

    # "detailed response" and every unrecognized mode share the same output.
    return '\n\n'.join(similar_sentences)


def _get_summarizer():
    """Return the summarization pipeline, constructing it only on first use."""
    # Building the pipeline loads a model, so keep one instance on the function
    # object instead of rebuilding it for every "summarize" request.
    summarizer = getattr(_get_summarizer, "_instance", None)
    if summarizer is None:
        summarizer = pipeline("summarization")
        _get_summarizer._instance = summarizer
    return summarizer

In [29]:
def main(pdf_file='/content/apple-products.pdf', system_message='detailed response'):
    """Run an interactive question-answering loop over a PDF.

    The PDF is extracted, cleaned, split into sentences, embedded and indexed
    once; each question is then answered from the indexed sentences.

    Args:
        pdf_file: Path of the PDF to index.
        system_message: Response mode passed to `curate_response`.
    """
    # Steps 1-3: extract, clean and split the document text
    raw_text = extract_text_from_pdf(pdf_file)
    cleaned_text = clean_text(raw_text)
    text_chunks = split_text(cleaned_text)

    # A scanned or empty PDF yields no sentences; building an index would fail.
    if not text_chunks:
        print(f"No extractable text found in {pdf_file}; nothing to index.")
        return

    # Step 4: Load the embedding model and create embeddings for each sentence
    model = SentenceTransformer('paraphrase-mpnet-base-v2')  # A larger, more robust model for better embeddings
    embeddings = create_embeddings(text_chunks, model)

    # Step 5: Build the FAISS index
    index = build_faiss_index(embeddings)

    print("Welcome to the RAG Bot!")
    print("Ask your question, or type 'exit' to quit.")

    while True:
        try:
            query = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupted prompt ends the session without a traceback.
            break

        if not query:
            continue  # Nothing to search for; prompt again
        if query.lower() == 'exit':
            break

        # Step 6: Retrieve relevant sentences based on the query
        similar_sentences = retrieve_similar_sentences(query, model, index, text_chunks)

        # Step 7: Curate the response based on system message
        curated_response = curate_response(similar_sentences, system_message)

        # Display result
        print(f"Bot:\n{curated_response}\n")

if __name__ == "__main__":
    main()


Welcome to the RAG Bot!
Ask your question, or type 'exit' to quit.

You: what are Apple products?
Bot:
Apple and the Apple logo are trademarks of Apple Inc. registered in the U.S.and other countries and regions.

* Based on data reported to Apple by its suppliers.

We’re also finding new and better ways to get Apple products into people’s hands.


You: List the Aplle products
Bot:
Eligible products are those in a product category for which EPEAT registration exists, including workstations, desktops, laptops, displays, mobile phones, and tablets.

52 Breakdown of U.S. retail packaging by weight.

Our suppliers use this library to select the materials they use in our products.


You: what is IOS?
Bot:
iOS is a trademark or registered trademark of Cisco in the U.S. and other countries and is used under license.

The intended users of the assurance statement are the stakeholders of Apple.

The intended users of the assurance statement are the stakeholders of Apple .


You: Which Apple prod