In [None]:
!pip install transformers langchain pypdf2 faiss-cpu sentence-transformers torch nltk scikit-learn



In [None]:
!pip install -U langchain-community



In [None]:
!pip install pypdf



In [None]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer,  AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from google.colab import files
from typing import List, Dict
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class Orb:
    """Question answering over uploaded PDFs using retrieval plus generation.

    State created in ``__init__``:
      * ``tokenizer`` / ``model``: the ``google/flan-t5-large`` seq2seq model
        that writes the answers.
      * ``verifier_tokenizer`` / ``verifier_model``: the
        ``facebook/bart-large-mnli`` sequence classifier used when scoring
        an answer.
      * ``embeddings``: sentence-transformer embeddings, used both for the
        vector store and for the relevance part of the confidence score.
      * ``vector_store``: FAISS index; ``None`` until a PDF has been indexed.
      * ``uploaded_pdfs``: ``{filename: {'size': int, 'processed': True}}``
        for every PDF that was loaded and indexed successfully.
    """

    # Column of the verifier's softmax output used as the NLI score.
    # NOTE(review): assumed to be the "entailment" class of
    # facebook/bart-large-mnli - confirm against the checkpoint's id2label.
    _NLI_SCORE_INDEX = 2
    # Relative weights of the two confidence components.
    _NLI_WEIGHT = 0.6
    _RELEVANCE_WEIGHT = 0.4
    # Blocks verbatim repetition of any 3-token sequence during generation.
    _NO_REPEAT_NGRAM_SIZE = 3

    def __init__(self):
        """Load the generation model, the verifier model and the embeddings."""
        # Initialize main QA model
        self.model_name = "google/flan-t5-large"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

        # Initialize verification model
        self.verifier_name = "facebook/bart-large-mnli"
        self.verifier_tokenizer = AutoTokenizer.from_pretrained(self.verifier_name)
        self.verifier_model = AutoModelForSequenceClassification.from_pretrained(self.verifier_name)

        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )

        self.vector_store = None
        self.uploaded_pdfs = {}

    def calculate_confidence(self, answer: str, context: str, question: str) -> float:
        """Score how well ``answer`` is supported by ``context``.

        The score is a weighted sum of (a) the verifier's probability for
        class ``_NLI_SCORE_INDEX`` with ``context`` as the first sequence and
        ``"{question} {answer}"`` as the second, and (b) the cosine similarity
        between the embeddings of ``answer`` and ``context``.

        Args:
            answer: Generated answer text.
            context: Retrieved passage text the answer was generated from.
            question: The user's question.

        Returns:
            A plain Python float clamped to ``[0.0, 1.0]``; ``0.0`` when the
            answer or the context is blank.
        """
        if not answer.strip() or not context.strip():
            # Nothing to verify; avoid scoring empty text.
            return 0.0

        inputs = self.verifier_tokenizer(context,
                                         f"{question} {answer}",
                                         return_tensors="pt",
                                         truncation=True,
                                         max_length=512)

        with torch.no_grad():
            outputs = self.verifier_model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        nli_score = probs[0][self._NLI_SCORE_INDEX].item()

        answer_embedding = self.embeddings.embed_query(answer)
        context_embedding = self.embeddings.embed_query(context)
        relevance_score = float(
            cosine_similarity([answer_embedding], [context_embedding])[0][0]
        )
        # Cosine similarity may be negative; a negative value must not
        # subtract from the confidence.
        relevance_score = max(0.0, relevance_score)

        final_score = (self._NLI_WEIGHT * nli_score
                       + self._RELEVANCE_WEIGHT * relevance_score)
        # Cast to a builtin float (the similarity is a NumPy scalar) and keep
        # the result inside the documented range.
        return float(min(1.0, max(0.0, final_score)))

    def upload_pdfs(self):
        """Upload PDF files through Colab, load them and add them to the index.

        Files whose name does not end in ``.pdf`` (case-insensitive) are
        ignored. A file is recorded in ``self.uploaded_pdfs`` only after it
        has been loaded and the resulting chunks were indexed, so a PDF that
        fails to load is never listed as uploaded.
        """
        print("Please upload your PDF files...")
        uploaded = files.upload()

        new_pages = []
        loaded = {}
        for filename, content in uploaded.items():
            if not filename.lower().endswith('.pdf'):
                continue

            try:
                pages = PyPDFLoader(filename).load()
                for page in pages:
                    page.metadata['source'] = filename
                    # Shift the loader's page number by one for display.
                    page.metadata['page'] = page.metadata.get('page', 0) + 1
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
                continue

            new_pages.extend(pages)
            loaded[filename] = {
                'size': len(content),
                'processed': True
            }
            print(f"Processed: {filename}")

        if not new_pages:
            print("No PDF files were uploaded or processed.")
            return

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=100,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            length_function=len
        )
        texts = text_splitter.split_documents(new_pages)

        if not texts:
            # e.g. image-only PDFs: there is nothing to embed, so do not
            # hand an empty list to the vector store.
            print("No extractable text was found in the uploaded PDFs.")
            return

        if self.vector_store is None:
            self.vector_store = FAISS.from_documents(texts, self.embeddings)
        else:
            self.vector_store.add_documents(texts)

        # Record the files only now that their content is searchable.
        self.uploaded_pdfs.update(loaded)

        print(f"\nProcessed {len(self.uploaded_pdfs)} PDFs successfully!")
        self.list_uploaded_pdfs()

    def list_uploaded_pdfs(self):
        """Print a numbered list of the PDFs recorded in ``self.uploaded_pdfs``."""
        print("\nUploaded PDFs:")
        for idx, filename in enumerate(self.uploaded_pdfs, 1):
            print(f"{idx}. {filename}")

    def _generate_text(self, prompt: str, max_length: int, min_length: int,
                       length_penalty: float) -> str:
        """Run beam-search generation on ``prompt`` and return the decoded text."""
        inputs = self.tokenizer(prompt,
                                max_length=1024,
                                truncation=True,
                                return_tensors="pt")

        with torch.no_grad():
            output_ids = self.model.generate(
                inputs.input_ids,
                # Pass the mask explicitly instead of letting generate() infer it.
                attention_mask=inputs.attention_mask,
                max_length=max_length,
                min_length=min_length,
                num_beams=4,
                length_penalty=length_penalty,
                early_stopping=True,
                # min_length can force the model past its natural stopping
                # point; without this it tends to repeat sentences verbatim.
                no_repeat_ngram_size=self._NO_REPEAT_NGRAM_SIZE
            )

        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

    def generate_answers(self, question: str, context: str) -> Dict[str, str]:
        """Generate a short and a long answer to ``question`` from ``context``.

        Args:
            question: The user's question.
            context: Retrieved passage text to answer from.

        Returns:
            ``{"specific": <one-sentence answer>, "detailed": <explanation>}``.
        """
        specific_prompt = f"Question: {question}\nContext: {context}\nProvide a direct, one-sentence answer:"
        specific_answer = self._generate_text(specific_prompt,
                                              max_length=50,
                                              min_length=10,
                                              length_penalty=1.0)

        detailed_prompt = f"Question: {question}\nContext: {context}\nProvide a detailed explanation:"
        detailed_answer = self._generate_text(detailed_prompt,
                                              max_length=200,
                                              min_length=50,
                                              length_penalty=2.0)

        return {
            "specific": specific_answer,
            "detailed": detailed_answer
        }

    def answer_question(self, question: str, k: int = 5) -> Dict:
        """Answer ``question`` from the indexed PDFs.

        Args:
            question: The user's question.
            k: Number of chunks to retrieve from the vector store.

        Returns:
            A dict with ``specific_answer``, ``detailed_answer``, ``sources``
            (unique ``{"file", "page"}`` entries in retrieval order) and
            ``confidence`` (percentage rounded to two decimals). When nothing
            has been indexed yet, both answers are a prompt to upload PDFs,
            ``sources`` is empty and ``confidence`` is ``0.0``.
        """
        if self.vector_store is None:
            return {
                "specific_answer": "Please upload PDFs first.",
                "detailed_answer": "Please upload PDFs first.",
                "sources": [],
                "confidence": 0.0
            }

        relevant_docs = self.vector_store.similarity_search(question, k=k)

        # Several retrieved chunks often come from the same page; report each
        # (file, page) pair once, in first-seen order.
        sources = []
        seen = set()
        for doc in relevant_docs:
            key = (doc.metadata['source'], doc.metadata['page'])
            if key in seen:
                continue
            seen.add(key)
            sources.append({"file": key[0], "page": key[1]})

        context = " ".join([doc.page_content.strip() for doc in relevant_docs])

        answers = self.generate_answers(question, context)
        confidence = self.calculate_confidence(answers["specific"], context, question)

        return {
            "specific_answer": answers["specific"],
            "detailed_answer": answers["detailed"],
            "sources": sources,
            "confidence": round(confidence * 100, 2)
        }

def main():
    """Run the interactive text menu for uploading PDFs and asking questions.

    Builds a new ``Orb`` and then loops until the user picks option 4. Input
    is stripped of surrounding whitespace, and blank questions are rejected
    before any retrieval or generation happens.
    """
    print("Initializing Orb...")
    orb = Orb()

    while True:
        print("\nOptions:")
        print("1. Upload PDFs")
        print("2. List uploaded PDFs")
        print("3. Ask a question")
        print("4. Exit")

        # Strip so that stray spaces around the digit are not treated as invalid.
        choice = input("Enter your choice (1-4): ").strip()

        if choice == "1":
            orb.upload_pdfs()
        elif choice == "2":
            orb.list_uploaded_pdfs()
        elif choice == "3":
            if orb.vector_store is None:
                print("Please upload PDFs first!")
                continue

            question = input("Enter your question: ").strip()
            if not question:
                # Do not run retrieval and generation on an empty query.
                print("Question cannot be empty.")
                continue

            result = orb.answer_question(question)

            print("\n=== Quick Answer ===")
            print(result["specific_answer"])

            print("\n=== Detailed Answer ===")
            print(result["detailed_answer"])

            print(f"\nConfidence: {result['confidence']}%")

            print("\nSources:")
            for source in result['sources']:
                print(f"- {source['file']}, Page {source['page']}")

        elif choice == "4":
            print("Goodbye!")
            break
        else:
            print("Invalid choice. Please try again.")

# Entry point: start the interactive menu loop defined in main().
if __name__ == "__main__":
    main()

Initializing Orb...

Options:
1. Upload PDFs
2. List uploaded PDFs
3. Ask a question
4. Exit
Enter your choice (1-4): 1
Please upload your PDF files...


Saving AI.pdf to AI (3).pdf
Processed: AI (3).pdf

Processed 1 PDFs successfully!

Uploaded PDFs:
1. AI (3).pdf

Options:
1. Upload PDFs
2. List uploaded PDFs
3. Ask a question
4. Exit
Enter your choice (1-4): 3
Enter your question: what is Unsupervised Machine Learning?

=== Quick Answer ===
They discover hidden patterns in data without the need for human intervention

=== Detailed Answer ===
These algorithms can analyze and cluster unlabeled data sets. They discover hidden patterns in data without the need for human intervention (hence, they are “unsupervised”) These algorithms can analyze and cluster unlabeled data sets. They discover hidden patterns in data without the need for human intervention

Confidence: 52.19%

Sources:
- AI (3).pdf, Page 23
- AI (3).pdf, Page 25
- AI (3).pdf, Page 23
- AI (3).pdf, Page 24
- AI (3).pdf, Page 23

Options:
1. Upload PDFs
2. List uploaded PDFs
3. Ask a question
4. Exit
Enter your choice (1-4): 4
Goodbye!


In [None]:
# Start another interactive session; main() constructs a new Orb on every call,
# so nothing indexed in an earlier session carries over.
if __name__ == "__main__":
    main()

Initializing Orb...

Options:
1. Upload PDFs
2. List uploaded PDFs
3. Ask a question
4. Exit
Enter your choice (1-4): 1
Please upload your PDF files...


Saving DeepAi.pdf to DeepAi (1).pdf
Processed: DeepAi (1).pdf

Processed 1 PDFs successfully!

Uploaded PDFs:
1. DeepAi (1).pdf

Options:
1. Upload PDFs
2. List uploaded PDFs
3. Ask a question
4. Exit
Enter your choice (1-4): 1
Please upload your PDF files...


Saving Ai_Ml.pdf to Ai_Ml (1).pdf
Processed: Ai_Ml (1).pdf

Processed 2 PDFs successfully!

Uploaded PDFs:
1. DeepAi (1).pdf
2. Ai_Ml (1).pdf

Options:
1. Upload PDFs
2. List uploaded PDFs
3. Ask a question
4. Exit
Enter your choice (1-4): 3
Enter your question: what is backpropagation?

=== Quick Answer ===
a practical application of the chain rule for derivatives

=== Detailed Answer ===
The backpropagation procedure to compute the gradient of an objective function with respect to the weights of a multilayer stack of modules is nothing more than a practical application of the chain rule for derivatives. subsequent module) . The backpropagation equation can be applied repeatedly to propagate gradients through all modules, starting from the output at the top (where the network produces its prediction) all the way to the bottom (where backpropagated gradients either grow or shrink at each time step, so over many time steps they typically explode or vanish. IV. COMPARISON BETWEEN MACHINE