In [None]:
!pip install pdfplumber transformers torch

import pdfplumber
from transformers import pipeline, BartTokenizer
import torch
import time

# Detect once whether a CUDA GPU is usable; everything below runs on CPU otherwise.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

# The pipelines below take an integer device: 0 when CUDA is available, -1 otherwise.
_device_index = 0 if torch.cuda.is_available() else -1

# Checkpoint shared by the summarizer and by the standalone tokenizer below.
_bart_checkpoint = "facebook/bart-large-cnn"

# Champion: BART summarization pipeline used to extract the key information.
# The generation settings given here act as defaults for later calls.
extractor_model = pipeline(
    "summarization",
    model=_bart_checkpoint,
    tokenizer=_bart_checkpoint,
    max_length=250,
    min_length=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    framework="pt",
    device=_device_index,
)

# Challenger: zero-shot classifier used to grade the champion's summaries.
evaluator_model = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-zeroshot-v1",
    framework="pt",
    device=_device_index,
)

# Standalone BART tokenizer, used to truncate inputs before summarization.
tokenizer = BartTokenizer.from_pretrained(_bart_checkpoint)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Pages that yield no text (``extract_text()`` returns ``None`` or an
    empty string) are skipped; the remaining page texts are joined with
    newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenated page text, or the sentinel string
        ``"No text extracted from PDF."`` when nothing could be extracted.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract once per page: the call re-runs layout analysis, so
            # calling it for both the filter and the value doubles the cost.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)

    text = "\n".join(page_texts)
    return text if text else "No text extracted from PDF."

def split_text_into_chunks(text, max_words=700, overlap=100):
    """Split text into overlapping, whitespace-delimited word chunks.

    Consecutive chunks share ``overlap`` words so that context is carried
    across chunk boundaries. Chunking stops as soon as a chunk reaches the
    end of the text, so no trailing chunk is produced that would consist
    solely of words already present in the previous chunk.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < max_words``.

    Returns:
        A list of chunk strings (empty when ``text`` contains no words).

    Raises:
        ValueError: If ``max_words`` or ``overlap`` is out of range.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        # overlap == max_words would give a zero step, and a larger overlap a
        # negative step that silently yields no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        if start + max_words >= len(words):
            # This chunk already covers the tail of the text; any further
            # window would only repeat overlap words.
            break
    return chunks

def champion_model(input_text, additional_context=None):
    """Summarize a piece of text with the champion (BART) summarization model.

    When ``additional_context`` is non-empty it is prepended to the input,
    separated by a newline. The combined text is truncated to 1024 tokens
    with the module-level ``tokenizer`` before it is summarized.

    Args:
        input_text: The text to summarize.
        additional_context: Optional text placed before ``input_text``;
            ignored when ``None`` or empty.

    Returns:
        The generated summary string. Sampling is enabled, so repeated calls
        on the same input may return different summaries.
    """
    if additional_context:
        full_input = additional_context + "\n" + input_text
    else:
        full_input = input_text

    # Cut the input down to 1024 tokens and turn it back into plain text.
    tokens = tokenizer(full_input, return_tensors="pt", truncation=True, max_length=1024)
    truncated_text = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)

    # Decoding and re-tokenizing is not guaranteed to reproduce the exact same
    # token count, so also ask the pipeline to truncate; this keeps the
    # "never exceeds the model limit" guarantee independent of the round trip.
    summary = extractor_model(
        truncated_text,
        max_length=250,
        min_length=80,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        truncation=True,
    )
    return summary[0]['summary_text']

def challenger_model(champion_response):
    """Grade a champion summary with the zero-shot challenger model.

    The summary is classified against five quality labels. A confident
    "highly accurate and relevant" verdict (score above 0.75) earns the top
    rating. If that label ranks first but without enough confidence, the
    runner-up label's critique is used instead, so a probably-good summary
    is no longer reported as needing significant improvement.

    Args:
        champion_response: The summary text to evaluate.

    Returns:
        A ``(rating, feedback)`` tuple: an integer rating out of 10 and a
        short feedback sentence.
    """
    accept_label = "highly accurate and relevant"
    accept_threshold = 0.75

    # Critique labels mapped to the rating/feedback they produce.
    critiques = {
        "contains minor inconsistencies": (
            7, "There are minor inconsistencies. Consider refining key points."
        ),
        "lacks clarity": (
            6, "The response could be clearer and better structured."
        ),
        "contains irrelevant content": (
            5, "Contains irrelevant content. Focus more on extracting key details."
        ),
        "needs significant improvement": (
            4, "Needs significant improvement. Refine and focus more on relevance."
        ),
    }

    labels = [accept_label, *critiques]
    result = evaluator_model(champion_response, candidate_labels=labels)

    # The code relies on index 0 being the top-ranked label and index 1 the runner-up.
    ranked_labels = result["labels"]
    best_label = ranked_labels[0]

    if best_label == accept_label:
        if result["scores"][0] > accept_threshold:
            return 9, "The extracted information is well-structured and relevant."
        # Not confident enough to accept: fall back to the strongest critique.
        best_label = ranked_labels[1]

    return critiques[best_label]

def champion_challenger_pipeline(pdf_path, max_iterations=3):
    """Summarize a PDF chunk by chunk using a champion/challenger loop.

    The PDF text is split into overlapping chunks. For each chunk the
    champion model produces a summary and the challenger model rates it.
    A summary rated 8 or higher is accepted immediately; otherwise the
    challenger's feedback is passed to the next attempt as additional
    context. If no attempt is accepted within ``max_iterations``, the
    highest-rated attempt (the latest one on ties) is kept for that chunk.

    Args:
        pdf_path: Path of the PDF file to process.
        max_iterations: Maximum number of attempts per chunk; must be >= 1.

    Returns:
        The per-chunk summaries joined by blank lines, or an empty string
        when the PDF contains no extractable text.

    Raises:
        ValueError: If ``max_iterations`` is less than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    extracted_text = extract_text_from_pdf(pdf_path)
    if extracted_text == "No text extracted from PDF.":
        print("Error: PDF contains no extractable text.")
        return ""

    text_chunks = split_text_into_chunks(extracted_text, max_words=700, overlap=100)
    final_responses = []

    for i, chunk in enumerate(text_chunks):
        print(f"\nProcessing Chunk {i+1}/{len(text_chunks)}...")
        additional_context = ""
        best_rating = None
        best_response = None

        for iteration in range(1, max_iterations + 1):
            print(f"\nIteration {iteration}: Champion Model Processing...\n")
            champion_response = champion_model(chunk, additional_context)
            print("Champion Model Response:", champion_response)

            rating, feedback = challenger_model(champion_response)
            print(f"Challenger Model Rating: {rating}/10")
            print(f"Challenger Feedback: {feedback}\n")

            if rating >= 8:
                print("Final refined response for this chunk accepted.")
                final_responses.append(champion_response)
                break

            # Remember the strongest rejected attempt as the fallback.
            if best_rating is None or rating >= best_rating:
                best_rating, best_response = rating, champion_response

            if iteration < max_iterations:
                additional_context = feedback  # Use feedback as guidance
                time.sleep(2)  # Pause between attempts; pointless after the last one
        else:
            # Runs only when the loop was not left via ``break``, so a summary
            # accepted on the final attempt is never appended a second time.
            print("Max iterations reached. Using best available response for this chunk.")
            final_responses.append(best_response)

    return "\n\n".join(final_responses)

# PDF to process; replace with the path to your own file.
pdf_path = "/content/Ashish_22ucc026.pdf"

# Run the full champion/challenger pipeline on that file.
final_response = champion_challenger_pipeline(pdf_path)

# Show the heading and the combined result, each on its own line.
print("\nFinal Extracted Information:", final_response, sep="\n")




Device set to use cpu
Device set to use cpu



Processing Chunk 1/8...

Iteration 1: Champion Model Processing...

Champion Model Response: This paper presents a comprehensive technical analysis of application patterns and use cases. We evaluate existing FaaS practitioners. platforms, both commercial offerings and open-source solutions, comparing theirperformancecharacteristics,scalabilityfeatures, and development workflows. We examine emerging trends including edge-based FAAS deployment, hybrid archi- tectures, and specialized hardware acceleration for serverless A. Execution Model and Infrastructure workloads. Through experimental evaluation and case studies, we proposeimplementationstrategiesforefficientFaaSadoption.
Challenger Model Rating: 6/10
Challenger Feedback: The response could be clearer and better structured.


Iteration 2: Champion Model Processing...

Champion Model Response: Function-as-a-Service (FaaS) represents one of the most significant paradigm shifts in this evolution. Cloud computing continues to evolve rap