## Summarization processor - single file


In [1]:
# Importing libraries and modules

from typing import Optional
import vertexai
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1beta3 as documentai
from typing import Optional
import json
import os
from google.oauth2 import service_account
from IPython.display import Markdown as md

In [None]:
# Load service-account credentials from a JSON key file in the working directory.
credential = service_account.Credentials.from_service_account_file("service_account.json")

PROJECT_ID = "your project id"  # @param {type:"string"}
# Initialise the Vertex AI SDK with the project and the explicit credentials.
vertexai.init(project=PROJECT_ID, credentials=credential)

In [None]:
# Settings read by the cells below: they are passed to
# process_document_summarizer_sample, and project_id is also passed to
# vertexai.init.
project_id = "your project id"  # Replace with your Google Cloud project ID
location = "us" # Document AI location; format is "us" or "eu"
processor_id = "f9dfe3fa915572b9" # Create the processor before running this sample
file_path = "path/to/your/file.pdf"  # Path to the file you want to process
# Version segment of the processor-version resource name built in process_document.
# NOTE(review): "rc" is presumably a release-candidate alias - confirm against the processor.
processor_version = "rc"
mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
# NOTE(review): field_mask and processor_version_id are not referenced by any
# cell visible in this notebook.
field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Optional. Processor version to use

In [4]:
# Second vertexai.init call (the first is in the credentials cell): sets the
# project from project_id and the location to us-central1.
vertexai.init(project=project_id, location='us-central1')

In [5]:
from vertexai.generative_models import GenerativeModel
# Gemini model handle; generate_content is called on it further down with the
# translation prompt.
model = GenerativeModel( "gemini-1.5-flash")

In [6]:
# Prompt template for the translation step. The {language} placeholders are
# filled with str.format, and the summary text is appended after the template
# when the final prompt is built.
base_prompt="""
Task:
Translate the provided summary into the {language} in the form of a paragraph with atleast 10 lines. Ignore any bullet points in summary if present.

Instructions:
Language Verification:Check if the summary is already in the {language}.
Translation: If necessary, translate the summary into the {language}.
Output:Return the translated summary.

Example-

Input:
Summary: "The quick brown fox jumps over the lazy dog."
Language: Spanish

Output: 
El rápido zorro marrón salta sobre el perro perezoso.
"""

In [7]:
def process_document_summarizer_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> Optional[str]:
    """Summarize a local file with a Document AI processor version.

    Builds a schema override that requests a brief, bulleted summary, sends
    the file for online processing through ``process_document`` and returns
    the normalized text of the first entity that carries one.

    Args:
        project_id: Google Cloud project ID that owns the processor.
        location: Processor location, also used to build the API endpoint.
        processor_id: ID of the processor.
        processor_version: Version segment of the processor-version resource
            name.
        file_path: Path of the local file to process.
        mime_type: MIME type of the file.

    Returns:
        The summary text, or ``None`` when no returned entity has normalized
        text.
    """
    # For supported options, refer to:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1beta3/projects.locations.processors.processorVersions#summaryoptions
    summary_options = documentai.SummaryOptions(
        length=documentai.SummaryOptions.Length.BRIEF,
        format=documentai.SummaryOptions.Format.BULLETS,
    )

    # Single required "summary" string property carrying the summary options.
    properties = [
        documentai.DocumentSchema.EntityType.Property(
            name="summary",
            value_type="string",
            occurrence_type=documentai.DocumentSchema.EntityType.Property.OccurrenceType.REQUIRED_ONCE,
            property_metadata=documentai.PropertyMetadata(
                field_extraction_metadata=documentai.FieldExtractionMetadata(
                    summary_options=summary_options
                )
            ),
        )
    ]

    # Optional: Request specific summarization format other than the default
    # for the processor version.
    process_options = documentai.ProcessOptions(
        schema_override=documentai.DocumentSchema(
            entity_types=[
                documentai.DocumentSchema.EntityType(
                    name="summary_document_type",
                    base_types=["document"],
                    properties=properties,
                )
            ]
        )
    )

    # Online processing request to Document AI
    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    # Return the first non-empty normalized text; entities without one are skipped.
    for entity in document.entities:
        summary = print_entity(entity)
        if summary:
            return summary

    return None


def print_entity(entity: documentai.Document.Entity) -> str:
    """Return the normalized text of a Document AI entity.

    Despite its name, this helper prints nothing; it only extracts a value.

    Args:
        entity: Entity taken from a processed ``Document``. For a full list
            of fields for each processor see the processor documentation:
            https://cloud.google.com/document-ai/docs/processors-list

    Returns:
        ``entity.normalized_value.text`` converted with ``str`` when it is
        truthy; otherwise the falsy value is returned unchanged.
    """
    # Some other value formats in addition to text are available,
    # e.g. dates: `entity.normalized_value.date_value.year`
    normalized_text = entity.normalized_value.text

    if not normalized_text:
        return normalized_text

    return str(normalized_text)


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """Send a local file to a Document AI processor version for online processing.

    Args:
        project_id: Google Cloud project ID that owns the processor.
        location: Processor location; also the prefix of the API endpoint.
        processor_id: ID of the processor (it must already exist).
        processor_version: Version segment of the processor-version resource
            name.
        file_path: Path of the local file; it is read fully into memory.
        mime_type: MIME type sent along with the file content.
        process_options: Optional processing options forwarded in the request.

    Returns:
        The ``document`` field of the service response. For its attributes see
        https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    """
    # The endpoint is location-specific; it must be set explicitly for any
    # location other than "us".
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=endpoint_options
    )

    # Full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    version_name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Load the whole file as bytes.
    with open(file_path, "rb") as source_file:
        raw_bytes = source_file.read()

    # Assemble the request from the raw bytes and the optional options.
    raw_document = documentai.RawDocument(content=raw_bytes, mime_type=mime_type)
    request = documentai.ProcessRequest(
        name=version_name,
        raw_document=raw_document,
        process_options=process_options,
    )

    return client.process_document(request=request).document

In [12]:
# Ask the user for the target language; the answer fills the {language}
# placeholders of base_prompt when the final prompt is built.
language=input("Enter the language you want the summary in-")

In [13]:
# Run the Document AI summarizer and keep the summary text in `a`.
summary = process_document_summarizer_sample(
    project_id, location, processor_id, processor_version, file_path, mime_type
)
# The helper returns None when no entity carries a summary. Fail here instead
# of letting str(None) put the literal text "None" into the translation prompt.
if summary is None:
    raise ValueError("Document AI returned no summary for the given document.")
a = str(summary)


In [14]:
# Show the summary text as the cell output.
a

' • The document provides instructions on how to compile a .tex file to a .pdf file using pdflatex, including necessary tools and steps.\n\n• It also offers guidance on writing a document using LaTeX, including creating chapters, spell-checking, and incorporating PDF-specific packages.'

In [15]:
# Fill in the target language, append the summary text, and send the prompt
# to the Gemini model.
final_prompt = "".join((base_prompt.format(language=language), a))
response = model.generate_content(final_prompt)


In [16]:
# Render the model's reply as Markdown in the cell output.
md(response.text)

यह दस्तावेज़ pdflatex का उपयोग करके एक .tex फ़ाइल को .pdf फ़ाइल में संकलित करने के तरीके के बारे में निर्देश प्रदान करता है, जिसमें आवश्यक उपकरण और चरण शामिल हैं। इसमें LaTeX का उपयोग करके दस्तावेज़ लिखने के बारे में मार्गदर्शन भी शामिल है, जिसमें अध्याय बनाना, वर्तनी जांचना और PDF-विशिष्ट पैकेजों को शामिल करना शामिल है।
