# Question Answering


In [None]:
%pip install transformers -U
%pip install sentencepiece
%pip install Pillow
%pip install torch
%pip install numpy
%pip install matplotlib
%pip install tqdm
%pip install torchtext
%pip install torchsummary
%pip install torchviz
%pip install tensorboard
%pip install tensorboardX
%pip install torchmetrics
%pip install pytorch-lightning


In [5]:

from langchain_community.llms import HuggingFaceHub
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from getpass import getpass

# Prompt the user to securely input the Hugging Face API token
api_token = getpass("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")

### DQA

##### microsoft/layoutlmv2-base-uncased

In [13]:
from transformers import pipeline

# Set up pipeline with API key
generator = pipeline('text-generation',
                     api_key='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')

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


#### Key Concepts in the Code
###### OCR with Pytesseract:
Extracts words and their positions from the image.

###### Bounding Box Normalization:
Ensures coordinates match LayoutLMv2’s expected input range.

###### Tokenization and Encoding:
Prepares the image and text for input into the transformer model.

###### Answer Extraction:
Identifies the most probable span of text corresponding to the answer.

###### Pretrained Model:
Leverages LayoutLMv2’s capabilities to process structured documents and handle layout-aware questions.

In [None]:
from transformers import LayoutLMv2ForQuestionAnswering, LayoutLMv2Processor
from PIL import Image
import torch

# Load the model
model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

# Load the document image
image_path = "/home/vai/Desktop/chat_bot_experiment_one/cloud/test.png"  # Replace with your document image path
image = Image.open(image_path).convert("RGB")


# Define the question and context (text from the document)
question = "waht is langChan format ?"


# Define some example text and bounding boxes
text = ["What", "is", "LangChain", "format", "?"]
bbox = [[50, 50, 150, 100], [160, 50, 240, 100], [250, 50, 350, 100], [360, 50, 460, 100], [470, 50, 520, 100]]

# Process the inputs (image, text, and bounding boxes)
encoded_inputs = processor(image, text,return_tensors="pt", truncation=True)



# Get predictions
outputs = model(**encoded_inputs)



# Extract the start and end logits for the answer
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the answer tokens
start_index = torch.argmax(start_logits, dim=1).item()
end_index = torch.argmax(end_logits, dim=1).item()

# Decode the answer
tokens = encoded_inputs["input_ids"].squeeze()
answer = processor.tokenizer.decode(tokens[start_index:end_index + 1])

print(f"Question: {question}")
print(f"Answer: {answer}")


##### fine-tunned

In [28]:
from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering
from PIL import Image
import torch
import pytesseract

# Load model and processor
model = LayoutLMv2ForQuestionAnswering.from_pretrained("tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa")
processor = LayoutLMv2Processor.from_pretrained("tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa")

# Load image
image_path = "/home/vai/Desktop/chat_bot_experiment_one/cloud/test.png"
image = Image.open(image_path).convert("RGB")

# Extract words and bounding boxes using OCR
ocr_results = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
words = []
boxes = []

# Get image dimensions
image_width, image_height = image.size

for i in range(len(ocr_results['text'])):
    if ocr_results['text'][i].strip():  # Ignore empty text
        words.append(ocr_results['text'][i])
        
        # Extract bounding box coordinates
        x, y, w, h = (ocr_results['left'][i], ocr_results['top'][i],
                      ocr_results['width'][i], ocr_results['height'][i])

        # Normalize coordinates to 0–1000 range
        normalized_bbox = [
            int(1000 * (x / image_width)),
            int(1000 * (y / image_height)),
            int(1000 * ((x + w) / image_width)),
            int(1000 * ((y + h) / image_height))
        ]
        boxes.append(normalized_bbox)


# Question
question = "Tell me someting about covid?"

# Encode inputs with normalized bounding boxes
encoded_inputs = processor(
    image,
    words,
    boxes=boxes,  # Normalized bounding boxes
    return_tensors="pt",
    # truncation=True,
    
)

# Get predictions
outputs = model(**encoded_inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Extract the answer
start_index = torch.argmax(start_logits, dim=1).item()
end_index = torch.argmax(end_logits, dim=1).item()

tokens = encoded_inputs["input_ids"].squeeze()
answer = processor.tokenizer.decode(tokens[start_index:end_index + 1])

print(f"Question: {question}")
print(f"Answer: {answer}")


###### check quality

In [6]:
print(words)
print(boxes)

['[Document', '(page_content="This', 'transcript', 'is', 'provided', 'for', 'the', 'convenience', 'of', 'investors', 'only,', 'for', 'a', 'full', 'recording', 'pleas', 'e', 'see', 'the', 'Q4', '2021', 'Earnings', 'Call', 'webcast', '.\\n\\nAlphabet', 'Q4', '2021', 'Earnings', 'Call', 'February', '1,', '2022\\n\\nOperator:', 'Welcome', 'eve', 'ryone.', 'And', 'thank', 'you', 'for', 'standing', 'by', 'for', 'the', 'Alphabet', 'fourth', 'quarter', '2021', 'earnings', 'conference', 'call.', 'At', 'this', 'time,', 'all', 'participants', 'are', 'in', 'a', 'listen-only', 'mode.', 'After', 'the', 'speaker', 'presentation,', 'there', 'will', 'be', 'a', 'question', 'and', 'answer', 'session.', 'To', 'ask', 'a', 'question', 'during', 'the', 'session,', 'you', 'will', 'need', 'to', 'press', 'star', 'one', 'on', 'your', 'telephone.', 'If', 'you', 'require', 'any', 'further', 'a', 'ssistance,', 'please', 'press', 'star', 'zero.', 'I', 'would', 'now', 'like', 'to', 'hand', 'the', 'conference', 'over'

In [None]:
from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering
from PIL import Image
import torch

# Load the processor and modelprocessor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")

# Load an image
image = Image.open("cloud/test.png").convert("RGB")

# Define the question
question = "What is the question you want to ask?"

# Process the image and question
encoding = processor(image, question, return_tensors="pt")

# Forward pass
outputs = model(**encoding)
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the most likely beginning and end of answer with the argmax of the logits
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert the tokens to the answer
all_tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze().tolist())
answer = " ".join(all_tokens[start_index:end_index+1])

print("Answer:", answer)

#### naver-clova-ix/donut-base


In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel, pipeline
from datasets import load_dataset
import torch
from PIL import Image

# # Load the processor and model
# processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
# model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")

import re

processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# load document image from the DocVQA dataset
dataset = load_dataset("hf-internal-testing/example-documents", split="test")
image = dataset[0]["image"]

# prepare decoder inputs
task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
question = "When is the coffee break?"
prompt = task_prompt.replace("{user_input}", question)
decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids

pixel_values = processor(image, return_tensors="pt").pixel_values

outputs = model.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
)

sequence = processor.batch_decode(outputs.sequences)[0]
sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
print(processor.token2json(sequence))


#### microsoft/layoutlm-base-uncased

In [9]:
from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering

# Load the processor and model
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")

ImportError: 
LayoutLMv2Model requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


#### naver-clova-ix/donut-base-finetuned-docvqa

In [2]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image

# Load the processor and model
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

# Load an image
image = Image.open("cloud/test.png").convert("RGB")

# Define the question
question = "What is the question you want to ask?"

# Process the image and question
encoding = processor(image, question, return_tensors="pt")

# Forward pass
outputs = model(**encoding)
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the most likely beginning and end of answer with the argmax of the logits
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert the tokens to the answer
all_tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze().tolist())
answer = " ".join(all_tokens[start_index:end_index+1])

print("Answer:", answer)

ValueError: Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you have sentencepiece installed.

### VQA

#### openai/clip-vit-base-patch32

In [56]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

image = Image.open("cloud/catandbaby.jpg")

inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image 
probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities

print("Label probability of:", probs)

Label probability of: tensor([[0.9737, 0.0263]], grad_fn=<SoftmaxBackward0>)


#### Salesforce/blip2-flan-t5-xl

In [1]:
from PIL import Image
import requests
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

# device = "cuda" if torch.cuda.is_available() else "cpu"

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl", torch_dtype=torch.float16
)  # doctest: +IGNORE_RESULT

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)



# Example usage
# image = Image.open("cloud/catandbaby.jpg").convert("RGB")
image = Image.open("cloud/demo_pic.jpeg")



inputs = processor(images=image, return_tensors="pt")

outputs = model.generate(pixel_values=inputs["pixel_values"])
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]

print(generated_text)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.35s/it]


the poster for the movie frank vs frank


#### dandelin/vilt-b32-finetuned-vqa

In [9]:
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image

# Load the processor and model
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Load an image
image = Image.open("cloud/test.png").convert("RGB")

# Define the question
question = "What is the question you want to ask?"

# Process the image and question
encoding = processor(image, question, return_tensors="pt")

# Forward pass
outputs = model(**encoding)
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the most likely beginning and end of answer with the argmax of the logits
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert the tokens to the answer
all_tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze().tolist())
answer = " ".join(all_tokens[start_index:end_index+1])

print("Answer:", answer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AttributeError: 'SequenceClassifierOutput' object has no attribute 'start_logits'

#### openai/clip-vit-large-patch14

In [10]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Load the processor and model
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")

# Example usage
image = Image.open("cloud/catandbaby.jpg").convert("RGB")
inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the probabilities
print(probs)

tensor([[1.]], grad_fn=<SoftmaxBackward0>)


### QA

#### deepset/roberta-base-squad2

In [11]:
from transformers import pipeline

# Load the question answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Example usage
context = "Your context here."
question = "Your question here."
result = qa_pipeline(question=question, context=context)

print(f"Question: {question}")
print(f"Answer: {result['answer']}")

Device set to use cpu


Question: Your question here.
Answer: Your context here.


#### timpal0l/mdeberta-v3-base-squad2

In [12]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("timpal0l/mdeberta-v3-base-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("timpal0l/mdeberta-v3-base-squad2")

# Create a question answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Example usage
context = "Your context here."
question = "Your question here."
result = qa_pipeline(question=question, context=context)

print(f"Question: {question}")
print(f"Answer: {result['answer']}")

Device set to use cpu


Question: Your question here.
Answer:  context


### TQA

#### google/tapas-base-finetuned-wtq

In [28]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd

# Load the tokenizer and model
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")

# Example table
data = {
	"Actors": ["Brad Pitt", "Leonardo DiCaprio", "George Clooney"],
	"Number of movies": ["87", "53", "69"]
}
table = pd.DataFrame.from_dict(data)

# Example question
queries = ["How many movies has George Clooney played in?"]

# Tokenize the inputs
inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")

# Get the model outputs
outputs = model(**inputs)

# Convert the logits to predictions
logits = outputs.logits.detach()  # Detach the logits tensor
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
	inputs, logits
)

# Get the predicted answer
answer_coordinates = predicted_answer_coordinates[0]
answer = table.iat[answer_coordinates[0][0], answer_coordinates[0][1]]

print(f"Question: {queries[0]}")
print(f"Answer: {answer}")

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


ValueError: not enough values to unpack (expected 2, got 1)

#### microsoft/tapex-large-finetuned-wtq

In [27]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd

# Load the tokenizer and model
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")

# Example table
data = {
	"Actors": ["Brad Pitt", "Leonardo DiCaprio", "George Clooney"],
	"Number of movies": ["87", "53", "69"]
}
table = pd.DataFrame.from_dict(data)

# Example question
queries = ["How many movies has George Clooney played in?"]

# Tokenize the inputs
inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")

# Get the model outputs
outputs = model(**inputs)

# Convert the logits to predictions
logits = outputs.logits.detach()  # Detach the logits tensor
predicted_answer_coordinates, _ = tokenizer.convert_logits_to_predictions(
	inputs, logits
)

# Check if the predicted answer coordinates are valid
if predicted_answer_coordinates and predicted_answer_coordinates[0]:
	answer_coordinates = predicted_answer_coordinates[0]
	answer = table.iat[answer_coordinates[0][0], answer_coordinates[0][1]]
else:
	answer = "No answer found"

print(f"Question: {queries[0]}")
print(f"Answer: {answer}")

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


ValueError: not enough values to unpack (expected 2, got 1)

### Translation

#### Helsinki-NLP/opus-mt-en-de

In [22]:
from transformers import MarianMTModel, MarianTokenizer

# Load the model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Example usage
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
translated_tokens = model.generate(**inputs)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print(f"Translated text: {translated_text}")

ImportError: 
MarianTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


#### facebook/m2m100_418M

In [21]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the model and tokenizer
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Example usage
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
translated_tokens = model.generate(**inputs)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print(f"Translated text: {translated_text}")

ImportError: 
M2M100Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


#### google-t5/t5-small

In [20]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
model_name = "google/t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Example usage
text = "Translate English to German: Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Generated text: {generated_text}")

ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


#### Helsinki-NLP/opus-mt-zh-en

In [14]:
from transformers import MarianMTModel, MarianTokenizer

# Load the model and tokenizer
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Example usage
text = "你好，你怎么样？"
inputs = tokenizer(text, return_tensors="pt")
translated_tokens = model.generate(**inputs)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print(f"Translated text: {translated_text}")

ImportError: 
MarianTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


#### facebook/nllb-200-distilled-600M

In [19]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Load the model and tokenizer
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Example usage
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
translated_tokens = model.generate(**inputs)
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

print(f"Translated text: {translated_text}")

ImportError: 
M2M100Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


### Summerization

#### facebook/bart-large-cnn

In [18]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Example usage
text = "Artificial intelligence (AI) has significantly transformed various industries over the past decade, offering innovative solutions to complex problems. From healthcare to finance, AI-powered tools have streamlined processes, improved decision-making, and enhanced customer experiences. For instance, machine learning algorithms are being used in hospitals to predict patient outcomes and recommend personalized treatment plans, while financial institutions leverage AI for fraud detection and algorithmic trading. Despite these advancements, challenges such as ethical considerations, data privacy concerns, and the need for transparency in AI decision-making remain critical issues. As technology evolves, addressing these challenges will be crucial to ensure the responsible and equitable use of AI in society."
inputs = tokenizer(text, return_tensors="pt")
summary_ids = model.generate(inputs["input_ids"])
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(f"Summary: {summary}")

Summary: Artificial intelligence (AI) has significantly transformed various industries over the past decade. From healthcare to finance, AI-powered tools have streamlined processes, improved decision-making, and enhanced customer experiences. Despite these advancements, challenges such as ethical considerations, data privacy concerns, and the need for transparency in AI decision- making remain critical issues.


#### google/pegasus-multi_news

In [17]:
%pip install sentencepiece

from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the model and tokenizer
model_name = "google/pegasus-multi_news"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Example usage
text = "Your text here."
inputs = tokenizer(text, return_tensors="pt", truncation=True)
summary_ids = model.generate(inputs["input_ids"])
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(f"Summary: {summary}")

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


ImportError: 
PegasusTokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
