## Part 1: PDF to Text

### Part 1.a: Using PyPDF2

#### Import libraries

In [1]:
from PyPDF2 import PdfWriter, PdfReader
import os, errno
import PyPDF2
from subprocess import call
import sys

In [2]:
## Function to split the PDF input file
def split(directory, filename):
    inputpdf = PdfReader(open(filename, "rb"))
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    for i in range(len(inputpdf.pages)):
        output = PdfWriter()
        output.add_page(inputpdf.pages[i])
        with open(directory+ "/%s.pdf" % i, "wb") as outputStream:
            output.write(outputStream)
    print(output)

#### Extracting Text from 1 file

In [3]:
filename = "Dataset/2312.15012.pdf"
directory = "splitted/"+filename

In [4]:
split(directory, filename)
pdfFileObj = open(filename, 'rb')
pdfReader = PyPDF2.PdfReader(pdfFileObj)

extracted_text = ""
for i in range(len(pdfReader.pages)):
    splitted_file_name = directory + "/" + repr(i)
    #print("-------",splitted_file_name,"---------")
    pdfFileObj = open(splitted_file_name+".pdf", 'rb')
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    text = pdfReader.pages[0].extract_text()
    #print(text)
    extracted_text += text

print(f"The Extracted Text from file is {len(extracted_text)} characters long!")

<PyPDF2._writer.PdfWriter object at 0x000002748D287AD0>
The Extracted Text from file is 60101 characters long!


#### Extracting Texts from manually downloaded PDFs

In [5]:
import os
import pandas as pd
files = [f for f in os.listdir("Dataset") if os.path.isfile(f)]

from pathlib import Path

pdf_files = Path("Dataset").glob("*.pdf")

df_manual = pd.DataFrame(columns = ['PDF','Text'])

for f in pdf_files:
    print("Filename is : ",f)
    directory = "splitted/"+filename
    split(directory, f)
    pdfFileObj = open(filename, 'rb')
    pdfReader = PyPDF2.PdfReader(pdfFileObj)

    extracted_text = ""
    for i in range(len(pdfReader.pages)):
        splitted_file_name = directory + "/" + repr(i)
        #print("-------",splitted_file_name,"---------")
        pdfFileObj = open(splitted_file_name+".pdf", 'rb')
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        text = pdfReader.pages[0].extract_text()
        #print(text)
        extracted_text += text

    print(f"Extracted text length is {len(extracted_text)}")

    df_manual.loc[len(df_manual)] = [f,extracted_text]

[]
Filename is :  Dataset\2312.15012.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A44EEE10>
Extracted text length is 60101
Filename is :  Dataset\2312.15018.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A3DDE850>
Extracted text length is 51730
Filename is :  Dataset\2312.15038.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A42D5090>
Extracted text length is 45931
Filename is :  Dataset\2312.15050.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A4605250>
Extracted text length is 70436
Filename is :  Dataset\2312.15056.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A3C2EF10>
Extracted text length is 41600
Filename is :  Dataset\2312.15483.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A45FE850>
Extracted text length is 35781
Filename is :  Dataset\2312.15761.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A4521A10>
Extracted text length is 40405
Filename is :  Dataset\2312.15766.pdf
<PyPDF2._writer.PdfWriter object at 0x00000274A423DC50>
Extracted text length is 268

In [6]:
df_manual.head()

Unnamed: 0,PDF,Text
0,Dataset\2312.15012.pdf,"DRAFT VERSION DECEMBER 27, 2023\nTypeset using..."
1,Dataset\2312.15018.pdf,"EA51CH25_Gabriel\nARjats.cls May 12, 2023 18:0..."
2,Dataset\2312.15038.pdf,Prepared for submission to JCAP\nConstraining ...
3,Dataset\2312.15050.pdf,ARTICLE\nLarge planets may not form fractional...
4,Dataset\2312.15056.pdf,arXiv:2312.15056v1 [astro-ph.CO] 22 Dec 2023...


### Part 1.b: Using EasyOCR

In [7]:
import fitz #PyMuPDF
import easyocr
from PIL import Image
import numpy as np
import os

In [8]:
def pdf_to_images(pdf_path):
    images = []
    pdf_document = fitz.open(pdf_path)

    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        image = page.get_pixmap()
        img = Image.frombytes("RGB", (image.width, image.height), image.samples)
        images.append(img)

    pdf_document.close()
    return images

In [9]:
def extract_text_from_images(images, language='en'):
    reader = easyocr.Reader([language])

    extracted_text = ""
    for img in images:
        img_array = np.array(img)
        result = reader.readtext(img_array)
        for detection in result:
            text = detection[1]
            extracted_text += text + "\n"

    return extracted_text

In [10]:
def save_text_to_file(text, file_path):
    with open(file_path, 'a', encoding='utf-8') as text_file:
        text_file.write(text)

#### Extract Text with EasyOCR from 1 file

In [11]:
filename = "ExtraFiles/sample.pdf"
directory = "splitted/"+filename
directory_text = "textOCR/"

In [12]:
split(directory, filename)
pdfFileObj = open(filename, 'rb')
pdfReader = PyPDF2.PdfReader(pdfFileObj)

for i in range(len(pdfReader.pages)):
    splitted_file_name = directory + "/" + repr(i)
    print("-------",splitted_file_name,"---------")
    pdf_images = pdf_to_images(splitted_file_name+".pdf")
    print("Length of each pdf is :", len(pdf_images))

    extracted_text = extract_text_from_images(pdf_images)
    print("=========================================")
    #print(extracted_text)
    
    output_filename = directory_text+repr(i)+".txt"
    save_text_to_file(extracted_text, output_filename)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


<PyPDF2._writer.PdfWriter object at 0x00000274A9BCB410>
------- splitted/ExtraFiles/sample.pdf/0 ---------
Length of each pdf is : 1


### Part 1.c: Using TesseractOCR

In [13]:
import pytesseract
from PIL import Image
from PIL.JpegImagePlugin import JpegImageFile
import easyocr

In [14]:
def open_image(path: str) -> JpegImageFile:
    return Image.open(path)

In [15]:
def TesseractOCR(image: JpegImageFile) -> str:
    return pytesseract.image_to_string(image, lang="eng")

In [16]:
def EasyOCR(image: JpegImageFile) -> str:
    reader = easyocr.Reader(['en']) # initialize OCR
    result = reader.readtext(image) # input image
    return "\n".join([res[1] for res in result])

In [17]:
import fitz
import pytesseract
from PIL import Image


def pdf_to_images(pdf_path, output_folder):
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    pdf_document = fitz.open(pdf_path)
    extracted_text = ""
    for page_number in range(pdf_document.page_count):
        page = pdf_document[page_number]
        image = page.get_pixmap()
        
        image_path = f"{output_folder}/page_{page_number + 1}.jpg"
        image.save(image_path)

        text = pytesseract.image_to_string(Image.open(image_path))
        extracted_text += text + "\n"
    
    pdf_document.close()
    return extracted_text

#### Extracting Text from 1 file

In [18]:
# Example usage:
text = pdf_to_images("ExtraFiles/automatic text summarization.pdf", "pdf2jpg")

In [19]:
print(f"Extracted text length is {len(text)}")

Extracted text length is 92200


In [20]:
import pytesseract
from PIL import Image
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
print(pytesseract.image_to_string(Image.open('pdf2jpg\page_22.jpg')))

2 5 agua st i pain 15.202) 1257

ome the seman confusion caused by ambiguus orsynonymous
‘words (Wr etal2017) Without uring NUP te generated xtc
Se smmaties may sles om lack of caheson and semantics
‘a if texts canain mall tople, the generated surnry
‘ay not be balanced (Guts Lal 2010,

‘hllenges Relate wo Evaluation ofthe Generated Summary
valuaingsunumares eter automa or manual) ail
fairs tts very llengigtodein and we goad standard
{orevalate whether he summaries generated fom the ATS S/=
tems ae goad enough (Lotte 2017) a2) ery hard
{o'fed oat what a tel (or even covet) sunt) Hs becuse
fe ATS systems can generate good Sumsmavies that ate dierent
faethe human gencatd suas (Moraanch lila
2017) Humans ae diferent and they Sele ently erat
Seetences for the erative statis aed ay paapate the
Stacie summaries na completely dierent wa 1s ery ub
jective ta dents good smy Meer, anal esas
‘ay ot be stable for al yes of uma! el, 2017)
“here is ned to propose new aptaaces and selon fo the
ulate 

#### Extracting Text from all manually downloaded files

In [21]:
import os
import pandas as pd
files = [f for f in os.listdir("Dataset") if os.path.isfile(f)]
from pathlib import Path

pdf_files = Path("Dataset").glob("*.pdf")

df_manual_1 = pd.DataFrame(columns = ['PDF','Text'])

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


for f in pdf_files:
    print("Filename is : ",f)
    text = pdf_to_images(f, "pdf2jpg")
    #print("Text is : ",text)

    text2 = ""
    for img in os.listdir('pdf2jpg/'):
        #print("Image is \n",img)

        img_path = 'pdf2jpg/'+img
        text2 += pytesseract.image_to_string(Image.open(img_path))

    print(f"Extracted text length is {len(text2)}")

    df_manual_1.loc[len(df_manual_1)] = [f,text2]

Filename is :  Dataset\2312.15012.pdf
Extracted text length is 91894
Filename is :  Dataset\2312.15018.pdf
Extracted text length is 62539
Filename is :  Dataset\2312.15038.pdf
Extracted text length is 89613
Filename is :  Dataset\2312.15050.pdf
Extracted text length is 81765
Filename is :  Dataset\2312.15056.pdf
Extracted text length is 100000
Filename is :  Dataset\2312.15483.pdf
Extracted text length is 95009
Filename is :  Dataset\2312.15761.pdf
Extracted text length is 96581
Filename is :  Dataset\2312.15766.pdf
Extracted text length is 78141
Filename is :  Dataset\2312.15992.pdf
Extracted text length is 72725
Filename is :  Dataset\2312.16147.pdf
Extracted text length is 85607


In [22]:
df_manual_1

Unnamed: 0,PDF,Text
0,Dataset\2312.15012.pdf,3\n\n5\n\nastro-ph.GA] 22 Dec 20:\n\narXiv:231...
1,Dataset\2312.15018.pdf,iggy ANNUAL\nREVIEWS\n\nAnmual Review of Earth...
2,Dataset\2312.15038.pdf,23\n\nDec 20:\n\nDl\n\nv1 [astro-ph.CO]\n\n038...
3,Dataset\2312.15050.pdf,nature\n\nCOMMUNICATIONS\n\nARTICLE\nEEE orev\...
4,Dataset\2312.15056.pdf,2023\n\n6v1 [astro-ph.CO] 22 Dec\n\n5\n\n312.1...
5,Dataset\2312.15483.pdf,3v1\n\n4\n\n5\n\narXiv:2312.1\n\nObservational...
6,Dataset\2312.15761.pdf,23\n\nDec 20:\n\n25\n\nastro-ph.CO]\n\n76lvi |...
7,Dataset\2312.15766.pdf,2023\n\n5 Dec\n\nastro-ph.CO] 2\n\narXiv:2312....
8,Dataset\2312.15992.pdf,arXiv:2312.15992v1 [astro-ph.CO] 26 Dec 2023\n...
9,Dataset\2312.16147.pdf,023\n\n2\n\narXiv:2312.16147v1 [astro-ph.CO] 2...


## Part 2: Text Summarization

#### Part 2.a: BART Summarization

In [23]:
from pdfquery import PDFQuery

import warnings
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, BartForConditionalGeneration

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

#### Summarizing 1 paragraph

In [24]:
text_to_summarize = "We aim to create a Deep Learning-based Text Summarization and Question-Answering model. With the introduction and rapid adoption of Large Language Models, especially the widespread use of GPT4All, research into Text Summarization and Question Answering Systems (QAS) has increased multifold. While LLM-based applications like ChatGPT allows users to enter textual prompt, there is often restriction in terms of the number of words they can write to set the context and only premium versions of such applications permit users to enter multimodal input data like images, PDFs, et cetera. We aim to create a model that will allow for both text and PDF input and help users understand the document, summarize it, and ask follow-up questions out of their input text or PDF file. Text summarization is a valuable tool for condensing extensive raw data into human-readable information. It falls into the category of Extractive and Abstractive methods. Extractive methods of summarization minimize the burden of summarization by choosing from the actual text a subset of sentences that are relevant. Three key components in QA systems include question classification, information retrieval, and answer extraction. Question classification categorizes submitted questions based on their types, a critical role. Information retrieval is crucial in QA as it determines if correct answers exist in a document. Answer extraction seeks to retrieve the user's requested answer."

In [25]:
inputs = tokenizer(text_to_summarize, max_length=1024, return_tensors="pt", truncation=True)
#print(inputs)

# Generate Summary
summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=30, max_length=100)
y = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(y)

We aim to create a Deep Learning-based Text Summarization and Question-Answering model. The model will allow for both text and PDF input and help users understand the document, summarize it, and ask follow-up questions.


In [28]:
import torch
from rouge import Rouge

# Define the generated summary and the reference summary
generated_summary = y
reference_summary = "We have created a model which will allow users to enter PDF files as input and be able to summarize those file through Text Summarization and do Question Answering with those generated summaries."
# Initialize the ROUGE object
rouge = Rouge()
# Calculate ROUGE for the generated and reference summaries
scores = rouge.get_scores(generated_summary, reference_summary)
# Print the results
print(scores)

[{'rouge-1': {'r': 0.43333333333333335, 'p': 0.43333333333333335, 'f': 0.4333333283333334}, 'rouge-2': {'r': 0.125, 'p': 0.12121212121212122, 'f': 0.12307691807810672}, 'rouge-l': {'r': 0.36666666666666664, 'p': 0.36666666666666664, 'f': 0.36666666166666667}}]


#### Summarizing Manually Downloaded PDFs

In [29]:
df_summary = pd.DataFrame(columns=['PDF','Summary'])

for index,row in df_manual.iterrows():
    inputs = tokenizer(row['Text'], max_length=1024, return_tensors="pt", truncation=True)
    #print(inputs)

    # Generate Summary
    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=30, max_length=100)
    y = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    #print(y)
    print("1 File Done!")

    df_summary.loc[len(df_summary)] = [row['PDF'],y]

1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!


In [30]:
df_summary

Unnamed: 0,PDF,Summary
0,Dataset\2312.15012.pdf,Two Distinct Classes of Quiescent Galaxies at ...
1,Dataset\2312.15018.pdf,The Annual Review of Earth and Planetary Scien...
2,Dataset\2312.15038.pdf,Constraining general multi-field ...
3,Dataset\2312.15050.pdf,Large planets may not form fractionally largem...
4,Dataset\2312.15056.pdf,We discuss the present state and planned updat...
5,Dataset\2312.15483.pdf,Observational constraints on extended Proca-Nu...
6,Dataset\2312.15761.pdf,Higher order clustering of Ly αforest provides...
7,Dataset\2312.15766.pdf,Inflation with the Gauss-Bonnet term is a two-...
8,Dataset\2312.15992.pdf,The galaxy bispectrum in the Spherical Fourier...
9,Dataset\2312.16147.pdf,Theory of quantum field theory and cosmologica...


#### Part 2.b: BERT Summarization

In [31]:
from summarizer.bert import Summarizer

model = Summarizer()
model(text_to_summarize)

'We aim to create a Deep Learning-based Text Summarization and Question-Answering model. With the introduction and rapid adoption of Large Language Models, especially the widespread use of GPT4All, research into Text Summarization and Question Answering Systems (QAS) has increased multifold. We aim to create a model that will allow for both text and PDF input and help users understand the document, summarize it, and ask follow-up questions out of their input text or PDF file.'

#### Sentence BERT Summarization

In [32]:
from summarizer.sbert import SBertSummarizer

model = SBertSummarizer('paraphrase-MiniLM-L6-v2')
result = model(text_to_summarize, num_sentences=3)

In [33]:
result

'We aim to create a Deep Learning-based Text Summarization and Question-Answering model. With the introduction and rapid adoption of Large Language Models, especially the widespread use of GPT4All, research into Text Summarization and Question Answering Systems (QAS) has increased multifold. Three key components in QA systems include question classification, information retrieval, and answer extraction.'

In [36]:
import torch
from torchtext.data.metrics import bleu_score

# define the source text and reference text
reference_text = ["We have created a model which will allow users to enter PDF files as input and be able to summarize those file through Text Summarization and do Question Answering with those generated summaries."]
# define the text generated by the model
generated_text = [result]
# calculate the BLEU score
score = bleu_score(generated_text, reference_text)
print(f'BLEU Score: {score*100:.2f}')

BLEU Score: 0.00


In [37]:
df_summary_1 = pd.DataFrame(columns=['PDF','Summary'])

for index,row in df_manual.iterrows():
    result = model(row['Text'], num_sentences=3)
    print("1 File Done!")

    df_summary_1.loc[len(df_summary_1)] = [row['PDF'],result]


1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!
1 File Done!


In [38]:
df_summary_1

Unnamed: 0,PDF,Summary
0,Dataset\2312.15012.pdf,"DRAFT VERSION DECEMBER 27, 2023\nTypeset using..."
1,Dataset\2312.15018.pdf,"EA51CH25_Gabriel\nARjats.cls May 12, 2023 18:0..."
2,Dataset\2312.15038.pdf,We investigate how well the SPHEREx all-sky su...
3,Dataset\2312.15050.pdf,ARTICLE\nLarge planets may not form fractional...
4,Dataset\2312.15056.pdf,CO] 22 Dec 2023Present and future of CosmoLat...
5,Dataset\2312.15483.pdf,Observational constraints on extended Proca-Nu...
6,Dataset\2312.15761.pdf,"94, 247\n©SAIt 2023 Memorie della\nHigher orde..."
7,Dataset\2312.15766.pdf,TU-1217\nKEK-QUP-0036\nProbing Gauss-Bonnet-Co...
8,Dataset\2312.15992.pdf,In this work we develop a formalism for\nthe b...
9,Dataset\2312.16147.pdf,"Draft version December 27, 2023\nTypeset using..."


#### Visualizing Summaries: Word CLoud

In [39]:
import nltk
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Create stopword list:
STOP_WORDS.add('otter')
stopwords = set(list(STOP_WORDS) +list(stopwords.words()))
stopwords.update(["br", "href", 'https'])
stopwords.update(stopwords)

textt = " ".join(desc for desc in df_summary.Summary)
print(textt)
wordcloud = WordCloud(stopwords=stopwords,background_colur='white').generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Part 3: Question Answering

#### Part 3.a: Langchain QA

In [None]:
from langchain.document_loaders import PyPDFLoader
loaders = PyPDFLoader("automatic text summarization.pdf")

#Load the document by calling loader.load()
pages = loaders.load()

print(len(pages))
print(pages[0].page_content[0:500])

In [None]:
docs = []

loaders = [PyPDFLoader("long-sample.pdf")]

for loader in loaders:
    docs.extend(loader.load())

In [None]:
# Define the Text Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

#Create a split of the document using the text splitter
splits = text_splitter.split_documents(docs)

In [None]:
import os
os.environ['OPENAI_API_KEY'] = ''

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

In [None]:
persist_directory = './chroma_db'

# Create the vector store
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

print(vectordb._collection.count())

In [None]:
question = "what did it say about Generalized Linear Models ?"

In [None]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [None]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [None]:
# Initilaize chain
# Set return_source_documents to True to get the source document
# Set chain_type to prompt template defines
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [None]:
result = qa_chain({"query": question})

# Check the result of the query
result["result"]

# Check the source document from where we
result["source_documents"][0]

In [None]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type="map_reduce"
)
result = qa_chain_mr({"query": question})
result["result"]

In [None]:
qa_chain_r = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type="refine"
)
result = qa_chain_r({"query": question})
result["result"]

#### Part 3.b: Falcon QA

In [41]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering

question = "On which date did Swansea City play its first Premier League game?"
context = "In 2011, a Welsh club participated in the Premier League for the first time after Swansea City gained promotion. The first Premier League match to be played outside England was Swansea City's home match at the Liberty Stadium against Wigan Athletic on 20 August 2011. In 2012\u201313, Swansea qualified for the Europa League by winning the League Cup. The number of Welsh clubs in the Premier League increased to two for the first time in 2013\u201314, as Cardiff City gained promotion, but Cardiff City was relegated after its maiden season."

tokenizer = AutoTokenizer.from_pretrained("Falconsai/question_answering_v2")
inputs = tokenizer(question, context, return_tensors="pt")

model = AutoModelForQuestionAnswering.from_pretrained("Falconsai/question_answering_v2")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'20 august 2011'

#### Part 3.c BERT QA

In [42]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import warnings
warnings.simplefilter("ignore")

weight_path = "kaporter/bert-base-uncased-finetuned-squad"
# loading tokenizer
tokenizer = BertTokenizer.from_pretrained(weight_path)
#loading the model
model = BertForQuestionAnswering.from_pretrained(weight_path)

In [43]:
question = "How many parameters does BERT-large have?"
context = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."

In [44]:
input_ids = tokenizer.encode(question, context)
print (f'We have about {len(input_ids)} tokens generated')

tokens = tokenizer.convert_ids_to_tokens(input_ids)

sep_idx = tokens.index('[SEP]')

# we will provide including [SEP] token which seperates question from context and 1 for rest.
token_type_ids = [0 for i in range(sep_idx+1)] + [1 for i in range(sep_idx+1,len(tokens))]

# Run our example through the model.
out = model(torch.tensor([input_ids]), # The tokens representing our input text.
            token_type_ids=torch.tensor([token_type_ids]))

start_logits,end_logits = out['start_logits'],out['end_logits']
# Find the tokens with the highest `start` and `end` scores.
answer_start = torch.argmax(start_logits)
answer_end = torch.argmax(end_logits)

ans = ''.join(tokens[answer_start:answer_end])
print('Predicted answer:', ans)

We have about 70 tokens generated
Predicted answer: 340


In [45]:
question = "What is the AIM of this project?"
context = "We aim to create a Deep Learning-based Text Summarization and Question-Answering model. The model will allow for both text and PDF input and help users understand the document, summarize it, and ask follow-up questions."

In [46]:
input_ids = tokenizer.encode(question, context)
print (f'We have about {len(input_ids)} tokens generated')

tokens = tokenizer.convert_ids_to_tokens(input_ids)

sep_idx = tokens.index('[SEP]')

# we will provide including [SEP] token which seperates question from context and 1 for rest.
token_type_ids = [0 for i in range(sep_idx+1)] + [1 for i in range(sep_idx+1,len(tokens))]

# Run our example through the model.
out = model(torch.tensor([input_ids]), # The tokens representing our input text.
            token_type_ids=torch.tensor([token_type_ids]))

start_logits,end_logits = out['start_logits'],out['end_logits']
# Find the tokens with the highest `start` and `end` scores.
answer_start = torch.argmax(start_logits)
answer_end = torch.argmax(end_logits)

ans = ''.join(tokens[answer_start:answer_end])
print('Predicted answer:', ans)

We have about 59 tokens generated
Predicted answer: tocreateadeeplearning-basedtextsum##mar##izationandquestion-answeringmodel.themodelwillallowforbothtextandpdfinputandhelpusersunderstandthedocument,sum##mar##izeit,andaskfollow-up
