In [1]:
#Importing required dependencies
from transformers import pipeline
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io

In [2]:
def pdf2txt(inPDFfile, outTXTFile):
    inFile = open(inPDFfile, 'rb')
    resMgr = PDFResourceManager()
    retData = io.StringIO()
    TxtConverter = TextConverter(resMgr, retData, laparams=LAParams())
    interpreter = PDFPageInterpreter(resMgr, TxtConverter)
    
    #Process each page in pdf file
    for page in PDFPage.get_pages(inFile):
        interpreter.process_page(page)
        
    txt = retData.getvalue()
    
    #save output data to a txt file
    with open(outTXTFile, 'w') as f:
        f.write(txt)

In [3]:
#main
inPDFfile = 'test.pdf'
outTXTFile = 'file.txt'
pdf2txt(inPDFfile, outTXTFile)

In [4]:
#Creating summarizer instance
summarizer = pipeline("summarization")

In [5]:
#Open and read text file 
with open('file.txt') as f:
    text = f.readlines() 

#Combining the test into one long string
ARTICLE = ' '.join(text)

In [6]:
#This is to make text readable after splitting
ARTICLE = ARTICLE.replace('.', '.<eos>')
ARTICLE = ARTICLE.replace('!', '!<eos>')
ARTICLE = ARTICLE.replace('?', '?<eos>')

#Splitting ARTICLE into individual sentences using <eos>
sentences = ARTICLE.split('<eos>')

In [7]:
#Dividing the text (sentences) into smaller chunks to allow all the text be passed in using small chunks
max_chunk = 300
current_chunk = 0
chunks = []

for sentence in sentences:
    if len(chunks) == current_chunk + 1:
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

0


In [8]:
len(chunks)

49

In [9]:
#Checking words per chunk
chunks[0] 

['See',
 'discussions,',
 'stats,',
 'and',
 'author',
 'profiles',
 'for',
 'this',
 'publication',
 'at:',
 'https://www.',
 'researchgate.',
 'net/publication/350952336\n',
 '\n',
 'Effects',
 'of',
 'Layer-Charge',
 'Distribution',
 'on',
 'Swelling',
 'Behavior',
 'of',
 'Mixed-Layer\n',
 'Illite-Montmorillonite',
 'Clays:',
 'A',
 'Molecular',
 'Dynamics',
 'Simulation',
 'Study\n',
 '\n',
 'Article\xa0\xa0in\xa0\xa0Journal',
 'of',
 'Molecular',
 'Liquids',
 '·',
 'April',
 '2021\n',
 '\n',
 'DOI:',
 '10.',
 '1016/j.',
 'molliq.',
 '2021.',
 '116188\n',
 '\n',
 'CITATIONS\n',
 '0\n',
 '\n',
 '2',
 'authors,',
 'including:\n',
 '\n',
 'Me',
 'Ghasemi\n',
 'Nazarbayev',
 'University\n',
 '\n',
 '6',
 'PUBLICATIONS\xa0\xa0\xa027',
 'CITATIONS\xa0\xa0\xa0\n',
 '\n',
 'SEE',
 'PROFILE\n',
 '\n',
 'READS\n',
 '35\n',
 '\n',
 'Some',
 'of',
 'the',
 'authors',
 'of',
 'this',
 'publication',
 'are',
 'also',
 'working',
 'on',
 'these',
 'related',
 'projects:\n',
 '\n',
 'Molecular',


In [10]:
#Taking the most relevant portion of the file
to_summarize = chunks[0:15]

In [11]:
len(to_summarize)

15

In [12]:
#Joining individual words together again to form sentences
for chunk_id in range(len(to_summarize)):
    to_summarize[chunk_id] = ' '.join(to_summarize[chunk_id])

In [13]:
to_summarize[0]

'See discussions, stats, and author profiles for this publication at: https://www. researchgate. net/publication/350952336\n \n Effects of Layer-Charge Distribution on Swelling Behavior of Mixed-Layer\n Illite-Montmorillonite Clays: A Molecular Dynamics Simulation Study\n \n Article\xa0\xa0in\xa0\xa0Journal of Molecular Liquids · April 2021\n \n DOI: 10. 1016/j. molliq. 2021. 116188\n \n CITATIONS\n 0\n \n 2 authors, including:\n \n Me Ghasemi\n Nazarbayev University\n \n 6 PUBLICATIONS\xa0\xa0\xa027 CITATIONS\xa0\xa0\xa0\n \n SEE PROFILE\n \n READS\n 35\n \n Some of the authors of this publication are also working on these related projects:\n \n Molecular Dynamics Simulation of Clay Swelling View project\n \n All content following this page was uploaded by Me Ghasemi on 29 April 2021. \n \n The user has requested enhancement of the downloaded file. \n \n \x0cJournal of Molecular Liquids 335 (2021) 116188\n \n Contents lists available at ScienceDirect\n \n Journal of Molecular Liquids\

In [15]:
#Checking the number of words per chunk
len(to_summarize[14].split(' '))

295

In [16]:
#Passing the chunks to be summarized
result = summarizer(to_summarize, max_length=150, min_length=30, do_sample=False)

In [17]:
result 

[{'summary_text': ' Publication: Journal of Molecular Liquids 335 (2021) 116188 . Effects of layer-charge distribution on swelling behavior of mixed-layer illite-montmorillonite clays: A molecular dynamics simulation study .'},
 {'summary_text': ' About 30 percent of all clays are a pure type, and virtually 70 percent are the mixed-layer one that swelling mechanism is still elusive . Almost all previous experimental and simulation studies focused on the understanding of the swelling behavior of pure clays .'},
 {'summary_text': ' Clays are characterized by a large surface area, strong adsorption capacity, and high swelling capacity . The swelling behavior of clays is of importance in the oil and gas industry, as clays in various types are found in geological formation as well as drilling ﬂuids . Water interactions as a base of major water interactions with clayey formations can cause swelling, which can cause wellbore instability .'},
 {'summary_text': ' Osmotic swelling occurs due to 

In [18]:
#Extracting the summary text and joining all chunk summaries in res together.
text = ' '.join([summ['summary_text'] for summ in result])

In [19]:
#Saving summary into a txt file
with open ('summary5.txt', 'w') as f:
    f.write(text)

In [20]:
#Displaying final summary
print(text)

 Publication: Journal of Molecular Liquids 335 (2021) 116188 . Effects of layer-charge distribution on swelling behavior of mixed-layer illite-montmorillonite clays: A molecular dynamics simulation study .  About 30 percent of all clays are a pure type, and virtually 70 percent are the mixed-layer one that swelling mechanism is still elusive . Almost all previous experimental and simulation studies focused on the understanding of the swelling behavior of pure clays .  Clays are characterized by a large surface area, strong adsorption capacity, and high swelling capacity . The swelling behavior of clays is of importance in the oil and gas industry, as clays in various types are found in geological formation as well as drilling ﬂuids . Water interactions as a base of major water interactions with clayey formations can cause swelling, which can cause wellbore instability .  Osmotic swelling occurs due to an increase in interlayer water-consuming intake from an aqueous solution that culmin