In [1]:
# Imports
import pandas as pd
import numpy as np
import pickle
from stqdm import stqdm

# PDF
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams

import pdfplumber
import re
import io

# Summarization using extractive bert
from summarizer import Summarizer, sentence_handler
#import tensorflow_hub as hub

# BERT based models for document search
from sentence_transformers import SentenceTransformer

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
def load_models():
    """Instantiate the two pretrained models used by this notebook.

    Returns:
        tuple: ``(qa, summ)`` where ``qa`` is the
        ``sentence-transformers/multi-qa-distilbert-dot-v1`` SentenceTransformer
        and ``summ`` is a ``distilbert-base-uncased`` Summarizer constructed
        with ``hidden=[-1, -2]`` and ``hidden_concat=True``.
    """
    encoder = SentenceTransformer('sentence-transformers/multi-qa-distilbert-dot-v1')
    summarizer = Summarizer(
        'distilbert-base-uncased',
        hidden=[-1, -2],
        hidden_concat=True,
    )
    return encoder, summarizer


def load_pdf(file, n=0) -> str:
    """Extract the text of a PDF with pdfminer, skipping the first ``n`` pages.

    Args:
        file: Either a filesystem path (``str``) or an already-open binary
            file-like object. A path is opened and closed by this function;
            a file-like object is left open for the caller to manage.
        n: Number of leading pages to skip. A page is processed only when its
            1-based position is greater than ``n``.

    Returns:
        The text written by the ``TextConverter`` for all processed pages,
        concatenated into one string.
    """
    opened_here = isinstance(file, str)
    fp = open(file, 'rb') if opened_here else file

    try:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Process each page contained in the document.
            for i, page in enumerate(PDFPage.get_pages(fp)):
                if i + 1 > n:
                    interpreter.process_page(page)

            # Read the buffer before the cleanup below closes it.
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()
    finally:
        # Only close handles this function created; never the caller's.
        if opened_here:
            fp.close()

def get_articles(text: str, min_words: int = 20) -> pd.DataFrame:
    """Split extracted PDF text into de-duplicated paragraphs tagged by page.

    Pages are delimited by form-feed characters and paragraphs by blank lines
    (two consecutive newlines). Each paragraph is stripped and every run of
    whitespace is collapsed to a single space. Paragraphs with ``min_words``
    words or fewer are discarded, and repeated paragraphs keep only their
    first occurrence.

    Args:
        text: Full document text, with pages separated by form feeds.
        min_words: A paragraph is kept only if it has more than this many
            space-separated words.

    Returns:
        DataFrame with columns ``page`` (1-based page number) and ``text``.
        The index is the paragraph's position before filtering, so it may
        contain gaps.
    """
    pages = text.split('\x0c')
    df = pd.DataFrame(list(enumerate(pages, 1)), columns=['page', 'text'])

    df = (
        df.assign(text=df['text'].str.split('\n\n'))
        .explode('text')
        .reset_index(drop=True)
    )

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal by default, which would leave whitespace runs untouched.
    df['text'] = df['text'].str.strip().str.replace(r'\s+', ' ', regex=True)

    # Empty paragraphs count as one "word", so the threshold also drops them.
    word_counts = df['text'].str.split(' ').str.len()
    return df.loc[word_counts > min_words].drop_duplicates(subset='text')

In [68]:
# Load the sentence-embedding model (qa) and the extractive summarizer (summ).
qa, summ = load_models()

2022-01-25 14:20:05.250 INFO    sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: sentence-transformers/multi-qa-distilbert-dot-v1
2022-01-25 14:20:05.269 INFO    sentence_transformers.SentenceTransformer: Did not find folder sentence-transformers/multi-qa-distilbert-dot-v1
2022-01-25 14:20:05.270 INFO    sentence_transformers.SentenceTransformer: Try to download model from server: https://sbert.net/models/sentence-transformers/multi-qa-distilbert-dot-v1.zip
2022-01-25 14:20:05.276 INFO    sentence_transformers.SentenceTransformer: Downloading sentence transformer model from https://sbert.net/models/sentence-transformers/multi-qa-distilbert-dot-v1.zip and saving it at C:\Users\adshafi/.cache\torch\sentence_transformers\sbert.net_models_sentence-transformers_multi-qa-distilbert-dot-v1
Exception when trying to download https://sbert.net/models/sentence-transformers/multi-qa-distilbert-dot-v1.zip. Response 404
2022-01-25 14:20:06.842 INFO    filelock: Lock 15

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=523.0), HTML(value='')))

2022-01-25 14:20:07.322 INFO    filelock: Lock 1589270693488 released on C:\Users\adshafi/.cache\huggingface\transformers\c8c7397094cde09588239e44541b09710ef9a899601c456f6fb8e2a44be94850.0c4b40a43fbbe2d04ddc2ff312b2cd13c8056078f520bb379117b25432d7c5fe.lock





2022-01-25 14:20:07.756 INFO    filelock: Lock 1589315941568 acquired on C:\Users\adshafi/.cache\huggingface\transformers\6e9e53c09b95f0ae1d36e8b36cf61de82d2294cd4c3912c6d25fafd05aff6e28.c1128180e82193c9bb4a2c1992d7107968aa54104c03a5d53ed92eb334203090.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=265482105.0), HTML(value='')))

2022-01-25 14:21:00.781 INFO    filelock: Lock 1589315941568 released on C:\Users\adshafi/.cache\huggingface\transformers\6e9e53c09b95f0ae1d36e8b36cf61de82d2294cd4c3912c6d25fafd05aff6e28.c1128180e82193c9bb4a2c1992d7107968aa54104c03a5d53ed92eb334203090.lock





2022-01-25 14:21:03.172 INFO    filelock: Lock 1589315253488 acquired on C:\Users\adshafi/.cache\huggingface\transformers\399726f9a9f5b3f46d502832a2c5ea35ce4236c1e9256c8d783eafcc2cbf8822.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))

2022-01-25 14:21:03.898 INFO    filelock: Lock 1589315253488 released on C:\Users\adshafi/.cache\huggingface\transformers\399726f9a9f5b3f46d502832a2c5ea35ce4236c1e9256c8d783eafcc2cbf8822.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock





2022-01-25 14:21:04.290 INFO    filelock: Lock 1589314489408 acquired on C:\Users\adshafi/.cache\huggingface\transformers\6e404d00243d2ea290bba9032b1d720c3f00e16fe82da5b0c91b28bcbaccd037.5b378830d936443f37ad9b396cd27fe2d43c0f99fa931e6d51e751f32b1df141.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=466247.0), HTML(value='')))

2022-01-25 14:21:07.178 INFO    filelock: Lock 1589314489408 released on C:\Users\adshafi/.cache\huggingface\transformers\6e404d00243d2ea290bba9032b1d720c3f00e16fe82da5b0c91b28bcbaccd037.5b378830d936443f37ad9b396cd27fe2d43c0f99fa931e6d51e751f32b1df141.lock





2022-01-25 14:21:08.028 INFO    filelock: Lock 1589314489408 acquired on C:\Users\adshafi/.cache\huggingface\transformers\e559210d80fa07981602833309f4a0c651d5676d60e911f201e4fec36b49916d.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))

2022-01-25 14:21:08.485 INFO    filelock: Lock 1589314489408 released on C:\Users\adshafi/.cache\huggingface\transformers\e559210d80fa07981602833309f4a0c651d5676d60e911f201e4fec36b49916d.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock





2022-01-25 14:21:08.874 INFO    filelock: Lock 1589315252624 acquired on C:\Users\adshafi/.cache\huggingface\transformers\c71dc1067871f671af8994af9e1730eea4ef810ba8ef7cf236152e984af745e3.42154c5fd30bfa7e34941d0d8ad26f8a3936990926fbe06b2da76dd749b1c6d4.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=333.0), HTML(value='')))

2022-01-25 14:21:09.345 INFO    filelock: Lock 1589315252624 released on C:\Users\adshafi/.cache\huggingface\transformers\c71dc1067871f671af8994af9e1730eea4ef810ba8ef7cf236152e984af745e3.42154c5fd30bfa7e34941d0d8ad26f8a3936990926fbe06b2da76dd749b1c6d4.lock
2022-01-25 14:21:09.433 INFO    sentence_transformers.SentenceTransformer: Use pytorch device: cpu





2022-01-25 14:21:09.821 INFO    filelock: Lock 1589316090416 acquired on C:\Users\adshafi/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=483.0), HTML(value='')))

2022-01-25 14:21:10.266 INFO    filelock: Lock 1589316090416 released on C:\Users\adshafi/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333.lock





2022-01-25 14:21:10.640 INFO    filelock: Lock 1589315096384 acquired on C:\Users\adshafi/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=267967963.0), HTML(value='')))

2022-01-25 14:22:02.672 INFO    filelock: Lock 1589315096384 released on C:\Users\adshafi/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a.lock





In [128]:
# Extract the document text with pdfplumber, skipping the first 3 pages.
with pdfplumber.open('../pip_guide_2.pdf') as pdf:
    # extract_text() may give None for a page with no extractable text
    # (version-dependent); fall back to '' so join() cannot raise TypeError.
    pdf_data = ' '.join(page.extract_text() or '' for page in pdf.pages[3:])

In [129]:
# Extract the same document with pdfminer via load_pdf, skipping the first 3 pages.
text = load_pdf('../pip_guide_2.pdf', 3)

In [127]:
def pdf_data_df(text, end_marker='2.3'):
    """Split document text into one row per three-part chapter number.

    Every occurrence of a ``N.N.N`` number is treated as the start of a
    segment that runs up to the next such number. Segments are sliced by
    match position rather than by splitting on the number's text, so a number
    that appears more than once (for example as a cross-reference) no longer
    causes a later occurrence to reuse the text of the first one.

    Args:
        text: Full document text.
        end_marker: The final segment is cut at the first occurrence of this
            string, if present. Pass ``None`` or ``''`` to keep the whole
            remainder of the text.

    Returns:
        DataFrame with columns ``chapter`` (the matched number) and ``text``
        (the segment with newlines replaced by spaces, form feeds removed and
        surrounding whitespace stripped). Empty, with the same columns, when
        no chapter number is found.
    """
    heading_re = re.compile(r'[\d]+[\.][\d]+[\.][\d]+')
    matches = list(heading_re.finditer(text))
    if not matches:
        return pd.DataFrame(columns=['chapter', 'text'])

    # Each segment starts right after its own number and stops where the next
    # number begins; the last one runs to the end of the text.
    starts = [m.end() for m in matches]
    stops = [m.start() for m in matches[1:]] + [len(text)]
    bodies = [text[begin:end] for begin, end in zip(starts, stops)]

    if end_marker:
        bodies[-1] = bodies[-1].split(end_marker)[0]

    frame = pd.DataFrame({
        'chapter': [m.group() for m in matches],
        'text': bodies,
    })
    frame['text'] = (
        frame['text']
        .str.replace('\n', ' ', regex=False)
        .str.replace('\x0c', '', regex=False)
        .str.strip()
    )
    return frame

In [136]:
# Build per-chapter frames from both extractions:
# `test` from the pdfminer text, `a` from the pdfplumber text.
test = pdf_data_df(text)
a = pdf_data_df(pdf_data)

In [133]:
# Look up the row(s) for chapter 2.2.8 in `test`.
test[test['chapter']=='2.2.8']

Unnamed: 0,chapter,text
38,2.2.8,


In [135]:
# NOTE(review): `test2` is not assigned in any visible cell — presumably left
# over from a deleted cell; on a fresh kernel this would raise NameError.
# Confirm which frame was intended here.
test2[test2['chapter']=='2.2.8']

Unnamed: 0,chapter,text
38,2.2.8,"As made clear in legislation, harm is in relat..."


In [137]:
# Embed all paragraphs in one batched encode() call instead of one call per
# row: this lets the model batch its work and shows a single progress bar.
# The result keeps a's index with one integer-labelled column per dimension.
c = pd.DataFrame(qa.encode(a['text'].tolist()), index=a.index)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [138]:
def ask(q: str, X: pd.DataFrame, s: pd.DataFrame, n: int, model) -> pd.DataFrame:
    """Return the ``n`` rows of ``s`` whose embeddings best match a query.

    Rows are ranked by the signed dot product between their embedding and the
    query embedding, highest first. The absolute value is deliberately not
    taken: a strongly negative dot product means the row is dissimilar and
    must not outrank a mildly positive one.

    Args:
        q: Query text.
        X: One embedding per row; its index labels must exist in ``s``.
        s: Frame to select the answers from.
        n: Number of rows to return.
        model: Object whose ``encode`` accepts a list of strings and returns
            one embedding per string.

    Returns:
        The top ``n`` rows of ``s``, ordered from best to worst score.
    """
    embedding = np.array(model.encode([q])[0])

    # One matrix-vector product instead of a Python-level call per row.
    scores = X.dot(embedding)
    ranked = scores.sort_values(ascending=False, kind='stable')

    return s.loc[ranked.index].head(n)

In [139]:
# Fetch the top 3 rows of `a` as ranked by ask() for this query, using the embeddings in `c`.
ans = ask('preparing food activity', X=c, s=a, n=3, model=qa)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [140]:
# Display the retrieved rows.
ans

Unnamed: 0,chapter,text
59,2.2.29,"In the Preparing Food activity, the HP should ..."
58,2.2.28,Ms Z can prepare and cook a simple meal. Howev...
46,2.2.16,Where the act of completing the activity means...


In [141]:
# Persist the chapter frame and its embedding matrix. Both are written
# without the index, so rows in the two files correspond by position.
a.to_csv('paragraphs.csv', index=False)
c.to_csv('paragraphs_embedded.csv', index=False)