In [1]:
#Imports

#env + system imports
import os
from dotenv import load_dotenv
#pincone
import pinecone  
from haystack.document_stores import PineconeDocumentStore
from haystack.nodes import PDFToTextConverter, PreProcessor, EmbeddingRetriever



  from tqdm.autonotebook import tqdm





In [2]:
#Load environment variables from .env file
        # (overide = true) just forces a reload on the .env file in case api key changes
# dotenv_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..','..','..', 'client', '.env')
load_dotenv(override=True)
# Access the API key
pinecone_api_key = os.getenv("PINECONE_API_KEY")
huggingface_api_token = os.getenv("HUGGING_FACE_API_TOKEN")   


In [3]:
# Initialize the pinecone index
index_name='haystack'
pinecone.init(      
api_key=pinecone_api_key,      
environment='gcp-starter'      
)      
index = pinecone.Index(index_name=index_name)

# Initialize the haystack document store object
document_store = PineconeDocumentStore(
api_key=pinecone_api_key,
pinecone_index=index,
similarity="cosine",
embedding_dim=768
)

In [13]:
path=".\Textbooks\Bellack A.S., Hersen M. (eds.) - Comprehensive Clinical Psychology. Volume 9-Elsevier (2000).pdf"

In [27]:
#document preprocessing for summarization
#Split by passage instead, split length referring to num of passages in vector
#Won't work on books with very long chapters, but
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"], keep_physical_layout=False, multiprocessing=True)
doc_pdf = converter.convert(file_path=path, meta=None)[0]
preprocessor = PreProcessor(
clean_empty_lines=True,
clean_whitespace=True,
clean_header_footer=True,
split_by="token",
split_length=900,
split_respect_sentence_boundary=True,    #prevents sentences from being cut off
add_page_number=True,
)
docs = preprocessor.process([doc_pdf])


Preprocessing: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.30s/docs]


In [28]:
print(len(docs))

681


In [None]:
#document preprocessing for QA
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"], keep_physical_layout=False, multiprocessing=True)
doc_pdf = converter.convert(file_path=path, meta=None)[0]
preprocessor = PreProcessor(
clean_empty_lines=True,
clean_whitespace=True,
clean_header_footer=True,
split_by="word",
split_length=500,
split_respect_sentence_boundary=True,    #prevents sentences from being cut off
)
docs = preprocessor.process([doc_pdf])



In [12]:
print(docs[0].content)

CRACKING
the ·
CODING INTERVIEW
189 PROGRAMMING Q!JESTIONS & SOLUTIONSCRACKING
the
CODING INTERVIEWALso BY GAYLE LAAKMANN McDowELL
(RACKING THE PM INTERVIEW
How TO LAND A PRODUCT MANAGER JoB IN TECHNOLOGY
CRACKING THE TECH CAREER
INSIDER ADVICE ON LANDING A JOB AT GOOGLE, MICROSOFT, APPLE, OR ANY TOP TECH COMPANYCRACKING
the
CODING INTERVIEW
189 Programming Questions and Solutions
GAYLE LAAKMANN MCDOWELL
Founder and CEO, CareerCup.com
CareerCup, LLC
Palo Alto, CACRACKING THE CODING INTERVIEW, SIXTH EDITION
Copyright © 2015 by CareerCup.
All rights reserved. No part of this book may be reproduced in any form by any electronic or me­
chanical means, including information storage and retrieval systems, without permission in writing
from the author or publisher, except by a reviewer who may quote brief passages in a review.
Published by CareerCup, LLC, Palo Alto, CA. Compiled Feb 10, 2016.
For more information, contact support@careercup.com.For Davis and Tobin,
and all the things th

In [29]:
num_docs = len(docs)
for i in range(num_docs):
    strip_text = docs[i].content.replace('\n', ' ')
    docs[i].content = strip_text
    


In [36]:
print(docs[1].meta)

{'_split_id': 1, 'page': 2}


In [37]:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

In [40]:
begin_page = 1
end_page = 4

In [60]:
begin = False
end = False
relevant_docs = []

for i in range(num_docs):
    meta = docs[i].meta
    curr_page = meta['page']
    if curr_page >= end_page and not end:
        if begin:  
            end = True
            begin=False
        else: 
            begin = True
            i = i - 1 

    if curr_page == begin_page:
        begin = True

    if begin:
        relevant_docs.append(docs[i].content) 

    if end:
        relevant_docs.append(docs[i].content)

    if end:
        break


In [61]:
relevant_docs_length = len(relevant_docs)
relevant_docs_length * 900 * .3

810.0

In [80]:
relevant_docs_length = len(relevant_docs)
summ_docs = []
for i in range(relevant_docs_length):
    summ_text = summarizer(relevant_docs[i], max_length=150, min_length=40, do_sample=False)
    summ_docs.append(summ_text)

total_text = ""
print(type(summ_docs[0]))
for doc in summ_docs:
    total_text += doc[0].get("summary_text")
summ_text = summarizer(total_text, max_length=200, min_length=100, do_sample=False)

<class 'list'>


In [81]:
print(summ_text[0].get("summary_text"))

This volume reflects the breadth of research and clinical knowledge that psychologists have contributed to over the past several decades. Like psychiatry, Western psychology appears to be bound by its commitment to a disease conception of mental disorders. This has resulted in the unfortunate situation where many clinicians believe that mental disorders are biologically caused. There is a growing literature showing that the experience of being mentally ill is an interpretive enterprise, constructed by individuals. Volume 9: Applications in Diverse Populations Preface Contributors Section I: Individuals with Disabling Conditions, , 9.01  Mental Retardation,  Pages 1-17, Sarah W. Bisconer Janene N. Suttie. Section II: Families, , No title 9.11  Families of Individuals with HIV Infection/AIDS, Pages 169-182, Marc Tass&#xe9.


In [25]:
print(summarizer(docs[40].content, max_length=130, min_length=30, do_sample=False))

[{'summary_text': 'Obsessive thoughts and compulsive behaviors in adults with autism and OCD have been found to differ in some respects from those seen in OCD patients who are not autistic. Hyperactivity is a commonly reported beha- vioral feature of children with PDD. Stereotypy, self-injury, and pica are common associated behavior problems.'}]


In [26]:
print(docs[40].content)

Many younger, lower-functioning children with Phenomenologyautism display repetitive motor movements such as: rocking; toe-walking; arm, hand, or finger flapping; and whirling (Volkmar, Cohen, & Paul, 1986). These behaviors have a compul- sive quality and may be associated with anxiety but their relationship to obsessive-compulsive disorder (OCD), if any, is unclear. Obsessional thinking is also common in persons in the PDD spectrum. Such thinking is most common, or at least most readily de- tected, in highly verbal individuals. The content of obsessional thinking in PDD is likely to be idiosyncratic, though it may be similar in other respects to that described for OCD. Obsessive thoughts and compulsive behaviors in adults with autism and OCD have been found to differ in some respects from those seen in OCD patients who are not autistic; individuals with autism and OCD were more likely to display hoarding, touching, tapping, rubbing, and self- damaging behavior and were less likely to