Author: Dianhao Zhou

Date of Creation: Dec. 18th, 2023

Last Update: Dec. 18th, 2023

# Data Preprocessing (required; run this section before calling BERT)



In [None]:
!pip install PyPDF2

In [None]:
import zipfile
import os
from nltk.tokenize import sent_tokenize
import PyPDF2
import os
import nltk
from nltk.tokenize import word_tokenize
import re
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt')

In [None]:
#data Preprocessing

#unzip
def unzip_file(zip_path, extract_folder):
    """Extract every member of the zip archive into ``extract_folder``.

    Args:
        zip_path: Path of the ``.zip`` archive to read.
        extract_folder: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_path, 'r')
    # The context manager closes the archive even if extraction fails.
    with archive:
        archive.extractall(extract_folder)

#remove editorial formats
def clean_text(text):
    """Strip editorial markup from ``text`` and drop sentences ending in ``?``.

    Args:
        text: Raw text, e.g. the extracted text of one PDF page.

    Returns:
        The remaining sentences joined with single spaces.
    """
    # Applied in this order: bracketed spans, spans enclosed by middle dots,
    # bullet characters, and runs of four dots (optionally spaced apart).
    editorial_patterns = (
        r'\[.*?\]',
        r'·.*?·',
        r'•',
        r'\.\s*\.\s*\.\s*\.',
    )
    for pattern in editorial_patterns:
        text = re.sub(pattern, '', text)

    kept = []
    for sentence in sent_tokenize(text):
        # Questions are discarded; only non-interrogative sentences are kept.
        if sentence.strip().endswith('?'):
            continue
        kept.append(sentence)
    return ' '.join(kept)

#pdf to text
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page of a PDF as one string.

    Each page is extracted with PyPDF2 and passed through ``clean_text``.
    Pages that yield no text are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The cleaned page texts joined with a single space, or an empty
        string when no page produced any text.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            cleaned_text = clean_text(page_text)
            if cleaned_text:
                page_texts.append(cleaned_text)
    # Join with a space: concatenating pages back-to-back fused the last word
    # of one page with the first word of the next, which also hides the
    # sentence boundary from the tokenizer used downstream.
    return ' '.join(page_texts)

#remove repeated clauses, then tokenize into sentences (mode 'sen') or length-limited segments (mode 'max')
def preprocess_text(text, mode, max_tokens=30):
    """Remove recurring boilerplate from ``text`` and split it into segments.

    Args:
        text: Cleaned document text.
        mode: ``'sen'`` to split into sentences, ``'max'`` to split into
            consecutive chunks of at most ``max_tokens`` word tokens.
        max_tokens: Segment budget for ``'max'`` mode, counted in word
            tokens. Ignored in ``'sen'`` mode.

    Returns:
        A ``(segments, text)`` tuple: the list of segments and the text
        after boilerplate removal.

    Raises:
        ValueError: If ``mode`` is not ``'sen'`` or ``'max'``, or if
            ``max_tokens`` is smaller than 1 in ``'max'`` mode.
    """
    # Validate up front so a typo in `mode` fails loudly instead of returning
    # None and breaking the caller's tuple unpacking.
    if mode not in ('sen', 'max'):
        raise ValueError(f"mode must be 'sen' or 'max', got {mode!r}")
    if mode == 'max' and max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens!r}")

    repeated_clauses = ['ESSAYS ON SUICIDE AND THE IMMORTALITY OF THE SOUL',
                        'ESSAY II. ON THE IMMORTALITY OF THE SOUL.',
                        '"Enquiry Concerning Human Understanding"',
                        'David Hume',
                        'Online Library of Liberty: Essays Moral, Political, Literary (LF ed.)',
                        'PLL v6.0 (generated September, 2011)',
                        'http://oll.libertyfund.org/title/704',
                        'Dialogues concerning Natural Religion',
                        'Pamphilus to Hermippus']
    for clause in repeated_clauses:
        # Literal replacement: used as regex patterns, the clauses containing
        # "(...)" were parsed as groups and never matched their own text.
        text = text.replace(clause, '')

    if mode == 'sen':
        return sent_tokenize(text), text

    # 'max' mode: consecutive chunks of at most `max_tokens` tokens. The
    # budget is counted in tokens only, not token count plus character length.
    word_tokens = word_tokenize(text)
    segmented_texts = [
        " ".join(word_tokens[start:start + max_tokens])
        for start in range(0, len(word_tokens), max_tokens)
    ]
    return segmented_texts, text
#read into a pdf dict for further embedding and a text dict to find relevant text
def read_pdfs_into_dict(folder_path, mode):
    """Extract and preprocess every ``.pdf`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).
        mode: Segmentation mode forwarded to ``preprocess_text``.

    Returns:
        A pair of dicts keyed by filename: the first maps to the list of
        segments, the second to the preprocessed full text.
    """
    segments_by_file = {}
    cleaned_by_file = {}
    for entry in os.listdir(folder_path):
        if not entry.endswith(".pdf"):
            continue
        raw_text = extract_text_from_pdf(os.path.join(folder_path, entry))
        segments_by_file[entry], cleaned_by_file[entry] = preprocess_text(raw_text, mode)
    return segments_by_file, cleaned_by_file

def preprocess(zip_path, folder, mode):
    """Unzip an archive of PDFs and preprocess every PDF it contains.

    Args:
        zip_path: Path of the zip archive.
        folder: Directory the archive is extracted into (created if missing).
        mode: Segmentation mode forwarded to ``read_pdfs_into_dict``.

    Returns:
        The ``(pdf_text_dict, text_dict)`` pair produced by
        ``read_pdfs_into_dict``.
    """
    os.makedirs(folder, exist_ok=True)
    unzip_file(zip_path, folder)

    # The archive is expected to hold a top-level directory named like
    # `folder`. If it does not, read from the extraction root instead of
    # failing on a missing directory.
    nested_path = folder + '/' + folder
    folder_path = nested_path if os.path.isdir(nested_path) else folder
    pdf_text_dict, text_dict = read_pdfs_into_dict(folder_path, mode)
    return pdf_text_dict, text_dict


In [None]:
#find the most relevant text with the topic word
def find_relevant_document(topic_word, documents):
    """Pick the document in which ``topic_word`` occurs most often.

    Occurrences are counted with ``str.count`` (case-sensitive substring
    matches). On a tie the document that comes first in ``documents`` wins.

    Args:
        topic_word: Substring to count.
        documents: Mapping of document name to document text.

    Returns:
        A ``(document_name, occurrence_count)`` tuple.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    occurrences = {}
    for name, body in documents.items():
        occurrences[name] = body.count(topic_word)
    best_name, best_count = max(occurrences.items(), key=lambda pair: pair[1])
    return best_name, best_count

def get_relevant_document(topic_word, text_dict, pdf_text_dict):
    """Return the segments of the document that mentions ``topic_word`` most.

    Args:
        topic_word: Word used to rank the documents.
        text_dict: Mapping of document name to full text, used for ranking.
        pdf_text_dict: Mapping of document name to its list of segments.

    Returns:
        A new list holding the segments of the selected document.
    """
    best_document = find_relevant_document(topic_word, text_dict)[0]
    return list(pdf_text_dict[best_document])

In [None]:
# Run configuration for the preprocessing stage.
zip_path = "All.zip"  # archive that holds the source PDFs
folder = "All"  # directory the archive is extracted into
mode = 'sen'  # 'sen' = sentence segments, 'max' = length-limited word segments
# Unzip the archive, then extract, clean and segment every PDF.
# pdf_text_dict maps each PDF filename to its list of segments;
# text_dict maps each PDF filename to its preprocessed full text.
pdf_text_dict, text_dict = preprocess(zip_path, folder, mode)

In [None]:
# Query configuration.
topic_word = "passion"  # the document where this occurs most often is selected
user_input = "what is passion?"  # question later passed to bert_top_n
n = 10  # number of top-ranked segments requested from bert_top_n
# Segments of the document in which topic_word occurs most often.
sentences = get_relevant_document(topic_word, text_dict, pdf_text_dict)

# BERT 'prompt_engineer'

In [None]:
#finding sentences

def bert_top_n(sentences, user_input, n, model_name='bert-base-uncased',
               output_path='prompts.txt'):
    """Rank ``sentences`` by BERT cosine similarity to ``user_input``.

    Every text is embedded as the mean of BERT's last hidden state over its
    tokens (inputs are truncated to 512 tokens). The ``n`` most similar
    sentences are printed, written to ``output_path`` one per line followed
    by a ``||`` delimiter line, and returned.

    Args:
        sentences: Candidate sentences (or passages) to rank.
        user_input: Query text the candidates are compared against.
        n: Number of top-ranked candidates to keep.
        model_name: Pretrained BERT checkpoint used for tokenizer and model.
        output_path: File the selected sentences are written to.

    Returns:
        The selected sentences, most similar first. Empty when
        ``sentences`` is empty.
    """
    # Load the tokenizer and model of choice.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Pick the device we are running on.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        device = torch.device("cpu")
        print("Using CPU")

    model = model.to(device)

    # Materialize once so any iterable can be both embedded and indexed.
    hume_sentences = list(sentences)

    def embed_text(text):
        """Return the mean-pooled last hidden state of ``text`` on the CPU."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu()

    if hume_sentences:
        user_embedding = embed_text(user_input)
        # Stack the per-sentence embeddings so similarity is one matrix call.
        sentence_matrix = torch.cat(
            [embed_text(sentence) for sentence in hume_sentences], dim=0
        )
        similarities = cosine_similarity(
            user_embedding.numpy(), sentence_matrix.numpy()
        )[0]

        # Top n by similarity; these are passages when the input was
        # produced by max-length preprocessing.
        ranked_indices = sorted(
            range(len(hume_sentences)),
            key=lambda i: similarities[i],
            reverse=True,
        )
        top_n_sentences = [hume_sentences[i] for i in ranked_indices[:n]]
    else:
        top_n_sentences = []

    # Explicit UTF-8: the source texts may contain characters that the
    # platform default encoding cannot represent.
    with open(output_path, "w", encoding="utf-8") as f:
        for sentence in top_n_sentences:
            print(sentence)
            f.write(sentence + "\n")
        f.write("||\n")  # Delimiter for read_prompt function

    return top_n_sentences

In [None]:
# Rank the segments of the selected document against the question.
# Requires `sentences`, `user_input` and `n` from the preprocessing cells
# (the original note refers to DataPreprocessing.ipynb).
# The call also prints the selected segments and writes them to prompts.txt.
result = bert_top_n(sentences,user_input,n)