Author: Dianhao Zhou

Date of Creation: Dec. 18th, 2023

Last Update: Dec. 18th, 2023

Here we do the data preprocessing: we take in a .zip file that contains all the texts as PDFs and output preprocessed data that is ready to work with.

In [None]:
!pip install PyPDF2

In [None]:
#import packages
import zipfile
import os
from nltk.tokenize import sent_tokenize
import PyPDF2
import os
import nltk
from nltk.tokenize import word_tokenize
import re
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt')

In [1]:
#data Preprocessing

#unzip
def unzip_file(zip_path, extract_folder):
    """Extract every member of a zip archive into ``extract_folder``.

    Args:
        zip_path: Path to the archive, given with or without the ``.zip``
            suffix.
        extract_folder: Directory the archive members are extracted into.

    Raises:
        FileNotFoundError: If the resolved archive path does not exist.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # Append the extension only when it is missing. Appending it
    # unconditionally turned an already-suffixed name such as "All.zip"
    # into "All.zip.zip", which does not exist.
    if zip_path.lower().endswith('.zip'):
        archive_path = zip_path
    else:
        archive_path = zip_path + '.zip'

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

#remove editorial formats
def clean_text(text):
    """Strip editorial markup from ``text`` and drop question sentences.

    The patterns below are removed in order; the remaining text is then split
    into sentences with NLTK's ``sent_tokenize`` and every sentence whose
    stripped form ends with ``?`` is discarded.

    Args:
        text: Raw text to clean.

    Returns:
        The kept sentences joined with single spaces.
    """
    editorial_patterns = (
        r'\[.*?\]',               # text in square brackets
        r'·.*?·',                 # text enclosed in small dots
        r'•',                     # bullets
        r'\.\s*\.\s*\.\s*\.',     # four dots, optionally spaced
    )
    for pattern in editorial_patterns:
        text = re.sub(pattern, '', text)

    kept_sentences = []
    for sentence in sent_tokenize(text):
        if sentence.strip().endswith('?'):
            continue
        kept_sentences.append(sentence)
    return ' '.join(kept_sentences)

#pdf to text
def extract_text_from_pdf(pdf_path):
    """Read a PDF and return the cleaned text of all its pages.

    Each page's text is extracted with PyPDF2 and passed through
    ``clean_text``; pages that yield no text are skipped.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The cleaned page texts joined with single spaces.
    """
    cleaned_pages = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:  # for each page
            page_text = page.extract_text()
            if not page_text:
                continue
            cleaned_text = clean_text(page_text)
            if cleaned_text:
                cleaned_pages.append(cleaned_text)
    # Join with a space: clean_text returns text without trailing whitespace,
    # so plain concatenation fused the last word of one page with the first
    # word of the next and hid the sentence boundary from the tokenizer.
    return ' '.join(cleaned_pages)

#remove repeated clauses and tokenize into sentences(also can do maxlength here)
def preprocess_text(text, mode, max_length=30):
    """Remove repeated header/footer clauses and segment the text.

    Args:
        text: Cleaned document text.
        mode: ``'sen'`` to split into sentences with NLTK's ``sent_tokenize``;
            ``'max'`` to split into chunks of at most ``max_length`` word
            tokens (from NLTK's ``word_tokenize``) joined with spaces.
        max_length: Maximum number of tokens per chunk in ``'max'`` mode.

    Returns:
        A ``(segments, text)`` tuple: the list of segments and the full text
        after clause removal.

    Raises:
        ValueError: If ``mode`` is not ``'sen'`` or ``'max'``, or if
            ``max_length`` is smaller than 1 in ``'max'`` mode.
    """
    # Fail loudly on an unknown mode instead of implicitly returning None,
    # which only surfaced later as an unpacking error in the caller.
    if mode not in ('sen', 'max'):
        raise ValueError(f"mode must be 'sen' or 'max', got {mode!r}")
    if mode == 'max' and max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length!r}")

    #need to remove repeated clauses
    repeated_clauses = ['ESSAYS ON SUICIDE AND THE IMMORTALITY OF THE SOUL',
                        'ESSAY II. ON THE IMMORTALITY OF THE SOUL.',
                        '"Enquiry Concerning Human Understanding"',
                        'David Hume',
                        'Online Library of Liberty: Essays Moral, Political, Literary (LF ed.)',
                        'PLL v6.0 (generated September, 2011)',
                        'http://oll.libertyfund.org/title/704',
                        'Dialogues concerning Natural Religion',
                        'Pamphilus to Hermippus']
    #removal happens here
    # Literal replacement, not re.sub: used as regex patterns, the clauses
    # containing "(...)" were parsed as groups and never matched their literal
    # text, and "." matched any character.
    for clause in repeated_clauses:
        text = text.replace(clause, '')

    #we have two modes, here is tokenizing into sentences
    if mode == 'sen':
        return sent_tokenize(text), text

    #here is another, tokenizing according to maximum length (30 by default)
    # Chunks are measured in tokens. The earlier check added the token count
    # of the chunk to the character length of the next token, mixing units,
    # and could emit an empty chunk before a token longer than the limit.
    word_tokens = word_tokenize(text)
    segmented_texts = [
        " ".join(word_tokens[start:start + max_length])
        for start in range(0, len(word_tokens), max_length)
    ]
    return segmented_texts, text
#read into a pdf dict for further embedding and a text dict to find relevant text
def read_pdfs_into_dict(folder_path,mode):
    """Extract and preprocess every ``.pdf`` file directly inside a folder.

    Args:
        folder_path: Directory to scan; only names ending in ``.pdf`` are read.
        mode: Segmentation mode forwarded to ``preprocess_text``.

    Returns:
        A ``(pdf_dict, text_dict)`` tuple, both keyed by file name:
        ``pdf_dict`` holds the tokenized segments used for further training,
        ``text_dict`` holds the full preprocessed text used to find the
        relevant document for the BERT 'prompt engineer' step.
    """
    segments_by_file = {}
    full_text_by_file = {}
    pdf_names = [name for name in os.listdir(folder_path) if name.endswith(".pdf")]
    for pdf_name in pdf_names:
        #first extract (cleaning is part of the extraction)
        raw_text = extract_text_from_pdf(os.path.join(folder_path, pdf_name))
        #then preprocess into segments plus the full text
        segments, full_text = preprocess_text(raw_text, mode)
        segments_by_file[pdf_name] = segments
        full_text_by_file[pdf_name] = full_text
    return segments_by_file, full_text_by_file

def preprocess(zip_path, folder,mode):#everything in one
    """Unzip an archive of PDFs and preprocess every PDF found in it.

    Args:
        zip_path: Path of the archive, with or without the ``.zip`` suffix.
        folder: Directory the archive is extracted into (created if missing).
        mode: Segmentation mode forwarded to ``read_pdfs_into_dict``.

    Returns:
        The ``(pdf_text_dict, text_dict)`` tuple produced by
        ``read_pdfs_into_dict``.
    """
    # Work with the extension-less name so ".zip" is added exactly once:
    # unzip_file adds the suffix itself when it is missing.
    if zip_path.endswith('.zip'):
        archive_base = zip_path[:-len('.zip')]
    else:
        archive_base = zip_path

    os.makedirs(folder, exist_ok=True)

    unzip_file(archive_base, folder)

    # Look for the PDFs where they were extracted, i.e. under `folder`. The
    # path used to be built as zip_path + '/' + folder, which only matched the
    # extraction location when both names were identical.
    # NOTE(review): assumes the archive either wraps its PDFs in a top-level
    # directory named after the archive or stores them at its root - confirm
    # for archives with a different layout.
    nested_path = os.path.join(folder, os.path.basename(archive_base))
    folder_path = nested_path if os.path.isdir(nested_path) else folder

    pdf_text_dict, text_dict = read_pdfs_into_dict(folder_path,mode)
    return pdf_text_dict, text_dict

In [None]:
# Data Preprocess for GPT2 fine tuning
def for_gpt2(zip_path, folder, mode='sen'):#to parse in sentences by default
    """Preprocess the archive and dump all segments to ``Sentences.txt``.

    Args:
        zip_path: Archive path forwarded to ``preprocess``.
        folder: Extraction directory forwarded to ``preprocess``.
        mode: Segmentation mode forwarded to ``preprocess``.

    Returns:
        A flat list of every segment, in the order written to the file.
    """
    pdf_text_dict, _ = preprocess(zip_path, folder, mode)

    sentences = []
    #now writing into a .txt file for GPT fine tuning
    # The context manager closes the file even if a write fails; utf-8 is set
    # explicitly because the texts may contain non-ASCII characters.
    with open('Sentences.txt', 'w', encoding='utf-8') as text_data:
        for filename, segmented_text in pdf_text_dict.items():
            print(f"File: {filename}")
            for segment in segmented_text:
                # One segment per line: writing segments back to back glued
                # the end of each one to the start of the next.
                text_data.write(segment + '\n')
                sentences.append(segment)

    return sentences

In [None]:
#unzip and preprocess the All.zip example archive; remember to load the zip before running
#archive name is given without the extension here; ".zip" is appended downstream
zip_path = "All"
#directory the archive is extracted into
folder = "All"
#sentence-level segments (default mode); this call also writes Sentences.txt
sentences = for_gpt2(zip_path, folder)

In [None]:
# special preprocess for BERT 'prompt engineer'

#find the most relevant text for a topic word
def find_relevant_document(topic_word, documents):
    """Return the name of the document that mentions ``topic_word`` most.

    Relevance is the number of non-overlapping, case-sensitive occurrences of
    ``topic_word`` in each document's text. On a tie the first document in
    the mapping's iteration order wins.

    Args:
        topic_word: Substring to count.
        documents: Mapping of document name to document text.

    Returns:
        The key of the most relevant document.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    def occurrences(doc_name):
        return documents[doc_name].count(topic_word)

    return max(documents, key=occurrences)

def get_relevant_document(topic_word, text_dict, pdf_text_dict):
    """Return the segments of the document most relevant to ``topic_word``.

    Args:
        topic_word: Word used to rank the documents.
        text_dict: Mapping of document name to full text, used for ranking.
        pdf_text_dict: Mapping of document name to its segments.

    Returns:
        A new list holding the segments of the best-matching document, for
        use in prompt engineering.
    """
    #pick the best-matching document by topic-word count
    best_match = find_relevant_document(topic_word, text_dict)

    #hand back a fresh list of that document's segments
    return list(pdf_text_dict[best_match])

In [None]:
#unzip and preprocess the All.zip file example, remember to load the zip before running

#define variables
#archive name WITHOUT the extension: ".zip" is appended downstream, so
#passing "All.zip" made the code look for a file that does not exist
zip_path = "All"
folder = "All"
mode = 'max'
#preprocess
pdf_text_dict, text_dict = preprocess(zip_path, folder, mode)

#define variables
topic_word = "passion"
#user_input and n are not used in this cell; presumably later cells read them
user_input = "what is passion?"
n = 5
#relevant sentences
sentences = get_relevant_document(topic_word, text_dict, pdf_text_dict)

In [None]:
#both at once

#define variables
#archive name WITHOUT the extension: ".zip" is appended downstream, so
#passing "All.zip" made the code look for a file that does not exist
zip_path = "All"
folder = "All"
mode = 'max'
#preprocess
pdf_text_dict, text_dict = preprocess(zip_path, folder, mode)

#write every segment to Sentences.txt for GPT fine tuning
#collected in a fresh list: the earlier code appended to `sentences`, which
#only exists if a previous cell ran and is overwritten below anyway
gpt2_sentences = []
with open('Sentences.txt', 'w', encoding='utf-8') as text_data:
    for filename, segmented_text in pdf_text_dict.items():
        print(f"File: {filename}")
        for segment in segmented_text:
            #one segment per line so neighbouring segments are not glued together
            text_data.write(segment + '\n')
            gpt2_sentences.append(segment)

#define variables
topic_word = "passion"
#user_input and n are not used in this cell; presumably later cells read them
user_input = "what is passion?"
n = 5
#relevant sentences
sentences = get_relevant_document(topic_word, text_dict, pdf_text_dict)