In [None]:
#!pip install PyPDF2
#!pip install nltk

In [2]:
#unzip
import zipfile
import os

def unzip_file(zip_path, extract_folder):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

zip_file_path = 'All.zip'
extract_folder_path = 'All'

os.makedirs(extract_folder_path, exist_ok=True)

unzip_file(zip_file_path, extract_folder_path)

In [8]:
#12.15 update sentences
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
import re

#nltk.download('punkt')

def clean_text(text):
    # Remove editorial explanations and added material
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub(r'·.*?·', '', text)    # Remove text in small dots
    text = re.sub(r'•', '', text)        # Remove bullets
    text = re.sub(r'\.\s*\.\s*\.\s*\.', '', text)  # Remove ellipses
    return text

def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages)
        for page_num in range(num_pages):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:  # Ensure there is text on the page
                cleaned_text = clean_text(page_text)
                text += cleaned_text
    return text

from nltk.tokenize import sent_tokenize

def preprocess_text(text):
    repeated_clauses = ['ESSAYS ON SUICIDE AND THE IMMORTALITY OF THE SOUL',
                        'ESSAY II. ON THE IMMORTALITY OF THE SOUL.',
                        '"Enquiry Concerning Human Understanding"',
                        'David Hume',
                        'Online Library of Liberty: Essays Moral, Political, Literary (LF ed.)',
                        'PLL v6.0 (generated September, 2011)',
                        'http://oll.libertyfund.org/title/704',
                        'Dialogues concerning Natural Religion',
                        'Pamphilus to Hermippus']

    for clause in repeated_clauses:
        text = re.sub(clause, '', text)

    # Split the text into sentences
    sentence_tokens = sent_tokenize(text)

    return sentence_tokens

# Rest of your code remains the same

def read_pdfs_into_dict(folder_path):
    pdf_dict = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(file_path)
            preprocessed_text = preprocess_text(text)
            pdf_dict[filename] = preprocessed_text
    return pdf_dict

# Replace 'your_folder_path' with the actual path to your folder containing PDF files
folder_path = 'All/All'
pdf_text_dict = read_pdfs_into_dict(folder_path)

#sentences = []
text_data = open('Sentences.txt', 'w')

for filename, segmented_text in pdf_text_dict.items():
    print(f"File: {filename}")
    for i, segment in enumerate(segmented_text):
        #sentences.append(segment)
        text_data.write(segment)

text_data.close()

File: Enquiry_Moral.pdf
File: Peg.pdf
File: Dissertations.pdf
File: Enquiry_Understanding.pdf
File: Essay_Soul.pdf
File: Dialogues.pdf
File: Life.pdf
File: Essay_Moral.pdf
File: Letter.pdf
File: History.pdf
File: Treatise.pdf


In [1]:
# Model Training
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [2]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset


def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)

  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [1]:
# you need to set parameters
train_file_path = "Sentences.txt"
model_name = 'gpt2-large'
output_dir = 'gpt2-large_result'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 5.0
save_steps = 500

In [4]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,2.8579
1000,2.3537
1500,2.2353
2000,2.1562
2500,1.9059
3000,1.8122
3500,1.8112
4000,1.8235
4500,1.6247
5000,1.4955


In [2]:
# Inference
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [3]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model

def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

def generate_text(sequence, max_length):
    model_path = output_dir
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        #do_sample=True,
        max_length=max_length,
        num_beams = 5,
        no_repeat_ngram_size = 5,
        pad_token_id=model.config.eos_token_id,
        #top_k=50,
        #top_p=0.95,
        early_stopping = True
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [1]:
# Simran's code
def read_prompt(f):
    output = ""
    first = True
    while True:
        line = f.readline()
        line = line.strip()
        if line == "||":
            output += "\n"
            return line, output
        elif not line:
            return line, output
        # start of a new prompt
        if first:
            output = "I have a question: "
            output += line
            output += " and I find these texts to be relevant: "
            first = False
        else:
            output += line
            output += " "
            
with open("prompts.txt", "r") as f:
    output1 = ""
    while True:
        line, prompt = read_prompt(f)
        output1 += prompt
        #print(line, prompt)
        if not line:
            break
    with open("output.txt", "w") as f_out:
        f_out.write(output1)

In [4]:
# Generating from prompts (fine-tuned model)
model_path = output_dir
model = load_model(model_path)
tokenizer = load_tokenizer(model_path)

max_length = 500

def finetune_generate(sequence):
    ids = tokenizer.encode(sequence, return_tensors = 'pt')
    output = model.generate(
        ids,
        max_length=max_length,
        num_beams = 5,
        no_repeat_ngram_size = 5,
        pad_token_id=model.config.eos_token_id,
        early_stopping = True
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [5]:
prompts_file = open('short_prompts.txt', 'r')
prompts_list = prompts_file.readlines()
prompts_file.close()

generated_finetune = open('generated_finetuned.txt', 'w')

for item in prompts_list:
    finetuned_text = finetune_generate(item)
    spaced_finetuned = finetuned_text.replace("\n", "").replace(".", ". ").replace("?", "? ").replace("!", "! ")
    generated_finetune.write(spaced_finetuned + "\n")

generated_finetune.close()

In [6]:
# Generating from prompts (base model)
gpt_size = 'gpt2-large'

standard_tokenizer = GPT2Tokenizer.from_pretrained(gpt_size)
standard_model = GPT2LMHeadModel.from_pretrained(gpt_size , pad_token_id = standard_tokenizer.eos_token_id )
standard_tokenizer.decode(standard_tokenizer.eos_token_id)

def standard_generate(sequence):
    input_ids = standard_tokenizer.encode(sequence , return_tensors = 'pt')
    output = standard_model.generate(input_ids, max_length = max_length, num_beams = 5,no_repeat_ngram_size  = 5 , early_stopping = True)
    
    return standard_tokenizer.decode(output[0] , skip_special_tokens = True)

In [7]:
generated_base = open('generated_base.txt', 'w')

for item in prompts_list:
    base_text = standard_generate(item)
    spaced_base = base_text.replace("\n", "").replace(".", ". ").replace("?", "? ").replace("!", "! ")
    generated_base.write(spaced_base + "\n")

generated_base.close()