In [44]:
import PyPDF2

In [45]:
# pdf_file= "IntelligentInvestor.pdf"
pdf_file = "chapter8.pdf"
with open(pdf_file, 'rb') as file: 
    reader = PyPDF2.PdfReader(file)
    text = ''
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        text += page.extract_text()

with open('book_text.txt', 'w', encoding='utf-8') as f:
    f.write(text)


# 2. Preprocess the Data

Now that we have the raw text, we'll preprocess it to prepare for fine tuning

We will : 
* **Tokenization** : Break text into sentences and words
* **Cleaning** : Remove unwanted character, numbers and symbols
* **Normalization** : Convert text to lowercase

In [46]:
import nltk 
import re

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abdel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [58]:
with open('book_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [59]:
def clean_text(text):
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'[^a-zA-Z0-9.,!?\'\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [60]:
cleaned_text = clean_text(text)

In [61]:
#Tokenize : 

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(cleaned_text)

In [62]:
with open('preprocessed_text.txt', 'w', encoding='utf-8') as f:
    for sentence in sentences : 
        f.write(sentence+'\n')

# Supervized Fine-Tuning (SFT)

In [63]:
from datasets import load_dataset, Dataset
dataset = Dataset.from_dict({"text": sentences})

In [64]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [65]:
def tokenize_fn(examples):
    model_inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    model_inputs["labels"] = model_inputs["input_ids"][:]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_fn, batched=True)


[A


Map: 100%|██████████| 444/444 [00:00<00:00, 3762.25 examples/s]


### Fine Tuning with SFT

In [66]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)

In [67]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained('gpt2')

args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset = split_dataset['train'],
    eval_dataset = split_dataset['test']
)

In [68]:
trainer.train()

  6%|▌         | 354/5679 [39:31<9:54:26,  6.70s/it]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  4%|▍         | 10/255 [12:59<25:24,  6.22s/it]
[A

{'eval_loss': 0.1161196157336235, 'eval_runtime': 21.6854, 'eval_samples_per_second': 2.075, 'eval_steps_per_second': 0.553, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  4%|▍         | 10/255 [24:00<25:24,  6.22s/it]
[A

{'eval_loss': 0.11153853684663773, 'eval_runtime': 20.3557, 'eval_samples_per_second': 2.211, 'eval_steps_per_second': 0.59, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A                                         
                                                 
  4%|▍         | 10/255 [35:01<25:24,  6.22s/it]
[A
100%|██████████| 300/300 [33:20<00:00,  6.67s/it]

{'eval_loss': 0.11118684709072113, 'eval_runtime': 20.2802, 'eval_samples_per_second': 2.219, 'eval_steps_per_second': 0.592, 'epoch': 3.0}
{'train_runtime': 2000.7409, 'train_samples_per_second': 0.598, 'train_steps_per_second': 0.15, 'train_loss': 0.19372425079345704, 'epoch': 3.0}





TrainOutput(global_step=300, training_loss=0.19372425079345704, metrics={'train_runtime': 2000.7409, 'train_samples_per_second': 0.598, 'train_steps_per_second': 0.15, 'total_flos': 312766562304000.0, 'train_loss': 0.19372425079345704, 'epoch': 3.0})

# PEFT

In [71]:
args = TrainingArguments(
    output_dir="./model_output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_strategy="epoch",
    save_safetensors=False 
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset = split_dataset['train'],
    eval_dataset = split_dataset['test']
)

In [72]:
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

model = AutoModelForCausalLM.from_pretrained('gpt2')
model = get_peft_model(model, peft_config)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
)

trainer.train()
model.save_pretrained('peft_model')



100%|██████████| 300/300 [23:33<00:00,  4.71s/it]


{'train_runtime': 1413.2482, 'train_samples_per_second': 0.847, 'train_steps_per_second': 0.212, 'train_loss': 3.6742875162760416, 'epoch': 3.0}
