In [4]:
import fitz  

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    
    full_text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)  
        full_text += page.get_text("text")  
    
    return full_text

pdf_path = r"D:\hugging_face\perfix-tuning\A Game Of Thrones - George R. R. Martin (1).pdf"
text = extract_text_from_pdf(pdf_path)

with open("game_of_thrones_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

print(f"Extracted text length: {len(text)} characters.")


Extracted text length: 1614019 characters.


In [6]:
import re

def clean_text(text):
    text = re.sub(r'\n\d+\n', '\n', text)
    
    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\'"-]', '', text)
    
    return text

cleaned_text = clean_text(text)

with open("cleaned_game_of_thrones_text.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Cleaned text length: {len(cleaned_text)} characters.")


Cleaned text length: 1558962 characters.


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

model_name = r"D:\hugging_face\llama_3.2-1b_Model"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    load_in_8bit=False,  
    torch_dtype=torch.float16,  
    device_map="auto"
).to(device)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,  
    lora_alpha=32, 
    target_modules=["q_proj", "v_proj"],  
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

def load_got_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        texts = f.read().split('\n\n')
    return [{"text": text} for text in texts if text.strip()]

dataset = load_got_dataset(r'D:\hugging_face\perfix-tuning\cleaned_game_of_thrones_text.txt')
dataset = Dataset.from_list(dataset)

def tokenize_function(examples):
    return tokenizer(
        examples['text'], 
        truncation=True, 
        max_length=512, 
        padding='max_length'
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False  
)

training_args = TrainingArguments(
    output_dir='./got_llama_lora_model',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
    logging_dir='./logs',
    save_strategy='steps',
    save_steps=500,
    fp16=True,  
    dataloader_num_workers=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()

model.save_pretrained('./got_llama_lora_model')

Using device: cuda


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Step,Training Loss


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

model_name = r"D:\hugging_face\llama_3.2-1b_Model"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

base_model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16,
    device_map="auto"
)

peft_model_id = "./got_llama_lora_model"
model = PeftModel.from_pretrained(base_model, peft_model_id)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def generate_text(prompt, max_length=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_length=max_length, 
            num_return_sequences=1,
            temperature=0.7,  
            top_p=0.9,        
            no_repeat_ngram_size=2
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

prompts = [
    "In the cold lands beyond the Wall, a lone ranger",
    "The winter winds howled across the battlements of Winterfell",
    "As the dragons circled overhead, the last Targaryen whispered",
    "In the shadowy halls of King's Landing, a conspiracy was brewing"
]

for prompt in prompts:
    try:
        result = generate_text(prompt)
        print(f"\nPrompt: {prompt}\nGenerated Text:\n{result}")
        print("\n" + "-"*50 + "\n")
    except Exception as e:
        print(f"An error occurred with prompt '{prompt}': {e}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Prompt: In the cold lands beyond the Wall, a lone ranger
Generated Text:
In the cold lands beyond the Wall, a lone ranger has been roaming the woods for months. He has come across a mysterious and terrible place where the world has changed and where everything has become more dangerous. The ranger knows that he must get back to his people, but he is not sure how to get there. Can he find his way home?
This is a story about a boy who has to overcome his fears and find a way to reach home.
This book is for readers who like to read about adventure and mystery.
It has a great plot, interesting characters, and an ending that will leave you wanting more!
The story is written in a simple and easy to understand language, so even young readers can enjoy it.
The illustrations are beautiful and will make you want to sit down and read the whole book!
There are also some nice touches like the fact that the boy has his own notebook which he uses to write down his thoughts and feelings as he travels

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Prompt: The winter winds howled across the battlements of Winterfell
Generated Text:
The winter winds howled across the battlements of Winterfell, but Bran Stark still had one more winter to endure before he could put the Night King to rest.
Winter is coming, and this time it’s not just the White Walkers who are coming to claim the Seven Kingdoms. The Night’s Watch has been infiltrated by the dead, the living, those undead, or the undead like. It’s a war without end. With a new threat in the form of the Mad King, a threat that will only grow as the winter progresses, Bran must fight to save his home and the people he loves.

--------------------------------------------------



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Prompt: As the dragons circled overhead, the last Targaryen whispered
Generated Text:
As the dragons circled overhead, the last Targaryen whispered to his queen, “Do you remember how you felt when you first met your husband? Do you recall the first time you laid eyes on him?”
The queen looked at her husband, and he smiled, a smile that was not a real smile. “I remember it like it was yesterday,” she said. Then she turned to the prince, who was leaning against the wall, watching the scene unfold with interest. She spoke to him softly, but the words were clear enough for everyone to hear.  “The first thing I noticed about my husband was his eyes. They were the color of the dragon’s eyes, fire and flame. He was beautiful, like a god.”
The prince smiled at his mother. His eyes were as blue as the sky, just like his father’s. And the fire in his own eyes was just as bright as his dragon father had been. But he didn’t look at the queen

--------------------------------------------------


P