In [None]:
!pip install -q accelerate peft bitsandbytes transformers trl sentencepiece

# Data Preprocessing


In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSeq2SeqLM, pipeline
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load dataset
# Replace with your dataset name
dataset = load_dataset('FareedKhan/1k_stories_100_genre')

# Load NER pipeline
model_name = "dslim/bert-base-NER"  # Pre-trained NER model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner", model=model, tokenizer=tokenizer, grouped_entities=True, device=0)

# Function to extract characters using NER


def extract_characters(example):
    story = example['story']
    entities = ner_pipeline(story)  # Run NER on the story
   # Filter out only PERSON entities and ensure uniqueness
    characters = list(
        set(entity['word'] for entity in entities if entity['entity_group'] == 'PER'))
    return {"characters": characters}


# Apply the function to add the characters column
dataset = dataset.map(extract_characters)

# Inspect the updated dataset
print(dataset)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'story', 'genre', 'characters'],
        num_rows: 1000
    })
})


In [3]:
dataset['train']['characters'][:5]

[['Evelyn',
  'Eve',
  'Art',
  'Reynolds',
  'Hart',
  'John',
  'Turner',
  'Amelia Hart',
  'Arthur',
  'Simmons',
  'Jack'],
 ['Thorn', 'O'],
 ['Johnath', 'John'],
 ['Blackwood', 'Thomas', 'William', 'Elias'],
 ['Ben', 'Katie', 'Sarah', 'Mark', 'Alex']]

In [4]:
def format_prompt(example):
    """Create a structured and guiding prompt for story generation."""

    # Extract data from the example
    genre = example["genre"]
    # Join characters into a single string
    characters = ", ".join(example["characters"])
    story = example["story"]
    title = example["title"]

    # Create the prompt text with a guiding introduction
    prompt = (
        "Create a story based on the given genre and title. "
        f"Genre: {genre}\n"
        f"Title:\n{title}\n\n"
        "Ensure the output includes the following in order:\n"
        "1. Characters\n"
        "2. The story\n"
        "Output:\n"
        f"Characters: {characters}\n"
        f"Story: {story}\n"
    )

    return {"text": prompt}


# Apply the formatting to the dataset
formatted_dataset = dataset.map(format_prompt)

In [5]:
print(formatted_dataset['train']['text'][:5])

['Create a story based on the given genre and title. Genre: Science Fiction\nTitle:\nThe Chronicles of the Cosmic Rift\n\nEnsure the output includes the following in order:\n1. Characters\n2. The story\nOutput:\nCharacters: Evelyn, Eve, Art, Reynolds, Hart, John, Turner, Amelia Hart, Arthur, Simmons, Jack\nStory: In the year 2250, Earth had made significant strides in space exploration and interstellar travel. The United Earth Government (UEG) had established colonies on Mars, Jupiter\'s moon Europa, and Saturn\'s moon Titan. The advancements in technology and science had led to the creation of the Cosmic Rift Exploration Agency (CREA), a government-funded organization tasked with exploring the unknown regions of space and discovering new worlds and resources.\n\n    Dr. Amelia Hart, a brilliant astrophysicist, was the lead scientist at CREA\'s headquarters on Luna. She had devoted her entire life to understanding the mysteries of the universe and had become a pioneer in her field. She

In [6]:
formatted_dataset.save_to_disk("./datasets/processed_stories")

Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 18200.65 examples/s]


In [2]:
from datasets import load_from_disk

# Load the dataset from the saved directory
# Replace with the path you used earlier
dataset = load_from_disk("./datasets/processed_stories")

# Inspect the loaded dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'story', 'genre', 'characters', 'text'],
        num_rows: 1000
    })
})


In [3]:
dataset = dataset.filter(lambda example, index: index %
                         100 == 0, with_indices=True)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'story', 'genre', 'characters', 'text'],
        num_rows: 10
    })
})


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 4-bit quantization configuration - Q in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit precision model loading
    bnb_4bit_quant_type="nf4",  # Quantization type
    bnb_4bit_compute_dtype="float16",  # Compute dtype
    bnb_4bit_use_double_quant=True,  # Apply nested quantization
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the model to train on the GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # Leave this out for regular SFT
    quantization_config=bnb_config,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "right"

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Prepare LoRA Configuration
# peft_config = LoraConfig(
#     lora_alpha=32,  # LoRA Scaling
#     lora_dropout=0.1,  # Dropout for LoRA Layers
#     r=64,  # Rank
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=[
#         # Self-attention projection layers
#         'self_attn.q_proj',
#         'self_attn.k_proj',
#         'self_attn.v_proj',
#         'self_attn.o_proj',

#         # MLP layers
#         'mlp.gate_proj',
#         'mlp.up_proj',
#         'mlp.down_proj'
#     ]
# )

peft_config = LoraConfig(
    lora_alpha=32,  # LoRA Scaling
    lora_dropout=0.1,  # Dropout for LoRA Layers
    r=64,  # Rank
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=# Layers to target
    ['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [7]:
from transformers import TrainingArguments

output_dir = "./results"

# Training arguments
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True
)

In [8]:
for name, module in model.named_modules():
    print(name)


base_model
base_model.model
base_model.model.model
base_model.model.model.embed_tokens
base_model.model.model.layers
base_model.model.model.layers.0
base_model.model.model.layers.0.self_attn
base_model.model.model.layers.0.self_attn.q_proj
base_model.model.model.layers.0.self_attn.q_proj.base_layer
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout.default
base_model.model.model.layers.0.self_attn.q_proj.lora_A
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default
base_model.model.model.layers.0.self_attn.q_proj.lora_B
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_A
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_B
base_model.model.model.layers.0.self_attn.q_proj.lora_magnitude_vector
base_model.model.model.layers.0.self_attn.k_proj
base_model.model.model.layers.0.self_attn.k_proj.base_layer
base_model.mode

In [9]:
from trl import SFTTrainer

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=512,

    # Leave this out for regular SFT
    peft_config=peft_config,
)

# Train model
trainer.train()

# Save QLoRA weights
trainer.model.save_pretrained("Mistral-7B-Instruct-v0.3-QLoRA")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 10/10 [00:00<00:00, 231.01 examples/s]


Step,Training Loss


In [10]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    "Mistral-7B-Instruct-v0.3-QLoRA",
    low_cpu_mem_usage=True,
    device_map="auto",
    offload_folder="./offload"
)

# Merge LoRA and base model
merged_model = model.merge_and_unload()

Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [None]:
# Save the merged model
merged_model.save_pretrained("Mistral-7B-Instruct-v0.3-Merged")
tokenizer.save_pretrained("Mistral-7B-Instruct-v0.3-Merged")

In [None]:
from transformers import pipeline

# Use our predefined prompt template
prompt = """Create a story based on the given genre and title.
"""

# Run our instruction-tuned model
pipe = pipeline(task="text-generation",
                model=merged_model, tokenizer=tokenizer)
print(pipe(prompt))

In [None]:
prompt = """Create a story based on the given genre and title.
"""

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Move to GPU
outputs = merged_model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))