<a href="https://colab.research.google.com/github/Fulim13/Storyteller/blob/test/api/fine-tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate peft bitsandbytes transformers trl sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the f

# Data Preprocessing


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSeq2SeqLM, pipeline
from datasets import load_dataset

In [None]:

# Load dataset
# Replace with your dataset name
dataset = load_dataset('FareedKhan/1k_stories_100_genre')

# Load NER pipeline
model_name = "dslim/bert-base-NER"  # Pre-trained NER model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0)

# Function to extract characters using NER


def extract_characters(example):
    story = example['story']
    entities = ner_pipeline(story)  # Run NER on the story
   # Filter out only PERSON entities and ensure uniqueness
    characters = list(
        set(entity['word'] for entity in entities if entity['entity_group'] == 'PER'))
    return {"characters": characters}


# Apply the function to add the characters column
dataset = dataset.map(extract_characters)

# Inspect the updated dataset
print(dataset)

In [4]:
dataset['train']['characters'][:5]

[['Turner',
  'Art',
  'Jack',
  'Evelyn',
  'Simmons',
  'Arthur',
  'Reynolds',
  'John',
  'Hart',
  'Amelia Hart',
  'Eve'],
 ['Thorn', 'O'],
 ['Johnath', 'John'],
 ['William', 'Blackwood', 'Thomas', 'Elias'],
 ['Ben', 'Mark', 'Katie', 'Sarah', 'Alex']]

In [5]:
def format_prompt(example):
    """Create a structured and guiding prompt for story generation."""

    # Extract data from the example
    genre = example["genre"]
    # Join characters into a single string
    characters = ", ".join(example["characters"])
    story = example["story"]
    title = example["title"]

    # Create the prompt text with a guiding introduction
    prompt = (
        "Create a story based on the given genre and title. "
        f"Genre: {genre}\n"
        f"Title:\n{title}\n\n"
        "Ensure the output includes the following in order:\n"
        "1. Characters\n"
        "2. The story\n"
        "Output:\n"
        f"Characters: {characters}\n"
        f"Story: {story}\n"
    )

    return {"text": prompt}


# Apply the formatting to the dataset
formatted_dataset = dataset.map(format_prompt)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
print(formatted_dataset['train']['text'][:5])

['Create a story based on the given genre and title. Genre: Science Fiction\nTitle:\nThe Chronicles of the Cosmic Rift\n\nEnsure the output includes the following in order:\n1. Characters\n2. The story\nOutput:\nCharacters: Turner, Art, Jack, Evelyn, Simmons, Arthur, Reynolds, John, Hart, Amelia Hart, Eve\nStory: In the year 2250, Earth had made significant strides in space exploration and interstellar travel. The United Earth Government (UEG) had established colonies on Mars, Jupiter\'s moon Europa, and Saturn\'s moon Titan. The advancements in technology and science had led to the creation of the Cosmic Rift Exploration Agency (CREA), a government-funded organization tasked with exploring the unknown regions of space and discovering new worlds and resources.\n\n    Dr. Amelia Hart, a brilliant astrophysicist, was the lead scientist at CREA\'s headquarters on Luna. She had devoted her entire life to understanding the mysteries of the universe and had become a pioneer in her field. She

In [7]:
formatted_dataset.save_to_disk("./datasets/processed_stories")

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
from datasets import load_from_disk

# Load the dataset from the saved directory
# Replace with the path you used earlier
dataset = load_from_disk("./datasets/processed_stories")

# Inspect the loaded dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'story', 'genre', 'characters', 'text'],
        num_rows: 1000
    })
})


In [9]:
dataset = dataset.filter(lambda example, index: index %
                         100 == 0, with_indices=True)
print(dataset)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'story', 'genre', 'characters', 'text'],
        num_rows: 10
    })
})


In [14]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from datasets import load_dataset

# Load dataset with specified split
dataset = load_dataset('FareedKhan/1k_stories_100_genre', split="train")

# Shuffle and select a range
dataset = dataset.shuffle(seed=42).select(range(10))

# Load NER pipeline
model_name = "dslim/bert-base-NER"  # Pre-trained NER model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0)

# Function to extract characters using NER
def extract_characters(example):
    story = example['story']
    entities = ner_pipeline(story)  # Run NER on the story
    # Filter out only PERSON entities and ensure uniqueness
    characters = list(
        set(entity['word'] for entity in entities if entity['entity_group'] == 'PER'))
    return {"characters": characters}

# Apply the function to add the characters column
dataset = dataset.map(extract_characters)

# Function to format the prompt
def format_prompt(example):
    """Create a structured and guiding prompt for story generation."""
    # Extract data from the example
    genre = example["genre"]
    characters = ", ".join(example["characters"])
    story = example["story"]
    title = example["title"]

    # Create the prompt text with a guiding introduction
    prompt = (
        "Create a story based on the given genre and title. "
        f"Genre: {genre}\n"
        f"Title:\n{title}\n\n"
        "Ensure the output includes the following in order:\n"
        "1. Characters\n"
        "2. The story\n"
        "Output:\n"
        f"Characters: {characters}\n"
        f"Story: {story}\n"
    )

    return {"text": prompt}

# Apply formatting and retain only {"text": prompt}
dataset = dataset.map(format_prompt)

# Inspect the final dataset
print(dataset)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Dataset({
    features: ['id', 'title', 'story', 'genre', 'characters', 'text'],
    num_rows: 10
})


In [15]:
from google.colab import userdata
HUGGINGFACE_TOKEN = userdata.get('HF_TOKEN')

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "meta-llama/Llama-3.1-8B"

# 4-bit quantization configuration - Q in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4-bit precision model loading
    bnb_4bit_quant_type="nf4",  # Quantization type
    bnb_4bit_compute_dtype="float16",  # Compute dtype
    bnb_4bit_use_double_quant=True,  # Apply nested quantization
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the model to train on the GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # Leave this out for regular SFT
    quantization_config=bnb_config,
    token=HUGGINGFACE_TOKEN
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "right"

In [6]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

peft_config = LoraConfig(
    lora_alpha=32,  # LoRA Scaling
    lora_dropout=0.1,  # Dropout for LoRA Layers
    r=32,  # Rank
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [7]:
model.get_nb_trainable_parameters()

(83886080, 8114147328)

In [8]:
from transformers import TrainingArguments

output_dir = "./results"

# Training arguments
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=1,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True
)

In [None]:
for name, module in model.named_modules():
    print(name)

In [17]:
print(dataset["train"][0])  # Inspect the first example in the dataset

{'id': 457580, 'title': 'The Chronicles of the Cosmic Rift', 'story': 'In the year 2250, Earth had made significant strides in space exploration and interstellar travel. The United Earth Government (UEG) had established colonies on Mars, Jupiter\'s moon Europa, and Saturn\'s moon Titan. The advancements in technology and science had led to the creation of the Cosmic Rift Exploration Agency (CREA), a government-funded organization tasked with exploring the unknown regions of space and discovering new worlds and resources.\n\n    Dr. Amelia Hart, a brilliant astrophysicist, was the lead scientist at CREA\'s headquarters on Luna. She had devoted her entire life to understanding the mysteries of the universe and had become a pioneer in her field. She was determined to uncover the secrets of the cosmic rifts, a series of mysterious and seemingly unconnected energy anomalies that had started appearing throughout the galaxy.\n\n    Dr. Hart assembled a diverse team of experts for her next mis

In [9]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

In [18]:

# Tokenize the dataset
def tokenize_function(example):
    return tokenizer(
        example["story"],  # Use the 'story' field for tokenization
        padding="max_length",
        truncation=True,
        max_length=8000,
    )

# Apply the tokenizer to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Verify tokenized dataset
print(tokenized_dataset["train"][0])  # Check the tokenized output

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

{'id': 457580, 'title': 'The Chronicles of the Cosmic Rift', 'story': 'In the year 2250, Earth had made significant strides in space exploration and interstellar travel. The United Earth Government (UEG) had established colonies on Mars, Jupiter\'s moon Europa, and Saturn\'s moon Titan. The advancements in technology and science had led to the creation of the Cosmic Rift Exploration Agency (CREA), a government-funded organization tasked with exploring the unknown regions of space and discovering new worlds and resources.\n\n    Dr. Amelia Hart, a brilliant astrophysicist, was the lead scientist at CREA\'s headquarters on Luna. She had devoted her entire life to understanding the mysteries of the universe and had become a pioneer in her field. She was determined to uncover the secrets of the cosmic rifts, a series of mysterious and seemingly unconnected energy anomalies that had started appearing throughout the galaxy.\n\n    Dr. Hart assembled a diverse team of experts for her next mis

In [10]:
from trl import SFTTrainer
import os
from google.colab import output

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=8000,

    # Leave this out for regular SFT
    peft_config=peft_config,
)

# Train model
trainer.train()

trainer.model.save_pretrained("Llama-3.1-8B-qlora")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss


In [11]:
for name, module in trainer.model.named_modules():
    print(name)



base_model
base_model.model
base_model.model.model
base_model.model.model.embed_tokens
base_model.model.model.layers
base_model.model.model.layers.0
base_model.model.model.layers.0.self_attn
base_model.model.model.layers.0.self_attn.q_proj
base_model.model.model.layers.0.self_attn.q_proj.base_layer
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout.default
base_model.model.model.layers.0.self_attn.q_proj.lora_A
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default
base_model.model.model.layers.0.self_attn.q_proj.lora_B
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_A
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_B
base_model.model.model.layers.0.self_attn.q_proj.lora_magnitude_vector
base_model.model.model.layers.0.self_attn.k_proj
base_model.model.model.layers.0.self_attn.k_proj.base_layer
base_model.mode

In [12]:
merged_model = trainer.model.merge_and_unload()



In [None]:
from peft import AutoPeftModelForCausalLM


model = AutoPeftModelForCausalLM.from_pretrained(
  "Llama-3.1-8B-qlora",
    low_cpu_mem_usage=True,
    device_map="auto",
    offload_folder = "offload"
)

# Merge LoRA and base model
merged_model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



KeyError: 'base_model.model.model.model.layers.20.input_layernorm'

In [13]:
from transformers import pipeline

# Use our predefined prompt template
prompt = """Create a story based on the given genre and title.
"""

# Run our instruction-tuned model
pipe = pipeline(task="text-generation", model=merged_model, tokenizer=tokenizer)
print(pipe(prompt)[0]["generated_text"])

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Create a story based on the given genre and title.
The story should be at least 500
