In [32]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [33]:
%pip install torch peft bitsandbytes transformers trl accelerate einops tqdm scipy

Note: you may need to restart the kernel to use updated packages.


In [34]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer

In [35]:
# Load the Napoleon Q&A JSON file from the Hugging Face Hub dataset repository.
dataset = load_dataset(
    "MH0386/napoleon_bonaparte",
    data_files="napoleon_prompt_format.json",
)

In [36]:
# Inspect the loaded DatasetDict: available splits, feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['A', 'Q'],
        num_rows: 10097
    })
})

In [37]:
import pandas as pd

# Convert the train split to a DataFrame with the dataset's own exporter rather
# than passing the Dataset object through the generic pd.DataFrame constructor.
df = dataset['train'].to_pandas()

# Display the DataFrame (pandas abbreviates the middle rows of a long frame)
df

Unnamed: 0,A,Q
0,"napoleon was born in ajaccio, corsica, on 15 a...",when and where was napoleon born?
1,napoleon began his education at autun and late...,what was napoleon's early education like?
2,napoleon was promoted to brigadier general for...,what was napoleon's role during the french rev...
3,"napoleon enjoyed a succession of victories, wh...",what did napoleon accomplish during his campai...
4,the naval defeat at aboukir bay isolated the e...,what was the outcome of napoleon's expedition ...
...,...,...
10092,born to carlo buonaparte and letizia romalino ...,explain napoleon's family background and early...
10093,during the early years of the french revolutio...,explain napoleon's involvement in the french r...
10094,"in 1799, during napoleon's campaign in egypt, ...",discuss napoleon's military campaign in egypt ...
10095,"in 1796, he defeated austria in italy, and in ...","discuss napoleon's military victories, includi..."


In [38]:
def format_row(row):
    """Render one question/answer record as a single instruction-style string.

    Args:
        row: Any mapping-like object indexable by 'Q' (question) and
            'A' (answer), e.g. a DataFrame row.

    Returns:
        The text "[INST] <question> [/INST] <answer> " (note the trailing space).
    """
    return "[INST] {} [/INST] {} ".format(row['Q'], row['A'])


# Build the prompt text for every record: format_row is called once per row
# and the results are stored in a new 'Formatted' column of df.
df['Formatted'] = df.apply(format_row, axis='columns')

# Show the newly created column
df['Formatted']

0        [INST] when and where was napoleon born? [/INS...
1        [INST] what was napoleon's early education lik...
2        [INST] what was napoleon's role during the fre...
3        [INST] what did napoleon accomplish during his...
4        [INST] what was the outcome of napoleon's expe...
                               ...                        
10092    [INST] explain napoleon's family background an...
10093    [INST] explain napoleon's involvement in the f...
10094    [INST] discuss napoleon's military campaign in...
10095    [INST] discuss napoleon's military victories, ...
10096    [INST] explain napoleon's decision to invade e...
Name: Formatted, Length: 10097, dtype: object

In [39]:
# Produce a copy of df in which the 'Formatted' column is called 'Text';
# df itself keeps its original column names.
new_df = df.rename({'Formatted': 'Text'}, axis='columns')

new_df

Unnamed: 0,A,Q,Text
0,"napoleon was born in ajaccio, corsica, on 15 a...",when and where was napoleon born?,[INST] when and where was napoleon born? [/INS...
1,napoleon began his education at autun and late...,what was napoleon's early education like?,[INST] what was napoleon's early education lik...
2,napoleon was promoted to brigadier general for...,what was napoleon's role during the french rev...,[INST] what was napoleon's role during the fre...
3,"napoleon enjoyed a succession of victories, wh...",what did napoleon accomplish during his campai...,[INST] what did napoleon accomplish during his...
4,the naval defeat at aboukir bay isolated the e...,what was the outcome of napoleon's expedition ...,[INST] what was the outcome of napoleon's expe...
...,...,...,...
10092,born to carlo buonaparte and letizia romalino ...,explain napoleon's family background and early...,[INST] explain napoleon's family background an...
10093,during the early years of the french revolutio...,explain napoleon's involvement in the french r...,[INST] explain napoleon's involvement in the f...
10094,"in 1799, during napoleon's campaign in egypt, ...",discuss napoleon's military campaign in egypt ...,[INST] discuss napoleon's military campaign in...
10095,"in 1796, he defeated austria in italy, and in ...","discuss napoleon's military victories, includi...","[INST] discuss napoleon's military victories, ..."


In [40]:
# Keep only the 'Text' column (still as a one-column DataFrame, not a Series).
new_df = new_df.loc[:, ['Text']]

In [41]:
# Preview the first three rows of the text-only frame.
new_df.head(3)

Unnamed: 0,Text
0,[INST] when and where was napoleon born? [/INS...
1,[INST] what was napoleon's early education lik...
2,[INST] what was napoleon's role during the fre...


In [42]:
# Write the single-column frame to CSV without the index column, so that it
# can be reloaded from disk as a dataset further down.
new_df.to_csv('formatted_data.csv', index=False)

In [43]:
# Read the exported CSV back into pandas; in the visible cells this frame is
# only previewed, as a check of the export.
final_df = pd.read_csv("formatted_data.csv")

In [44]:
# Preview the first two rows read back from the CSV.
final_df.head(2)

Unnamed: 0,Text
0,[INST] when and where was napoleon born? [/INS...
1,[INST] what was napoleon's early education lik...


In [45]:
# Load the exported CSV as a Dataset; split="train" returns the Dataset itself
# rather than a DatasetDict wrapper.
training_dataset = load_dataset(
    "csv",
    data_files="formatted_data.csv",
    split="train",
)

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [46]:
# Show the features and row count of the dataset that is handed to the trainer.
training_dataset

Dataset({
    features: ['Text'],
    num_rows: 10097
})

In [47]:
# Hub id of the base checkpoint, and the name under which the LoRA adapter is
# later looked up (a later cell calls PeftModel.from_pretrained(..., new_model)).
base_model = "microsoft/phi-2"
# NOTE(review): the adapter name says "mental-health" although the training data
# is the Napoleon Q&A set -- presumably a leftover from another notebook. It is
# kept unchanged here because other cells refer to it through this variable.
new_model = "phi-2-mental-health"

# Tokenizer: reuse EOS as the padding token and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit NF4 quantization with fp16 compute and no nested (double) quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model onto GPU 0. The checkpoint is pinned to
# revision "refs/pr/23" and executed with the repository's remote code;
# flash_attn / flash_rotary / fused_dense are presumably options consumed by
# that remote modeling code -- TODO confirm against the pinned revision.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # use_flash_attention_2=True, # Phi does not support yet.
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)

# Disable the generation cache for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Prepare the quantized model for k-bit training with gradient checkpointing on.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Evaluation is intentionally not configured: no eval_dataset is passed to the
# trainer below, and requesting step-wise evaluation without one makes the
# Trainer raise. Add an eval split and an evaluation strategy together if
# validation metrics are wanted.
# NOTE(review): with 10,097 rows and an effective batch of 2 x 32 = 64, three
# epochs amount to roughly 470 optimizer steps, so save_steps=2000 is never
# reached and no intermediate checkpoint is written to output_dir.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    logging_steps=15,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_steps=2000,
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_steps=-1
)

# LoRA adapters on the fused QKV projection and both MLP layers.
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "fc1", "fc2"]
    # ["Wqkv", "out_proj", "fc1", "fc2" ], - 41M params
    # modules_to_save=["embed_tokens","lm_head"]
)

# Supervised fine-tuning on the 'Text' column of the CSV-backed dataset.
trainer = SFTTrainer(
    model=model,
    train_dataset=training_dataset,
    peft_config=peft_config,
    dataset_text_field="Text",
    max_seq_length=690,
    tokenizer=tokenizer,
    args=training_arguments,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Run fine-tuning, then persist the LoRA adapter under `new_model`: the merge
# cell later loads it from that location via PeftModel.from_pretrained, and
# nothing else in the notebook writes it there.
train_result = trainer.train()
trainer.model.save_pretrained(new_model)

# Keep the training summary as the cell output.
train_result

In [None]:
from transformers import pipeline

In [None]:
# Sanity-check generation with the fine-tuned model, wrapping the prompt in the
# same [INST] ... [/INST] markers used to build the training text.
# NOTE(review): the prompt is off-domain for the Napoleon training data --
# presumably carried over from another notebook; confirm it is intended.
prompt = "I am not able to sleep in night. Do you have any suggestions?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=250,
)
result = pipe("[INST] {} [/INST]".format(prompt))
print(result[0]['generated_text'])

In [None]:
import gc

# Drop the notebook's references to the training-time objects so that they
# become collectable before the base model is loaded again.
del model, pipe, trainer

# Two garbage-collection passes; the second call's return value is the cell output.
gc.collect()
gc.collect()

In [None]:
# Hub id of the base checkpoint, used by the merge cell below.
model_name = "microsoft/phi-2"

In [None]:
from peft import PeftModel

# Reload the base model in FP16 and merge the LoRA weights into it.
# The adapter was trained against the remote-code checkpoint pinned at revision
# "refs/pr/23" (LoRA targets "Wqkv", "fc1", "fc2"), so the same revision and
# trust_remote_code setting are used here. Loading a different implementation
# of the model risks module names that no longer match the adapter.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='cuda',
    trust_remote_code=True,
    revision="refs/pr/23",
)
# Attach the adapter stored under `new_model`, then fold it into the base weights.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"