In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import transformers
import torch
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AutoTokenizer, TextStreamer
import pandas as pd

2023-10-12 03:38:21.739624: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-12 03:38:21.763189: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Quantized Model w/ PEFT

In [2]:
# Hugging Face repo id of the base checkpoint to fine-tune.
# NOTE(review): the previous comment credited "vilsonrodrigues" for the sharded
# model, but the repo id is under "Trelis" - confirm the attribution.
model_id = "Trelis/Llama-2-7b-chat-hf-sharded-bf16-5GB"

# Reuse the end-of-sequence token as the padding token so that the padded,
# batched tokenization done later in the notebook has a pad token to use.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; bfloat16 is the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map={"": 0} maps the whole model to device index 0.
load_kwargs = dict(
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [3]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing for the quantized base model, then hand the
# model to PEFT's k-bit training preparation helper. `model` is rebound to the
# helper's return value, so later cells operate on the prepared model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [4]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters will receive gradients.

    Walks ``model.named_parameters()`` once, counting elements of every
    parameter and, separately, of those with ``requires_grad`` set, then
    prints both totals together with the trainable share as a percentage.
    """
    # (element count, is trainable) for each parameter, collected in one pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]

    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    percentage = 100 * trainable_params / all_param

    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [6]:
from peft import LoraConfig, get_peft_model

# Names of the projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]

# LoRA hyper-parameters: rank 16, alpha 32, 5% adapter dropout, no bias terms
# trained, configured for causal language modelling.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the prepared base model with the adapters and report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 39976960 || all params: 3540389888 || trainable%: 1.1291682911958425


# Load Dataset (training data only; no Train/Val split is performed in this notebook)
# Load Dataset

In [8]:
# Read the training spreadsheet, then discard its first two columns.
# The columns are selected by position, not by name.
train_dataset = pd.read_excel('./jsonTrainDatasetFinal.xlsx')
leading_columns = train_dataset.columns[[0, 1]]
train_dataset = train_dataset.drop(columns=leading_columns)
train_dataset.head()

Unnamed: 0,script,json
0,"Once, in a small coastal town named Cresthaven...","{\n ""Scene 1"": {\n ""Actions"": [\n ""In..."
1,"Title: ""The Chronicles of Lumina""\n\nOnce upon...","\n{\n\n ""Scene 1"": {""Actions"":[""Seraphina w..."
2,"Title: ""The Forgotten Melody""\n\nIn the heart ...","\n{\n\n ""Scene 1"": {""Actions"":[""Amelia ente..."
3,"Title: ""The Time Traveler's Dilemma""\n\nIn the...","\n{\n\n ""Scene 1"": {""Actions"":[""Lucas finds..."
4,"Title: ""The Lighthouse Keeper's Promise""\n\nTh...","\n{\n\n ""Scene 1"": {""Actions"":[""Waves crash..."


In [9]:
# Fixed instruction part of every training example: system prompt, task
# description and an example of the expected JSON layout. The story text is
# appended directly after this string.
PROMPT_HEAD = """[INST]
        <<SYS>>\n You are a bot that reads a story and returns a JSON file containing the actions performed throughout the story along with the environmented they take place in divided by scene.
        Divide the scenes based on the change in environment.
        Make sure you stop creating when you conclude the story. Do not make up your own story.
        Always use the names of the characters when you mention them, avoid any ambiguity.<</SYS>>\n
        Create a JSON file containing the description of dividing the story into scenes, and each scene containing the actions being performed throughout the story(which actors are performing what actions, what are their reactions etc.), along with the environment name(City, Location etc.). 
        Divide the scenes based on the change in environment.
        Always use the names of the characters, to avoid ambiguity, describing all the characters that are taking part in any particular action.
        {{#block hidden = True}}
        This is the example format of the JSON-
        {"Scene 1": {
            "Actions": ["John was sitting at the table",
                        "John was having a chat with Smith",
                        "Smith was jumping up and down the table"
                        ],
            "Env" : "Dining Room"
            },
            "Scene 2": {
            "Actions": ["John was driving his car",
                        "John met with an accident",
                        "Smith in the passenger seat flew out of the car"
                        ],
            "Env" : "City"
            },
        }
        Create the JSON for this Story = """

# Text placed between the story and the target JSON: closes the instruction
# block and opens the fenced answer. Spelled with explicit escapes so that the
# leading space and the indentation are visible.
PROMPT_TAIL = (
    " \n"
    "        [/INST]\n"
    "        Here is the JSON file in the requested format-\n"
    "        ```json\n"
    "        "
)


def build_training_prompt(story, json_text):
    """Return one training example: instructions + story + expected JSON answer.

    The result is the instruction template, the story, the answer preamble,
    the target JSON and a closing code fence, concatenated in that order.
    """
    return PROMPT_HEAD + story + PROMPT_TAIL + json_text + "```"


# One full prompt string per spreadsheet row, in row order.
train_dataset_list = [
    build_training_prompt(story, json_text)
    for story, json_text in zip(train_dataset['script'], train_dataset['json'])
]

In [10]:
# Preview the first two assembled prompts.
train_dataset_list[:2]

['[INST]\n        <<SYS>>\n You are a bot that reads a story and returns a JSON file containing the actions performed throughout the story along with the environmented they take place in divided by scene.\n        Divide the scenes based on the change in environment.\n        Make sure you stop creating when you conclude the story. Do not make up your own story.\n        Always use the names of the characters when you mention them, avoid any ambiguity.<</SYS>>\n\n        Create a JSON file containing the description of dividing the story into scenes, and each scene containing the actions being performed throughout the story(which actors are performing what actions, what are their reactions etc.), along with the environment name(City, Location etc.). \n        Divide the scenes based on the change in environment.\n        Always use the names of the characters, to avoid ambiguity, describing all the characters that are taking part in any particular action.\n        {{#block hidden = Tru

# Tokenize Dataset

In [11]:
# Maximum number of tokens kept per training example; anything beyond is cut off.
MAX_SEQ_LENGTH = 300

# Each example is instruction template + story + target JSON, so truncation
# removes the end of the example - i.e. the JSON the model should learn to
# produce. Measure the untruncated lengths first so that this is reported
# instead of happening silently.
untruncated_lengths = [len(ids) for ids in tokenizer(train_dataset_list)["input_ids"]]
num_truncated = sum(length > MAX_SEQ_LENGTH for length in untruncated_lengths)
if num_truncated:
    print(
        f"WARNING: {num_truncated}/{len(untruncated_lengths)} training examples exceed "
        f"MAX_SEQ_LENGTH={MAX_SEQ_LENGTH} tokens (longest: {max(untruncated_lengths)}) "
        "and will be truncated."
    )

# Tokenize all prompts into padded, truncated PyTorch tensors.
train_encodings = tokenizer(
    train_dataset_list,
    truncation=True,
    padding=True,
    max_length=MAX_SEQ_LENGTH,
    return_tensors='pt',
)

# Convert to PyTorch

In [12]:
class TextDataset(Dataset):
    """Map-style dataset over tokenizer output for causal language modelling.

    ``encodings`` is a mapping from field name (it must include
    ``"input_ids"``) to an indexable batch of rows - either tensors or nested
    lists. Each item is a dict with one tensor per field plus a ``"labels"``
    entry that is a copy of ``"input_ids"``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor() on an existing tensor emits a copy-construct
                # UserWarning for every item; clone().detach() is the
                # recommended way to take an independent copy.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        # Labels start out identical to the inputs, as a separate tensor.
        item["labels"] = item["input_ids"].clone()
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [13]:
# Wrap the tokenized prompts in a TextDataset for the Trainer.
# NOTE: this rebinds `train_dataset`, which until here held the pandas
# DataFrame read from the spreadsheet; re-running the prompt-building cell
# after this one will therefore no longer see the DataFrame.
train_dataset = TextDataset(train_encodings)
#val_dataset = TextDataset(val_encodings)

# Define A Generate Function

In [14]:
def generate(input, max_new_tokens=4096):
    """Stream the model's greedy completion of ``input`` to stdout.

    Args:
        input: Prompt text to tokenize and feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 4096, the value previously hard-coded).

    Returns:
        None. The generated text is printed incrementally by a
        ``TextStreamer`` (prompt and special tokens are skipped).
    """
    # Put the prompt on whatever device the model lives on instead of
    # hard-coding "cuda:0".
    encoding = tokenizer(input, return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        # Deterministic greedy decoding. This replaces sampling with
        # temperature=1e-6 / top_k=0, which only approximated greedy decoding
        # and divides the logits by a near-zero temperature.
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )


# Training

In [None]:
# Fine-tune the LoRA-wrapped model with the Hugging Face Trainer.
# With a per-device batch of 16 and 4 accumulation steps, each optimizer step
# sees 16 * 4 = 64 sequences per device.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    #eval_dataset=val_dataset,
    args=transformers.TrainingArguments(
        num_train_epochs=10,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        warmup_ratio=0.05,
        # max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        lr_scheduler_type='cosine',
        # Must be the string "none" to disable reporting integrations; the
        # bare name `none` is undefined in Python and raised a NameError.
        report_to="none",
    ),
    # mlm=False selects causal (next-token) language modelling batches.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# Save the LoRA

In [None]:
# Directory that receives the saved model files.
ADAPTER_OUTPUT_DIR = "fine-tuned-llama7b-10epochs"
model.save_pretrained(ADAPTER_OUTPUT_DIR)