In [1]:
!pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
import pandas as pd
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login

from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

## Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Prepare Dataset
df = pd.read_csv("/content/drive/MyDrive/CS8803-Bonus/JEOPARDY_CSV.csv", nrows=1000)
df.columns = [str(q).strip() for q in df.columns]

data = Dataset.from_pandas(df)

In [4]:
data

Dataset({
    features: ['Show Number', 'Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer'],
    num_rows: 1000
})

In [5]:
df["Question"].values[0:5]
data[2]

{'Show Number': 4680,
 'Air Date': '2004-12-31',
 'Round': 'Jeopardy!',
 'Category': 'EVERYBODY TALKS ABOUT IT...',
 'Value': '$200',
 'Question': 'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year',
 'Answer': 'Arizona'}

In [6]:
# Prepare Prompt
prompt = df["Question"].values[0] + ". Answer as briefly as possible: ".strip()
prompt

"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory. Answer as briefly as possible:"

## Model

In [21]:
# Prepare Model
base_model_name = "NousResearch/Llama-2-7b-chat-hf" #"/kaggle/input/llama-2/pytorch/13b-chat-hf/1"

#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

In [22]:
def generate_prompt(data_point):
    return f"""
            Answer the question:
            {data_point["Question"]}.
            {data_point["Answer"]}
            """.strip()


def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt

data = data.shuffle().map(generate_and_tokenize_prompt)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [23]:
# Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#Model - load pretrain checkpoint
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
#prepare model for training
model = prepare_model_for_kbit_training(model)

In [25]:
#Helper fcn for LoRA
import re
def get_num_layers(model):
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []

    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and not "encoder" in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names


In [26]:
#Lora config
config = LoraConfig(
    r=2,
    lora_alpha=32,
    target_modules=get_last_layer_linears(model),
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [None]:
'''# Update training arguments
training_args = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    #save_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    logging_dir='/content/drive/MyDrive/CS8803-Bonus/log',  # Directory for storing logs
    logging_steps=10,  # Log every 10 steps
    logging_strategy=IntervalStrategy.STEPS
)
'''

In [13]:

# ft-training
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling, IntervalStrategy, TrainerCallback
import logging

# Custom logging callback class
class CustomLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        logger = logging.getLogger(__name__)
        if logs is not None:
            # Log training loss and other information
            logger.info(f"Step: {state.global_step}, Training Loss: {logs.get('loss', 'N/A')}")
            logger.info(f"Model Hyperparameters: {args}")
            logger.info("Your Name & GTID: Fangtingyu Hu, 903751550")

# Update training arguments
training_args = TrainingArguments(
    # ... (other arguments remain unchanged)
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=5e-5,
    fp16=True,
    output_dir="finetune_jeopardy",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    report_to="none",
    logging_dir='/content/drive/MyDrive/CS8803-Bonus/logs',  # Directory for storing logs
    logging_steps=10,  # Log every 10 steps
    logging_strategy=IntervalStrategy.STEPS
)

# Set up logging
logging.basicConfig(level=logging.INFO)

# Trainer setup
trainer = Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[CustomLoggingCallback()]
)

# Disable caching for the model
model.config.use_cache = False

# Begin training
trainer.train()

#save finetuning model
model.save_pretrained("/content/drive/MyDrive/CS8803-Bonus/ft-model")


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,4.2066
20,3.9663
30,3.4701
40,3.1682
50,3.1467
60,2.6984
70,2.6658
80,2.7286
90,2.5919
100,2.6578


In [14]:
#save finetuning model
model.save_pretrained("/content/drive/MyDrive/CS8803-Bonus/ft-model")

##Save LoRA weights and Load saved weight

In [None]:
#Function to save LoRA weights
def save_lora_weights(model, save_dir):
    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Iterate over model parameters or layers
    for name, param in model.named_parameters():
        # Identify LoRA layers by name or some condition
        if 'lora' in name:
            weight_path = os.path.join(save_dir, f"{name}_weight.pth")
            torch.save(param.data, weight_path)

# Call the function to save LoRA weights
save_dir = "/content/drive/MyDrive/CS8803-Bonus/lora_weights"
save_lora_weights(model, save_dir)

In [None]:
#Implement LoRA weights
import torch
import os

def load_lora_weights(model, load_dir):
    # Iterate over model parameters or layers
    for name, param in model.named_parameters():
        # Check if the parameter is a LoRA layer
        if 'lora' in name:
            weight_path = os.path.join(load_dir, f"{name}_weight.pth")
            if os.path.exists(weight_path):
                # Load the saved weight
                saved_weight = torch.load(weight_path)
                # Ensure the loaded weight is compatible in size
                if saved_weight.shape == param.data.shape:
                    param.data = saved_weight
                else:
                    print(f"Shape mismatch for {name}, cannot load weight.")
            else:
                print(f"No saved weight found for {name} at {weight_path}")

# Path to the directory where LoRA weights are saved
load_dir = "/content/drive/MyDrive/CS8803-Bonus/lora_weights"

# Load the base model as per your original code
#Model - load pretrain checkpoint
model = AutoModelForCausalLM.from_pretrained(
    base_model_name
    #device_map="auto",
    #trust_remote_code=True,
)

# Load the LoRA weights into the model
load_lora_weights(model, load_dir)

# Now the model is ready with LoRA weights for inference or further training



## Loading and using ft model

In [None]:
PEFT_MODEL = "/content/drive/MyDrive/CS8803-Bonus/ft-model"

In [None]:
# Quantization Config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
#Load saved ft model

PEFT_MODEL = "/content/drive/MyDrive/CS8803-Bonus/ft-model"

config = PeftConfig.from_pretrained(PEFT_MODEL)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, PEFT_MODEL)


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
#torch.save(model.lora_parameters, 'lora_weights.pth')

## Generate text

In [None]:
#Generate config
generation_config = model.generation_config
generation_config.max_new_tokens = 8 #10
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
%%time

#prompt = "In which sport are barani, rudolph, and randolph all techniques?. Answer as briefly as possible: ".strip()
prompt = "Who is the president of United States? Answer as briefly as possible:".strip()
device = "cuda"
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Who is the president of United States? Answer as briefly as possible:
            Joe Biden.

CPU times: user 3.14 s, sys: 727 ms, total: 3.87 s
Wall time: 4.73 s


In [None]:
#Define model inference function
#input: test.txt
#output: test-output.txt

def model_inference(input_file, output_file, model, tokenizer, device):
    with open(input_file, 'r') as file:
        lines = file.readlines()

    responses = []
    for line in lines:
        print('line',line)
        prompt = line + "Answer as briefly as possible:"
        prompt = prompt.strip()
        print('prompt',prompt)
        encoding = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.inference_mode():
            outputs = model.generate(
                input_ids=encoding.input_ids,
                attention_mask=encoding.attention_mask,
                generation_config = generation_config
                # Add any additional parameters needed for your model here
            )
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print('full_response',full_response)
        # Split the response to isolate the part after your prompt
        split_response = full_response.split("Answer as briefly as possible: ")
        answer = split_response[1].split('.')[0].strip()
        print('answer',answer)
        responses.append(answer)

    print(responses)
    with open(output_file, 'w') as file:
        for response in responses:
          #print(response)

          file.write(response + '\n')

# Usage
# Assuming the model and tokenizer are already defined and configured
# and 'cuda' or 'cpu' is set appropriately for your setup
model_inference('/content/drive/MyDrive/CS8803-Bonus/test.txt', '/content/drive/MyDrive/CS8803-Bonus/test-output.txt', model, tokenizer, 'cuda')


line Who is the president of United States?

prompt Who is the president of United States?
Answer as briefly as possible:
full_response Who is the president of United States?
Answer as briefly as possible: Joe Biden.
Joe B
answer Joe Biden
line Which city is the capital of PRC?

prompt Which city is the capital of PRC?
Answer as briefly as possible:
full_response Which city is the capital of PRC?
Answer as briefly as possible: Beijing.
2. The
answer Beijing
line 1+1=?

prompt 1+1=?
Answer as briefly as possible:
full_response 1+1=?
Answer as briefly as possible: 2.

2. 
answer 2
['Joe Biden', 'Beijing', '2']
