In [1]:
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U accelerate
!pip install -U datasets>=2.16.1
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install -U peft==0.8.2
# !pip install -U trl==0.7.10
!pip install -U wandb==0.16.3
!pip install -U huggingface_hub==0.20.3


Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /scratch/5404940.1.academic-gpu/pip-req-build-b64xtmme
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /scratch/5404940.1.academic-gpu/pip-req-build-b64xtmme


  Resolved https://github.com/huggingface/transformers.git to commit 8e64ba2890bd3231916cddcec77ba6331c306031
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting accelerate
  Using cached accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.28.0
Looking in indexes: https://pypi.org/simple/
Collecting bitsandbytes
  Using cached bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl.metadata (1.8 kB)
Using cached bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (102.2 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.0
Collecting peft==0.8.2
  Using cached peft-0.8.2

In [2]:
!python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"


Configuration already exists at /usr4/cs640/charoori/.cache/huggingface/accelerate/default_config.yaml, will not override. Run `accelerate config` manually or pass a different `save_location`.


In [3]:
import os
# Specify your desired directory for the transformers cache
cache_dir = r"/projectnb/ds598/students/charoori"

# Set the environment variable for this session and for any subprocesses launched from here
os.environ["TRANSFORMERS_CACHE"] = cache_dir

In [43]:
# Code adapted from https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/supervised_finetuning.py
# and https://huggingface.co/blog/gemma-peft
import argparse
import multiprocessing
import os

import torch
import transformers
from transformers import Trainer
from datasets import Dataset
from accelerate import PartialState
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    logging,
    set_seed,
)
# from trl import SFTTrainer


def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_id", type=str, default="bigcode/starcoder2-3b")
    # parser.add_argument("--dataset_name", type=str, default="devesh5/codeconv-fortran-to-rust")
    parser.add_argument("--dataset_name", type=str, default="bigcode/the-stack-smol-xs")
    parser.add_argument("--subset", type=str, default="data/rust")
    parser.add_argument("--split", type=str, default="train")
    parser.add_argument("--dataset_text_field", type=str, default="content")

    parser.add_argument("--max_seq_length", type=int, default=1024)
    parser.add_argument("--max_steps", type=int, default=10)
    parser.add_argument("--micro_batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--bf16", type=bool, default=True)

    parser.add_argument("--attention_dropout", type=float, default=0.1)
    parser.add_argument("--learning_rate", type=float, default=2e-4)
    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
    parser.add_argument("--warmup_steps", type=int, default=100)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--output_dir", type=str, default="finetune_starcoder2")
    parser.add_argument("--num_proc", type=int, default=None)
    parser.add_argument("--push_to_hub", type=bool, default=True)
    return parser.parse_args(args=[])


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def main(args):
    # config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    lora_config = LoraConfig(
        r=8,
        target_modules=[
            "q_proj",
            "o_proj",
            "k_proj",
            "v_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        task_type="CAUSAL_LM",
    )

    # load model and dataset
    token = os.getenv("HF_TOKEN")
    print(args.model_id)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        quantization_config=bnb_config,
        device_map={"": PartialState().process_index},
        attention_dropout=args.attention_dropout,
    )
    print_trainable_parameters(model)

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    data = load_dataset(
        args.dataset_name,
        # data_dir=args.subset,
        split=args.split,
        token=token,
        # num_proc=args.num_proc if args.num_proc else multiprocessing.cpu_count(),
    ) 

    # data = data['content']
    # data = Dataset.from_dict({'content': data})
    # data["content"] = data["content"].shuffle().map(generate_and_tokenize_prompt, batched = False)

    


    # setup the trainer
    trainer = Trainer(
        model=model,
        train_dataset=data,
        # max_seq_length=args.max_seq_length,
        args=transformers.TrainingArguments(
            # per_device_train_batch_size=args.micro_batch_size,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            warmup_steps=args.warmup_steps,
            max_steps=args.max_steps,
            learning_rate=args.learning_rate,
            lr_scheduler_type=args.lr_scheduler_type,
            weight_decay=args.weight_decay,
            fp16 =True,
            logging_strategy="steps",
            logging_steps=1,
            output_dir=args.output_dir,
            optim="paged_adamw_8bit",
            seed=args.seed,
            run_name=f"train-{args.model_id.split('/')[-1]}",
            report_to="wandb",
            # remove_unused_columns=False
        ),
        # peft_config=lora_config,
        # dataset_text_field=args.dataset_text_field,
    )

    # launch
    print("Training...")
    trainer.train()

    print("Saving the last checkpoint of the model")
    model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/"))
    if args.push_to_hub:
        trainer.push_to_hub("Upload model")
    print("Training Done! 💥")


args = get_args()
set_seed(args.seed)
os.makedirs(args.output_dir, exist_ok=True)

logging.set_verbosity_error()

main(args)


bigcode/starcoder2-3b
trainable params: 151369728 || all params: 1591200768 || trainable%: 9.5129245186488
trainable params: 4,546,560 || all params: 3,034,917,888 || trainable%: 0.14980833642903488


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training...


IndexError: Invalid key: 44 is out of bounds for size 0

In [29]:

token = os.getenv("HF_TOKEN")
data = load_dataset(
    args.dataset_name,
    # data_dir=args.subset,
    split=args.split,
    token=token,
    num_proc=args.num_proc if args.num_proc else multiprocessing.cpu_count(),
) 
data_new = data.map(lambda example: {'content': example['content']})
data_new = data['content']


In [30]:
data_new

['Program to be Translated :\nprogram a\n\nreal :: m1, d1, m2, d2\n\n! M1, D1を入力\n\tread *, m1, d1\n    read *, m2, d2\n    \n    if (m1 /= m2) then\n    \tprint *, \'1\'\n    else\n    \tprint *, \'0\'\n    end if\nend program a\nProgram Explanation :\n\n\nThe provided Fortran code snippet is a simple program that compares two pairs of real numbers and prints a result based on their equality.\n\nThe code begins with the declaration of four real variables: `m1`, `d1`, `m2`, and `d2`. These variables are used to store the values entered by the user.\n\nThe program then reads two pairs of real numbers from the user, storing them in the variables `m1` and `d1` for the first pair and `m2` and `d2` for the second pair.\n\nAfter reading the input, the program checks if the values of `m1` and `m2` are equal. If they are not equal, it prints the value `1` to the console. If they are equal, it prints the value `0` to the console.\n\nThe program ends with the `end program a` statement, indicatin

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-3b")
model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder2-3b")
device = "cuda"

inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))