## Code - Kotlin Code: Prompt Tuning with StarCoder

Dataset Source: https://huggingface.co/datasets/codkiller0911/kotlin_code

#### Install Necessary Libraries

In [1]:
%pip install peft transformers datasets bitsandbytes
%pip install accelerate -U

Collecting peft
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from peft)
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m22.2 MB/s[0m

#### Enter HuggingFace Access Token

In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


#### Import Necessary Libraries

In [3]:
import os, sys
os.environ['TOKENIZERS_PARALLELISM']='false'

import torch
from torch.utils.data import DataLoader
from torch import nn
torch.cuda.empty_cache()

from tqdm import tqdm

import datasets
from datasets import load_dataset, DatasetDict

import gc
gc.collect()

import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup
)

import peft
from peft import (
    get_peft_config,
    get_peft_model,
    PromptTuningInit,
    PromptTuningConfig,
    TaskType,
    PeftType
)

import bitsandbytes as bnb

!git lfs install

Git LFS initialized.


#### Display Library Versions

In [4]:
library_len = 14
version_len = 12

print(f"+{'-' * (library_len + version_len + 5)}+")
print("|",
      "Library".rjust(library_len),
      "|",
      "Version".ljust(version_len),
      "|")

print(f"|{'*' * (library_len + version_len + 5)}|")

print("|",
      "Python".rjust(library_len),
      "|",
      sys.version[0:6].ljust(version_len),
      "|")

print("|",
      "Torch".rjust(library_len),
      "|",
      torch.__version__.ljust(version_len),
      "|")

print("|",
      "Datasets".rjust(library_len),
      "|",
      datasets.__version__.ljust(version_len),
      "|")

print("|",
      "Transformer".rjust(library_len),
      "|",
      transformers.__version__.ljust(version_len),
      "|")

print("|",
      "PEFT".rjust(library_len),
      "|",
      peft.__version__.ljust(version_len),
      "|")

print(f"+{'-' * (library_len + version_len + 5)}+")

+-------------------------------+
|        Library | Version      |
|*******************************|
|         Python | 3.10.1       |
|          Torch | 2.0.1+cu118  |
|       Datasets | 2.14.4       |
|    Transformer | 4.31.0       |
|           PEFT | 0.4.0        |
+-------------------------------+


#### Basic Values/Constants

In [5]:
DATASET_NAME = "codkiller0911/kotlin_code"
MODEL_CKPT = "bigcode/starcoderbase-1b"

MODEL_NAME = f"{MODEL_CKPT.split('/')[-1]}-Prompt_Tuned-{DATASET_NAME.split('/')[-1]}"
LR = 3e-2

MAX_LENGTH = 120
NUM_OF_EPOCHS = 28

TEXT_COLUMN = "Code_Function"
LABEL_COLUMN = "Message"

BATCH_SIZE = 8
DEVICE = torch.device("cuda")

#### Define Peft Configuration

In [6]:
peft_configuration = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Generate Document Strings to Summarize what the Code Function does:",
    tokenizer_name_or_path=MODEL_CKPT,
)

#### Load Dataset

In [7]:
data = load_dataset(DATASET_NAME)

print(data)

print(data['train'][12])

Downloading readme:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/757k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/210k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Code_Function', 'Message'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['Code_Function', 'Message'],
        num_rows: 500
    })
})
{'Code_Function': 'fun Yaml( constructor: BaseConstructor, representer: Representer, dumperOptions: DumperOptions, resolver: Resolver ) { if (!constructor.isExplicitPropertyUtils()) { constructor.setPropertyUtils(representer.getPropertyUtils()) } else if (!representer.isExplicitPropertyUtils()) { representer.setPropertyUtils(constructor.getPropertyUtils()) } constructor = constructor representer.setDefaultFlowStyle(dumperOptions.getDefaultFlowStyle()) representer.setDefaultScalarStyle(dumperOptions.getDefaultScalarStyle()) representer.getPropertyUtils().setAllowReadOnlyProperties(dumperOptions.isAllowReadOnlyProperties()) representer.setTimeZone(dumperOptions.getTimeZone()) representer = representer dumperOptions = dumperOptions resolver = resolver this.name = "Yaml:" + System

#### Determine Maximum Length of Tokenized Labels

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

target_max_length = max([len(tokenizer(label)["input_ids"])
                         for label in data['train']['Message']])

print(f"The maximum tokenized response length is {target_max_length}")

Downloading (…)okenizer_config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/532 [00:00<?, ?B/s]

The maximum tokenized response length is 391


#### Remove Overly Length Samples

In [9]:
max_value = 121

def reduce_max_length(dataset, max_tokenized_length):
    return dataset.select(
        (
            sample for sample in range(len(dataset))
            if len(tokenizer(dataset[sample]['Message'])['input_ids']) < max_tokenized_length
        )
    )

data['train'] = reduce_max_length(data['train'], max_value)
data['test'] = reduce_max_length(data['test'], max_value)

data.shape



{'train': (1584, 2), 'test': (498, 2)}

#### Create Function to Preprocess Dataset

It will:
- tokenize the entire dataset
- for each example in a batch, pad the labels with the tokenizers pad_token_id
- concatenate the input text & labels to form model_inputs
- create separate attention mask for labels & model_inputs
- loop through each example in the batch again to pad the input_ids, labels, and attention_mask to the max_length & convert them to PyTorch tensors.

In [10]:
def function_to_preprocess_data(examples):
    batch_size = len(examples[TEXT_COLUMN])
    inputs = [f"{TEXT_COLUMN} : {x} Label : " for x in examples[TEXT_COLUMN]]
    targets = [str(x) for x in examples[LABEL_COLUMN]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]

        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids

        model_inputs["attention_mask"][i] = [1] * len(
            model_inputs["input_ids"][i]
            )

    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            MAX_LENGTH - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (
            MAX_LENGTH - len(sample_input_ids)
            ) + model_inputs["attention_mask"][i]

        labels["input_ids"][i] = [-100] * (
            MAX_LENGTH - len(sample_input_ids)
            ) + label_input_ids

        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][:MAX_LENGTH]
            )

        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][:MAX_LENGTH]
            )

        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:MAX_LENGTH])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

#### Map Preprocessing Function to Entire Dataset

In [11]:
encoded_data = data.map(
    function_to_preprocess_data,
    batched=True,
    num_proc=1,
    remove_columns=data["train"].column_names,
    load_from_cache_file=False,
    desc="Tokenizing Dataset",
)

del data

print("Training Dataset Shape:", encoded_data['train'].shape)
print("Evaluation Dataset Shape:", encoded_data['test'].shape)

Tokenizing Dataset:   0%|          | 0/1584 [00:00<?, ? examples/s]

Tokenizing Dataset:   0%|          | 0/498 [00:00<?, ? examples/s]

Training Dataset Shape: (1584, 3)
Evaluation Dataset Shape: (498, 3)


#### Create DataLoaders for Both Training & Evaluation Datasets

In [12]:
train_ds = encoded_data['train']
eval_ds = encoded_data['test']

del encoded_data

train_dataloader = DataLoader(train_ds,
                              shuffle=True,
                              collate_fn=default_data_collator,
                              batch_size=BATCH_SIZE,
                              pin_memory=True,
                              )

eval_dataloader = DataLoader(eval_ds,
                             shuffle=True,
                             collate_fn=default_data_collator,
                             batch_size=BATCH_SIZE,
                             pin_memory=True,
                             )

#### Define Model

In [13]:
model = AutoModelForCausalLM.from_pretrained(MODEL_CKPT,
                                             load_in_8bit=True,
                                             device_map="auto")
model = get_peft_model(model,
                       peft_configuration)

for param in model.parameters():
    param.requires_grad = False
    if param.ndim == 1:
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float16)

model.lm_head = CastOutputToFloat(model.lm_head)

print("Model Trainable Parameters: ")
print(model.print_trainable_parameters())

print("Model Memory Footprint: ")
print(model.get_memory_footprint())

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Some weights of GPTBigCodeForCausalLM were not initialized from the model checkpoint at bigcode/starcoderbase-1b and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Model Trainable Parameters: 
trainable params: 0 || all params: 1,137,223,680 || trainable%: 0.0
None
Model Memory Footprint: 
1323409408


#### Define Optimizer & Learning Rate Scheduler

In [14]:
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=LR)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * NUM_OF_EPOCHS),
    )

#### Define Training Loop

In [15]:
model.to(DEVICE)

with torch.autocast("cuda"):
    for epoch in range(NUM_OF_EPOCHS):
        model.train()
        total_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(DEVICE) for k,v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(
                    torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
                    skip_special_tokens=True)
                )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=}{train_epoch_loss=}{eval_ppl=}{eval_epoch_loss=}")

100%|██████████| 198/198 [02:30<00:00,  1.31it/s]
100%|██████████| 63/63 [00:14<00:00,  4.35it/s]


epoch=0: train_ppl=tensor(47.1636, device='cuda:0')train_epoch_loss=tensor(3.8536, device='cuda:0')eval_ppl=tensor(35.1414, device='cuda:0')eval_epoch_loss=tensor(3.5594, device='cuda:0')


100%|██████████| 198/198 [02:36<00:00,  1.27it/s]
100%|██████████| 63/63 [00:14<00:00,  4.32it/s]


epoch=1: train_ppl=tensor(48.1105, device='cuda:0')train_epoch_loss=tensor(3.8735, device='cuda:0')eval_ppl=tensor(35.5366, device='cuda:0')eval_epoch_loss=tensor(3.5706, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.33it/s]


epoch=2: train_ppl=tensor(46.8669, device='cuda:0')train_epoch_loss=tensor(3.8473, device='cuda:0')eval_ppl=tensor(34.4511, device='cuda:0')eval_epoch_loss=tensor(3.5395, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.33it/s]


epoch=3: train_ppl=tensor(47.3842, device='cuda:0')train_epoch_loss=tensor(3.8583, device='cuda:0')eval_ppl=tensor(34.0481, device='cuda:0')eval_epoch_loss=tensor(3.5278, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.37it/s]


epoch=4: train_ppl=tensor(48.6001, device='cuda:0')train_epoch_loss=tensor(3.8836, device='cuda:0')eval_ppl=tensor(35.8359, device='cuda:0')eval_epoch_loss=tensor(3.5790, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.33it/s]


epoch=5: train_ppl=tensor(47.7137, device='cuda:0')train_epoch_loss=tensor(3.8652, device='cuda:0')eval_ppl=tensor(34.5115, device='cuda:0')eval_epoch_loss=tensor(3.5413, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.39it/s]


epoch=6: train_ppl=tensor(46.9919, device='cuda:0')train_epoch_loss=tensor(3.8500, device='cuda:0')eval_ppl=tensor(34.5057, device='cuda:0')eval_epoch_loss=tensor(3.5411, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.28it/s]


epoch=7: train_ppl=tensor(47.6653, device='cuda:0')train_epoch_loss=tensor(3.8642, device='cuda:0')eval_ppl=tensor(35.7987, device='cuda:0')eval_epoch_loss=tensor(3.5779, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.34it/s]
100%|██████████| 63/63 [00:15<00:00,  4.03it/s]


epoch=8: train_ppl=tensor(46.9859, device='cuda:0')train_epoch_loss=tensor(3.8498, device='cuda:0')eval_ppl=tensor(34.4982, device='cuda:0')eval_epoch_loss=tensor(3.5409, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.29it/s]


epoch=9: train_ppl=tensor(46.6664, device='cuda:0')train_epoch_loss=tensor(3.8430, device='cuda:0')eval_ppl=tensor(nan, device='cuda:0')eval_epoch_loss=tensor(nan, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.26it/s]


epoch=10: train_ppl=tensor(46.5848, device='cuda:0')train_epoch_loss=tensor(3.8413, device='cuda:0')eval_ppl=tensor(35.5663, device='cuda:0')eval_epoch_loss=tensor(3.5714, device='cuda:0')


100%|██████████| 198/198 [02:29<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.27it/s]


epoch=11: train_ppl=tensor(47.8784, device='cuda:0')train_epoch_loss=tensor(3.8687, device='cuda:0')eval_ppl=tensor(34.2963, device='cuda:0')eval_epoch_loss=tensor(3.5350, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.29it/s]


epoch=12: train_ppl=tensor(47.9711, device='cuda:0')train_epoch_loss=tensor(3.8706, device='cuda:0')eval_ppl=tensor(34.7372, device='cuda:0')eval_epoch_loss=tensor(3.5478, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.37it/s]


epoch=13: train_ppl=tensor(46.8871, device='cuda:0')train_epoch_loss=tensor(3.8477, device='cuda:0')eval_ppl=tensor(35.1089, device='cuda:0')eval_epoch_loss=tensor(3.5585, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.35it/s]


epoch=14: train_ppl=tensor(48.7484, device='cuda:0')train_epoch_loss=tensor(3.8867, device='cuda:0')eval_ppl=tensor(nan, device='cuda:0')eval_epoch_loss=tensor(nan, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.36it/s]


epoch=15: train_ppl=tensor(47.2218, device='cuda:0')train_epoch_loss=tensor(3.8549, device='cuda:0')eval_ppl=tensor(36.4986, device='cuda:0')eval_epoch_loss=tensor(3.5973, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.36it/s]


epoch=16: train_ppl=tensor(46.3828, device='cuda:0')train_epoch_loss=tensor(3.8369, device='cuda:0')eval_ppl=tensor(35.7737, device='cuda:0')eval_epoch_loss=tensor(3.5772, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.31it/s]


epoch=17: train_ppl=tensor(47.2795, device='cuda:0')train_epoch_loss=tensor(3.8561, device='cuda:0')eval_ppl=tensor(35.3809, device='cuda:0')eval_epoch_loss=tensor(3.5662, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.30it/s]


epoch=18: train_ppl=tensor(47.8024, device='cuda:0')train_epoch_loss=tensor(3.8671, device='cuda:0')eval_ppl=tensor(nan, device='cuda:0')eval_epoch_loss=tensor(nan, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.33it/s]
100%|██████████| 63/63 [00:14<00:00,  4.34it/s]


epoch=19: train_ppl=tensor(48.3327, device='cuda:0')train_epoch_loss=tensor(3.8781, device='cuda:0')eval_ppl=tensor(37.0658, device='cuda:0')eval_epoch_loss=tensor(3.6127, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.38it/s]


epoch=20: train_ppl=tensor(48.2030, device='cuda:0')train_epoch_loss=tensor(3.8754, device='cuda:0')eval_ppl=tensor(36.1963, device='cuda:0')eval_epoch_loss=tensor(3.5890, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.31it/s]


epoch=21: train_ppl=tensor(47.4127, device='cuda:0')train_epoch_loss=tensor(3.8589, device='cuda:0')eval_ppl=tensor(34.6317, device='cuda:0')eval_epoch_loss=tensor(3.5448, device='cuda:0')


100%|██████████| 198/198 [02:28<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.35it/s]


epoch=22: train_ppl=tensor(48.4586, device='cuda:0')train_epoch_loss=tensor(3.8807, device='cuda:0')eval_ppl=tensor(34.5292, device='cuda:0')eval_epoch_loss=tensor(3.5418, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.40it/s]


epoch=23: train_ppl=tensor(46.5764, device='cuda:0')train_epoch_loss=tensor(3.8411, device='cuda:0')eval_ppl=tensor(35.0186, device='cuda:0')eval_epoch_loss=tensor(3.5559, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.33it/s]


epoch=24: train_ppl=tensor(47.5066, device='cuda:0')train_epoch_loss=tensor(3.8609, device='cuda:0')eval_ppl=tensor(35.3040, device='cuda:0')eval_epoch_loss=tensor(3.5640, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.35it/s]
100%|██████████| 63/63 [00:14<00:00,  4.39it/s]


epoch=25: train_ppl=tensor(47.6764, device='cuda:0')train_epoch_loss=tensor(3.8644, device='cuda:0')eval_ppl=tensor(34.9161, device='cuda:0')eval_epoch_loss=tensor(3.5529, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.39it/s]


epoch=26: train_ppl=tensor(47.0149, device='cuda:0')train_epoch_loss=tensor(3.8505, device='cuda:0')eval_ppl=tensor(35.2436, device='cuda:0')eval_epoch_loss=tensor(3.5623, device='cuda:0')


100%|██████████| 198/198 [02:27<00:00,  1.34it/s]
100%|██████████| 63/63 [00:14<00:00,  4.35it/s]

epoch=27: train_ppl=tensor(47.6852, device='cuda:0')train_epoch_loss=tensor(3.8646, device='cuda:0')eval_ppl=tensor(34.6201, device='cuda:0')eval_epoch_loss=tensor(3.5444, device='cuda:0')





#### Push Model to Hub

In [17]:
model.push_to_hub(MODEL_NAME)

adapter_model.bin:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DunnBC22/starcoderbase-1b-Prompt_Tuned-kotlin_code/commit/a07b86595771eb17b9051d27371bb63200be3f44', commit_message='Upload model', commit_description='', oid='a07b86595771eb17b9051d27371bb63200be3f44', pr_url=None, pr_revision=None, pr_num=None)

### Notes & Other Takeaways From This Project

****
- The results are not what I wanted, but I will improve.

****

### Citations

- Model Checkpoint
    > @article{li2023starcoder, title={StarCoder: may the source be with you!}, author={Raymond Li and Loubna Ben Allal and Yangtian Zi and Niklas Muennighoff and Denis Kocetkov and Chenghao Mou and Marc Marone and Christopher Akiki and Jia Li and Jenny Chim and Qian Liu and Evgenii Zheltonozhskii and Terry Yue Zhuo and Thomas Wang and Olivier Dehaene and Mishig Davaadorj and Joel Lamy-Poirier and João Monteiro and Oleh Shliazhko and Nicolas Gontier and Nicholas Meade and Armel Zebaze and Ming-Ho Yee and Logesh Kumar Umapathi and Jian Zhu and Benjamin Lipkin and Muhtasham Oblokulov and Zhiruo Wang and Rudra Murthy and Jason Stillerman and Siva Sankalp Patel and Dmitry Abulkhanov and Marco Zocca and Manan Dey and Zhihan Zhang and Nour Fahmy and Urvashi Bhattacharyya and Wenhao Yu and Swayam Singh and Sasha Luccioni and Paulo Villegas and Maxim Kunakov and Fedor Zhdanov and Manuel Romero and Tony Lee and Nadav Timor and Jennifer Ding and Claire Schlesinger and Hailey Schoelkopf and Jan Ebert and Tri Dao and Mayank Mishra and Alex Gu and Jennifer Robinson and Carolyn Jane Anderson and Brendan Dolan-Gavitt and Danish Contractor and Siva Reddy and Daniel Fried and Dzmitry Bahdanau and Yacine Jernite and Carlos Muñoz Ferrandis and Sean Hughes and Thomas Wolf and Arjun Guha and Leandro von Werra and Harm de Vries}, year={2023}, eprint={2305.06161}, archivePrefix={arXiv}, primaryClass={cs.CL}}