In [1]:
%pip install transformers
%pip install datasets
%pip install peft
%pip install bitsandbytes
%pip install sentencepiece

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━

In [2]:
import pandas as pd
import os
import torch
from tqdm import tqdm
from datasets import Dataset
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
from transformers import pipeline
from transformers import BitsAndBytesConfig
import sentencepiece

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import warnings
warnings.filterwarnings("ignore")

Mounted at /content/drive


In [4]:
%cd drive/MyDrive/CS\ 182\ Final\ Project

/content/drive/.shortcut-targets-by-id/1hzhdcGA40OipfzF0SRT7omKvzCSi0q4r/CS 182 Final Project


In [5]:
def read_files(base_path: str, file_names: list):
  dfs = []
  for file in file_names:
    temp_df = pd.read_csv(f"./{base_path}/{file}")
    temp_df["class"] = file.split(".")[0]
    dfs.append(temp_df)
  df = pd.concat(dfs)
  df.columns = ["question", "answer", "class"]
  return df

test_df = read_files("Data Processing/Test data", ["126.csv", "127.csv", "182.csv", "189_fa.csv", "189_sp.csv"])
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['question', 'answer', 'class', '__index_level_0__'],
    num_rows: 409
})

In [5]:
def get_test_ppl(model, tokenizer, dataset, exp_name, base_model=False, device='cuda'):

  nlls = []

  for example in tqdm(dataset):
    q = example['question']
    a = example['answer']

    if not base_model:
      q = '<s>[INST] ' + q + ' [/INST]'
      a = a + ' </s>'

    qa = q + ' ' + a
    q_encoding_length = tokenizer(q, return_tensors="pt").input_ids.shape[1]
    qa_encoding = tokenizer(qa, return_tensors='pt')
    input_ids = qa_encoding.input_ids
    target_ids = input_ids.clone()
    target_ids[:, :q_encoding_length] = -100

    with torch.no_grad():
      output = model(input_ids.to(device), labels=target_ids.to(device))
      nlls.append(output.loss.item())

  df = pd.DataFrame(dataset)
  df['nll'] = nlls
  df['ppl'] = torch.exp(torch.tensor(nlls))
  df = df[['class', 'question', 'answer', 'nll', 'ppl']]
  df.to_csv(f'./Data Processing/Test data/results/{exp_name}.csv', index=False)
  return df

### Experiment-specific arguments

In [10]:
ranks = [4, 8, 32, 64, 128, 256]
alphas = [16, 32, 64, 128, 256]
prefix = "Model/mir/"
runs = {}

for rank in ranks:
  runs[f"llama_r{rank}_a16_final"] = prefix + f"llama_lora_r{rank}_a16/llama_3b_v2_lora_r{rank}_a16"
  runs[f"llama_r{rank}_a16_epoch3"] = prefix + f"llama_lora_r{rank}_a16/results/checkpoint-350"

for alpha in alphas:
  runs[f"llama_r64_a{alpha}_final"] = prefix + f"llama_lora_r64_a{alpha}/llama_3b_v2_lora_r64_a{alpha}"
  runs[f"llama_r64_a{alpha}_epoch3"] = prefix + f"llama_lora_r64_a{alpha}/results/checkpoint-350"

In [6]:
device = "cuda"
base_model = True
base_model_id = "openlm-research/open_llama_3b_v2"

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/512k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
for run in runs:
  model = AutoPeftModelForCausalLM.from_pretrained(runs[run]).to(device)
  model.config.use_cache = False
  model.config.pretraining_tp = 1
  df = get_test_ppl(model, tokenizer, test_dataset, run, base_model, device)
  print(f"{run}: {df['ppl'].mean()}")
  del model

100%|██████████| 409/409 [00:51<00:00,  7.95it/s]


llama_r4_a16_final: 4.0657243728637695


100%|██████████| 409/409 [00:51<00:00,  7.95it/s]


llama_r4_a16_epoch3: 3.9118659496307373


100%|██████████| 409/409 [00:51<00:00,  7.95it/s]


llama_r8_a16_final: 4.068556308746338


100%|██████████| 409/409 [00:51<00:00,  7.94it/s]


llama_r8_a16_epoch3: 3.8862664699554443


100%|██████████| 409/409 [00:51<00:00,  7.92it/s]


llama_r32_a16_final: 4.086728096008301


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r32_a16_epoch3: 3.871002197265625


100%|██████████| 409/409 [00:51<00:00,  7.91it/s]


llama_r64_a16_final: 4.103688716888428


100%|██████████| 409/409 [00:51<00:00,  7.91it/s]


llama_r64_a16_epoch3: 3.870089054107666


100%|██████████| 409/409 [00:52<00:00,  7.86it/s]


llama_r128_a16_final: 4.118011474609375


100%|██████████| 409/409 [00:52<00:00,  7.85it/s]


llama_r128_a16_epoch3: 3.8634374141693115


100%|██████████| 409/409 [00:52<00:00,  7.79it/s]


llama_r256_a16_final: 4.14124059677124


100%|██████████| 409/409 [00:52<00:00,  7.79it/s]


llama_r256_a16_epoch3: 3.8750946521759033


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r64_a32_final: 4.27125883102417


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r64_a32_epoch3: 3.841212272644043


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r64_a64_final: 4.567688941955566


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r64_a64_epoch3: 3.799267053604126


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r64_a128_final: 5.231405735015869


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r64_a128_epoch3: 3.9334795475006104


100%|██████████| 409/409 [00:51<00:00,  7.93it/s]


llama_r64_a256_final: 5.6819376945495605


100%|██████████| 409/409 [00:51<00:00,  7.92it/s]

llama_r64_a256_epoch3: 4.062955856323242





In [13]:
# base model
model = AutoModelForCausalLM.from_pretrained("openlm-research/open_llama_3b_v2").to(device)
model.config.use_cache = False
model.config.pretraining_tp = 1
df = get_test_ppl(model, tokenizer, test_dataset, "llama_base", base_model=True, device=device)
print(f"llama_base: {df['ppl'].mean()}")

100%|██████████| 409/409 [00:50<00:00,  8.10it/s]

llama_base: 4.78560209274292





### Generate responses

In [7]:
humaneval_df = read_files("Data Processing/Human Eval Data", ["126.csv", "127.csv", "182.csv", "189_fa.csv", "189_sp.csv"])
humaneval_df

Unnamed: 0,question,answer,class
0,Explain why entropy is always a non-negative q...,Entropy is non-negative because it is defined ...,126
1,Explain the Gram-Schmidt process and how it is...,The Gram-Schmidt process is an algorithm used ...,126
2,How do you compute the expected time until all...,To compute the expected time until all 20 ligh...,126
3,How is the joint probability mass function (PM...,"For discrete random variables $X$ and $Y$, the...",126
4,Summarize the relationship between reversibili...,"Within the context of Markov chains, a reversi...",126
...,...,...,...
6,What is the relationship between the primal an...,"In Kernel Ridge Regression, the primal weight ...",189_sp
7,Explain the concept of Kernel Perceptrons and ...,"In a Kernel Perceptron algorithm, the weights ...",189_sp
8,Explain the concept of using nonlinearity in n...,Nonlinearity in neural networks is introduced ...,189_sp
9,Explain why the naive computation of gradients...,The naive computation of gradients is ineffici...,189_sp


In [23]:
# fine-tuned model
inputs_tuned = humaneval_df['question'].apply(lambda x: f"<s>[INST] {x} [/INST] ").to_list()
model = AutoPeftModelForCausalLM.from_pretrained("Model/mir/llama_lora_r64_a64/results/checkpoint-350").to(device)
pipe_tuned = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, device=device)
results_tuned = pipe_tuned(inputs_tuned)
results_tuned = [result[0]['generated_text'] for result in results_tuned]
humaneval_df_tuned = humaneval_df.copy()
humaneval_df_tuned['output'] = results_tuned
humaneval_df_tuned.to_csv("Data Processing/Human Eval Data/results/llama_lora_r64_a64.csv", index=False)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

In [24]:
# base model
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

inputs_base = humaneval_df['question'].apply(lambda x: f"Question: {x} Answer: ").to_list()
model = AutoModelForCausalLM.from_pretrained("gpt2-medium").to(device)
pipe_base = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200, device=device)
# results_base = pipe_base(inputs_base)
# results_base = [result[0]['generated_text'] for result in results_base]
# humaneval_df_base = humaneval_df.copy()
# humaneval_df_base['output'] = results_base
# humaneval_df_base.to_csv("Data Processing/Human Eval Data/results/llama_base.csv", index=False)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [30]:
r = pipe_base(["Question: What is the capital of the United States? Answer: "])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [31]:
r[0][0]["generated_text"]

'Question: What is the capital of the United States? Answer: ____.\n\nQuestion: What is the capital of Pennsylvania? Answer: ______.\n\nQuestion: What is the capital of Virginia? Answer: ____.\n\nQuestion: When has a child become a baby? Answer: 1 month, 3 months, 5 months, 8 months, 12 months, 14 months, 17 months, 18 months, _____________ years.\n\nQuestion: Which country is not a nation? Answer: ____.\n\nQuestion: What the dollar value of gold is? Answer: _____ dollars.\n\nQuestion: Are all women entitled to equal protection from sexual harassment? Answer: ______________________________________________________________________.\n\nQuestion: What happened to the name, "Marin County?" Answer: _____________________________________________________________________.\n\nQuestion: What is the total number of miles of the Alaskan Way Viaduct? Answer: 764 miles (or, in English, 8,'