# Fine-tune Llama 2


### Installing and Importing Libraries

In [None]:
%%capture
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset, random_split, Dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and output paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Show the current working directory of the kernel.
os.getcwd()

'/content'

### Loading the Data

In [None]:
# Read both splits from Drive; only the first 1000 training rows are loaded.
train_df = pd.read_csv(
    '/content/drive/MyDrive/Fine tuning Llama/train_data.csv',
    nrows=1000,
)
test_df = pd.read_csv(
    '/content/drive/MyDrive/Fine tuning Llama/test_data.csv',
)

### Configurations for the Model

In [None]:
# Hugging Face Hub id of the base model to download and fine-tune
model_name = "togethercomputer/Llama-2-7B-32K-Instruct"

# Example instruction dataset id (disabled: the training data is read from CSV instead)
#dataset_name = "mlabonne/guanaco-llama2-1k"

# Directory in which the fine-tuned model is stored
new_model = "/content/drive/MyDrive/Fine tuning Llama"

In [None]:
### QLoRA parameters
# LoRA attention dimension (passed to LoraConfig as `r`)
lora_r = 64

# Alpha parameter for LoRA scaling (passed to LoraConfig as `lora_alpha`)
lora_alpha = 16

# Dropout probability for LoRA layers (passed to LoraConfig as `lora_dropout`)
lora_dropout = 0.1


In [None]:
### bitsandbytes parameters
# Load the base model with 8-bit quantized weights
use_8bit = True

# Compute dtype, given as the name of a torch dtype attribute (e.g. "float16" -> torch.float16)
bnb_8bit_compute_dtype = "float16"

# Quantization type
# NOTE(review): "nf4" is a 4-bit data type; presumably it has no effect while loading in 8-bit — confirm
bnb_4bit_quant_type = "nf4"

# Toggle for nested (double) quantization
use_nested_quant = False

In [None]:
### TrainingArguments parameters
# Output directory where the model predictions and checkpoints will be stored
output_dir = "/content/drive/MyDrive/Fine tuning Llama"

# Number of training epochs
num_train_epochs = 2

# Enable fp16/bf16 mixed-precision training (both disabled here)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing (for reducing the memory footprint)
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use (helps in efficient memory management between CPU and GPU)
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs); -1 leaves the epoch count in charge
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
# NOTE(review): 0 presumably disables periodic checkpointing — confirm, because the
# TrainingArguments cell also requests load_best_model_at_end=True
save_steps = 0

# Log every X update steps
logging_steps = 25

In [None]:
### SFT parameters
# Maximum sequence length to use (None leaves the choice to SFTTrainer's default)
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Device placement passed to from_pretrained; "auto" delegates placement to the loader
# (it does not pin the whole model to GPU 0)
device_map = "auto"

In [None]:
# Preview the first training rows; the prompt text used for fine-tuning is in the `data` column.
train_df.head()

Unnamed: 0.1,Unnamed: 0,data
0,0,[INST]The output should be the aspects (both i...
1,1,[INST]The output should be the aspects (both i...
2,2,[INST]The output should be the aspects (both i...
3,3,[INST]The output should be the aspects (both i...
4,4,[INST]The output should be the aspects (both i...


### Loading Model and Tokenizer

In [None]:
# Load dataset (you can process it here)
#dataset = load_dataset(dataset_name, split="train")

# Resolve the configured dtype name (e.g. "float16") to the actual torch dtype.
# The bitsandbytes config cell defines `bnb_8bit_compute_dtype`; referring to the
# undefined `bnb_4bit_compute_dtype` here raised NameError on a fresh kernel.
compute_dtype = getattr(torch, bnb_8bit_compute_dtype)

# BitsAndBytesConfig names its quant-type / compute-dtype / double-quant options
# `bnb_4bit_*`; `bnb_8bit_*` keywords are not recognised options. With 8-bit loading
# these three settings are carried in the config but only matter for 4-bit loading.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=use_8bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16 (compute capability >= 8) and hint at bf16=True
if compute_dtype == torch.float16 and use_8bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the quantized base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# The KV cache is not needed during training
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer; the EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

In [None]:

# Set training parameters
# Every value comes from the "TrainingArguments parameters" cell. The evaluation batch
# size and the gradient-checkpointing switch are configured there as well, so they are
# forwarded here instead of silently falling back to the library defaults.
training_arguments = TrainingArguments(
    do_train=True,
    do_eval=True,
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    load_best_model_at_end=True,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    # Save and evaluation strategies must match for load_best_model_at_end
    save_strategy='steps',
    evaluation_strategy='steps',
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [None]:
# Display the module tree of the loaded base model.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaLinearScalingRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNo

In [None]:
from datasets import Dataset

In [None]:
# Wrap the train / test DataFrames as Hugging Face `Dataset` objects for the trainer.
train_ds, test_ds = (Dataset.from_pandas(frame) for frame in (train_df, test_df))

In [None]:
from transformers import EarlyStoppingCallback

In [None]:
# Supervised fine-tuning trainer: trains on the `data` text column with the LoRA
# configuration, evaluates on the test split, and stops early after 2 evaluations
# without improvement.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    dataset_text_field="data",
    max_seq_length=max_seq_length,
    packing=packing,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [None]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

# Save trained model (disabled here; saving is done in the next cell)
#trainer.model.save_pretrained(new_model)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
25,2.9079,2.769474
50,2.5695,2.292127
75,2.0215,1.598883
100,1.1285,0.775409
125,0.6735,0.41632
150,0.2622,0.374963
175,0.5491,0.357123
200,0.2606,0.353067
225,0.4978,0.347144
250,0.2202,0.347342


TrainOutput(global_step=500, training_loss=0.7359602270126343, metrics={'train_runtime': 4994.6182, 'train_samples_per_second': 0.4, 'train_steps_per_second': 0.1, 'total_flos': 1.2444192907001856e+16, 'train_loss': 0.7359602270126343, 'epoch': 2.0})

In [None]:
## Saving the fine tuned Model
# Write the trainer's model to the `new_model` directory.
# NOTE(review): the trainer was built with a peft_config, so presumably only the LoRA
# adapter weights are written here — confirm.
trainer.model.save_pretrained(new_model)

### Reloading the Fine-tuned Model

In [None]:
# Load the base checkpoint again in FP16 on GPU 0, attach the adapter saved in
# `new_model`, and fold the LoRA weights into the base weights.
# NOTE(review): merge_and_unload() may modify `base_model`'s own modules in place; if so,
# `base_model` and `peft_model` share weights afterwards — confirm before comparing them.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)
peft_model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer for the merged model, padded on the right with the EOS token
peft_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
peft_tokenizer.padding_side = "right"
peft_tokenizer.pad_token = peft_tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Model Performance Evaluation

In [None]:
# Split every test example into its prompt (up to and including '[/INST]') and its
# gold answer (the text after '[/INST]\n', cut at the next '[').
instructions = [text.split('[/INST]')[0] + '[/INST]' for text in test_df['data']]
sols = [text.split('[/INST]\n')[1].split('[')[0] for text in test_df['data']]

In [None]:
import torch

# Batched generation with a decoder-only model needs left padding; with right padding
# the shorter prompts in a batch are continued after their pad tokens.
peft_tokenizer.padding_side = "left"

# Budget for generated tokens only. A total `max_length` also counts the prompt, so
# prompts at or above that length left no room for an answer.
MAX_NEW_TOKENS = 64

pipe = pipeline(task="text-generation", model=peft_model, tokenizer=peft_tokenizer, max_new_tokens=MAX_NEW_TOKENS, device=0)

def generate_text_batched(instructions, batch_size=4):
    """Generate one completion per prompt, running the pipeline in batches.

    Args:
        instructions: sequence of prompt strings.
        batch_size: maximum number of prompts per pipeline call (default 4).

    Returns:
        List with the `generated_text` of the first candidate for each prompt, in
        input order. An empty input yields an empty list.
    """
    if not instructions:
        return []  # range() below would otherwise be given a step of 0
    batch_size = max(1, min(len(instructions), batch_size))

    response = []
    with torch.no_grad():  # Disable gradient calculation for inference
        for start in range(0, len(instructions), batch_size):
            batch = [f"{p}" for p in instructions[start:start + batch_size]]
            results = pipe(batch, batch_size=batch_size)  # Leverage batching
            response.extend(result[0]['generated_text'] for result in results)
    return response


response = generate_text_batched(instructions)

# Access response as needed

# Access response as needed


In [None]:
# Inspect the first generated text (prompt plus continuation).
response[0]

'[INST]The output should be the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.\nPositive example -\ninput: I charge it at night and skip taking the cord with me because of the good battery life.\noutput: battery life:positive, \nNeutral example -\ninput: Nightly my computer defrags itself and runs a virus scan.\noutput: virus scan:neutral\nNow complete the following example-\ninput:Boot time is super fast, around anywhere from 35 seconds to 1 minute.[/INST]output: boot time:positive[INST]The output should be the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.\nPositive example -\ninput: I charge it'

In [None]:
# Persist the generated texts, the prompts and the gold answers as separate CSV files.
for values, csv_path in (
    (response, '/content/drive/MyDrive/Fine tuning Llama/new_model/responses.csv'),
    (instructions, '/content/drive/MyDrive/Fine tuning Llama/new_model/instructions.csv'),
    (sols, '/content/drive/MyDrive/Fine tuning Llama/new_model/sols.csv'),
):
    pd.DataFrame(values).to_csv(csv_path)

In [None]:
# Keep only the model's answer: the text after the first '[/INST]', cut at the next '['.
short_response = [generated.split('[/INST]')[1].split('[')[0] for generated in response]

In [None]:
# Strip the 'output:' label: gold answers take the text after the first label,
# predictions take the text after the last one.
cleaned_sols = [gold.split('output:')[1].strip() for gold in sols]
cleaned_short_response = [pred.split('output:')[-1].strip() for pred in short_response]

In [None]:
    def get_metrics(y_true, y_pred, is_triplet_extraction=False):
        """Compute precision, recall and F1 over ', '-separated answer strings.

        Each element of `y_true` / `y_pred` is one example's answer, e.g.
        'battery life:positive, screen:negative'. A gold item counts as a true
        positive at most once, when at least one predicted item of the same
        example matches it.

        Matching rules:
          * default: either item is a substring of the other;
          * is_triplet_extraction=True: items are 'aspect:opinion:sentiment';
            the predicted aspect and opinion must be substrings of the gold ones
            and the sentiments must be equal. Items with fewer than three fields
            never match.

        Args:
            y_true: iterable of gold answer strings.
            y_pred: iterable of predicted answer strings, aligned with `y_true`.
            is_triplet_extraction: selects the triplet matching rule.

        Returns:
            Tuple (precision, recall, f1, None). Each score is 0.0 when its
            denominator is zero.
        """
        def _split_items(text):
            # Drop empty fragments: '' is a substring of every string, so an empty
            # prediction would otherwise match every gold item.
            return [item for item in text.split(', ') if item.strip()]

        def _is_match(gt_val, pred_val):
            if not is_triplet_extraction:
                return pred_val in gt_val or gt_val in pred_val
            gt_parts = gt_val.split(':')
            pr_parts = pred_val.split(':')
            if len(gt_parts) < 3 or len(pr_parts) < 3:
                return False
            gt_asp, gt_op, gt_sent = gt_parts[:3]
            # The predicted sentiment comes from the prediction itself (it used to
            # be read from the gold item, which made the comparison always true).
            pr_asp, pr_op, pr_sent = pr_parts[:3]
            return pr_asp in gt_asp and pr_op in gt_op and gt_sent == pr_sent

        total_pred = 0
        total_gt = 0
        tp = 0
        for gt, pred in zip(y_true, y_pred):
            gt_list = _split_items(gt)
            pred_list = _split_items(pred)
            total_pred += len(pred_list)
            total_gt += len(gt_list)
            # any() stops at the first matching prediction, so each gold item is
            # counted once in both matching modes.
            tp += sum(
                1
                for gt_val in gt_list
                if any(_is_match(gt_val, pred_val) for pred_val in pred_list)
            )

        # Guard every division: no predictions, no gold items, or no matches at all.
        p = tp / total_pred if total_pred else 0.0
        r = tp / total_gt if total_gt else 0.0
        f1 = 2 * p * r / (p + r) if (p + r) else 0.0
        return p, r, f1, None

In [None]:
# Score the test-set predictions; the result is (precision, recall, F1, None).
x = get_metrics(cleaned_sols, cleaned_short_response, is_triplet_extraction=False)

In [None]:
# Display the test-set scores.
x

(0.4298540965207632, 0.37148399612027155, 0.39854318418314255, None)

In [None]:
# Same prompt / gold-answer split as for the test set, now over the training rows.
# This rebinds `instructions` and `sols`.
instructions = [text.split('[/INST]')[0] + '[/INST]' for text in train_df['data']]
sols = [text.split('[/INST]\n')[1].split('[')[0] for text in train_df['data']]

In [None]:
# Number of training prompts to run through the model.
len(instructions)

1000

In [None]:
import torch

# Batched generation with a decoder-only model needs left padding; with right padding
# the shorter prompts in a batch are continued after their pad tokens.
peft_tokenizer.padding_side = "left"

# Budget for generated tokens only. A total `max_length` of 200 also counts the prompt,
# and some prompts are longer than that (the run logged "Input length of input_ids is
# 209, but `max_length` is set to 200"), leaving no room for an answer.
MAX_NEW_TOKENS = 64

pipe = pipeline(task="text-generation", model=peft_model, tokenizer=peft_tokenizer, max_new_tokens=MAX_NEW_TOKENS, device=0)

def generate_text_batched(instructions, batch_size=4):
    """Generate one completion per prompt, running the pipeline in batches.

    Args:
        instructions: sequence of prompt strings.
        batch_size: maximum number of prompts per pipeline call (default 4).

    Returns:
        List with the `generated_text` of the first candidate for each prompt, in
        input order. An empty input yields an empty list.
    """
    if not instructions:
        return []  # range() below would otherwise be given a step of 0
    batch_size = max(1, min(len(instructions), batch_size))

    response = []
    with torch.no_grad():  # Disable gradient calculation for inference
        for start in range(0, len(instructions), batch_size):
            batch = [f"{p}" for p in instructions[start:start + batch_size]]
            results = pipe(batch, batch_size=batch_size)  # Leverage batching
            response.extend(result[0]['generated_text'] for result in results)
    return response


response = generate_text_batched(instructions)

# Access response as needed

# Access response as needed


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Input length of input_ids is 209, but `max_length` is set to 200. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [None]:
# Inspect the first generated text for the training prompts.
response[0]

'[INST]The output should be the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.\nPositive example -\ninput: I charge it at night and skip taking the cord with me because of the good battery life.\noutput: battery life:positive, \nNeutral example -\ninput: Nightly my computer defrags itself and runs a virus scan.\noutput: virus scan:neutral\nNow complete the following example-\ninput:I charge it at night and skip taking the cord with me because of the good battery life.[/INST]output: battery life:positive[INST]The output should be the aspects (both implicit and explicit) and the aspects sentiment polarity only'

In [None]:
# Persist the training-set generations, prompts and gold answers as separate CSV files.
for values, csv_path in (
    (response, '/content/drive/MyDrive/Fine tuning Llama/new_model/train_responses.csv'),
    (instructions, '/content/drive/MyDrive/Fine tuning Llama/new_model/train_instructions.csv'),
    (sols, '/content/drive/MyDrive/Fine tuning Llama/new_model/train_sols.csv'),
):
    pd.DataFrame(values).to_csv(csv_path)

In [None]:
# Keep only the model's answer: the text after the first '[/INST]', cut at the next '['.
short_response = [generated.split('[/INST]')[1].split('[')[0] for generated in response]

In [None]:
# Strip the 'output:' label: gold answers take the text after the first label,
# predictions take the text after the last one.
cleaned_sols = [gold.split('output:')[1].strip() for gold in sols]
cleaned_short_response = [pred.split('output:')[-1].strip() for pred in short_response]

In [None]:
# Score the training-set predictions; the result is (precision, recall, F1, None).
x = get_metrics(cleaned_sols, cleaned_short_response, is_triplet_extraction=False)

In [None]:
# Display the training-set scores.
x

(0.5082417582417582, 0.4265949269792467, 0.4638529043042206, None)

In [None]:
# Sanity-check the answer-extraction expression on the first generated text.
response[0].split('[/INST]')[1].split('[')[0]

'output: boot time:positive'

In [None]:
# Hand-written prompt in the same [INST] ... [/INST] layout as the dataset rows, with a new input sentence.
s = '''
[INST]The output should be the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.
Positive example -
input: I charge it at night and skip taking the cord with me because of the good battery life.
output: battery life:positive,
Neutral example -
input: Nightly my computer defrags itself and runs a virus scan.
output: virus scan:neutral
Now complete the following example-
input:I liked the speaker, but didnt like the screen resolution![/INST]
'''

In [None]:
# Only let CRITICAL messages from the library through
logging.set_verbosity(logging.CRITICAL)

# Generate a completion for the hand-written prompt with the merged model
prompt = s
pipe = pipeline(
    task="text-generation",
    model=peft_model,
    tokenizer=peft_tokenizer,
    max_length=150,
    device=0,
)
result = pipe(f"{prompt}")
print(result[0]['generated_text'])


[INST]The output should be the aspects (both implicit and explicit) and the aspects sentiment polarity only. In cases where there are no aspects the output should be noaspectterm:none.
Positive example -
input: I charge it at night and skip taking the cord with me because of the good battery life.
output: battery life:positive,
Neutral example -
input: Nightly my computer defrags itself and runs a virus scan.
output: virus scan:neutral
Now complete the following example-
input:I liked the speaker, but didnt like the screen resolution![/INST]
output: speaker:positive,screen resolution:negative[INST]


In [None]:
# Show the raw prompt string.
s

'\n### User:\nDefinition: The output will be the aspects (both implicit and explicit) and the aspects sentiment polarity. In cases where there are no aspects the output should be noaspectterm:none.\nPositive example -\ninput: I charge it at night and skip taking the cord with me because of the good battery life.\noutput: battery life:positive, \nNeutral example -\ninput: Nightly my computer defrags itself and runs a virus scan.\noutput: virus scan:neutral\nNow complete the following example-\ninput:I charge it at night and skip taking the cord with me because of the good battery life.\n\n### Assistant:\n'

### Trying to visualise the change in the model weights

In [None]:
# Layer-0 query-projection weights of `base_model`.
base_model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0060, -0.0146, -0.0021,  ...,  0.0042,  0.0018, -0.0035],
        [ 0.0142, -0.0043,  0.0032,  ..., -0.0092, -0.0108,  0.0073],
        [-0.0137,  0.0121,  0.0002,  ...,  0.0061,  0.0181, -0.0030],
        ...,
        [ 0.0018,  0.0093, -0.0006,  ...,  0.0092, -0.0289,  0.0085],
        [ 0.0249,  0.0116,  0.0035,  ..., -0.0322, -0.0165, -0.0111],
        [-0.0136, -0.0067,  0.0016,  ...,  0.0176,  0.0175, -0.0083]],
       device='cuda:0', dtype=torch.float16, requires_grad=True)

In [None]:
# Layer-0 query-projection weights of the merged `peft_model`, for comparison with the base model.
peft_model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0059, -0.0146, -0.0019,  ...,  0.0041,  0.0018, -0.0034],
        [ 0.0141, -0.0043,  0.0029,  ..., -0.0091, -0.0109,  0.0071],
        [-0.0136,  0.0121,  0.0003,  ...,  0.0061,  0.0181, -0.0029],
        ...,
        [ 0.0018,  0.0093, -0.0007,  ...,  0.0092, -0.0290,  0.0085],
        [ 0.0249,  0.0116,  0.0036,  ..., -0.0323, -0.0164, -0.0111],
        [-0.0136, -0.0067,  0.0015,  ...,  0.0176,  0.0174, -0.0083]],
       device='cuda:0', dtype=torch.float16)

In [None]:
# Mean of the base model's layer-0 query-projection weights.
base_model.model.layers[0].self_attn.q_proj.weight.mean()

tensor(-2.9802e-06, device='cuda:0', dtype=torch.float16,
       grad_fn=<MeanBackward0>)

In [None]:
# Mean of the merged model's layer-0 query-projection weights.
peft_model.model.layers[0].self_attn.q_proj.weight.mean()

tensor(-2.9802e-06, device='cuda:0', dtype=torch.float16)

In [None]:
# Shape of the merged model's layer-0 query-projection weight matrix.
peft_model.model.layers[0].self_attn.q_proj.weight.shape

torch.Size([4096, 4096])

In [None]:
# Layer-0 key-projection weights of the merged `peft_model`.
peft_model.model.layers[0].self_attn.k_proj.weight

Parameter containing:
tensor([[-0.0155,  0.0078, -0.0011,  ...,  0.0164, -0.0097, -0.0136],
        [ 0.0182,  0.0012,  0.0034,  ..., -0.0206,  0.0143,  0.0229],
        [-0.0245, -0.0220,  0.0018,  ...,  0.0150, -0.0157, -0.0110],
        ...,
        [ 0.0123, -0.0007, -0.0008,  ...,  0.0002,  0.0029,  0.0081],
        [-0.0050,  0.0171, -0.0031,  ..., -0.0033,  0.0112, -0.0110],
        [ 0.0036, -0.0023,  0.0012,  ...,  0.0073, -0.0114,  0.0095]],
       device='cuda:0', dtype=torch.float16)

In [None]:
# Layer-0 key-projection weights of `base_model`, for comparison with the merged model.
base_model.model.layers[0].self_attn.k_proj.weight

Parameter containing:
tensor([[-0.0155,  0.0078, -0.0011,  ...,  0.0164, -0.0097, -0.0136],
        [ 0.0182,  0.0012,  0.0034,  ..., -0.0206,  0.0143,  0.0229],
        [-0.0245, -0.0220,  0.0018,  ...,  0.0150, -0.0157, -0.0110],
        ...,
        [ 0.0123, -0.0007, -0.0008,  ...,  0.0002,  0.0029,  0.0081],
        [-0.0050,  0.0171, -0.0031,  ..., -0.0033,  0.0112, -0.0110],
        [ 0.0036, -0.0023,  0.0012,  ...,  0.0073, -0.0114,  0.0095]],
       device='cuda:0', dtype=torch.float16, requires_grad=True)

In [None]:
# Mean of the merged model's layer-0 key-projection weights.
peft_model.model.layers[0].self_attn.k_proj.weight.mean()

tensor(8.3447e-06, device='cuda:0', dtype=torch.float16)

In [None]:
# Mean of the base model's layer-0 key-projection weights.
base_model.model.layers[0].self_attn.k_proj.weight.mean()

tensor(8.3447e-06, device='cuda:0', dtype=torch.float16,
       grad_fn=<MeanBackward0>)

In [None]:
# Layer-0 value-projection weights of the merged `peft_model`.
peft_model.model.layers[0].self_attn.v_proj.weight

Parameter containing:
tensor([[-8.4639e-05, -2.2964e-03,  2.6550e-03,  ...,  7.5874e-03,
         -1.0061e-03,  9.5673e-03],
        [-6.8436e-03, -5.0843e-05, -6.1226e-03,  ..., -1.1681e-02,
          1.2611e-02,  6.2485e-03],
        [ 8.6069e-04,  1.0262e-02,  1.5211e-03,  ...,  4.8103e-03,
         -1.3451e-02, -1.3618e-02],
        ...,
        [-6.3248e-03, -6.3438e-03,  1.0834e-02,  ...,  3.8376e-03,
          2.3727e-03, -1.8597e-03],
        [ 1.6203e-03,  5.7030e-03, -9.4032e-04,  ...,  6.7978e-03,
          1.5564e-02,  1.1623e-05],
        [-3.3760e-04,  1.4181e-03,  5.3864e-03,  ..., -2.6941e-04,
         -8.3876e-04,  1.5545e-03]], device='cuda:0', dtype=torch.float16)

In [None]:
# Layer-0 value-projection weights of `base_model`, for comparison with the merged model.
base_model.model.layers[0].self_attn.v_proj.weight

Parameter containing:
tensor([[-3.1471e-05, -2.3346e-03,  2.6550e-03,  ...,  7.5684e-03,
         -9.7656e-04,  9.5215e-03],
        [-7.0190e-03,  6.7711e-05, -6.1035e-03,  ..., -1.1597e-02,
          1.2512e-02,  6.4087e-03],
        [ 7.8583e-04,  1.0315e-02,  1.5335e-03,  ...,  4.8523e-03,
         -1.3489e-02, -1.3550e-02],
        ...,
        [-6.5308e-03, -6.1951e-03,  1.0864e-02,  ...,  3.9368e-03,
          2.2583e-03, -1.6785e-03],
        [ 1.7395e-03,  5.6152e-03, -9.5749e-04,  ...,  6.7444e-03,
          1.5625e-02, -8.8692e-05],
        [-1.9264e-04,  1.3123e-03,  5.3711e-03,  ..., -3.3188e-04,
         -7.5531e-04,  1.4267e-03]], device='cuda:0', dtype=torch.float16,
       requires_grad=True)

In [None]:
# Mean of the base model's layer-0 value-projection weights.
base_model.model.layers[0].self_attn.v_proj.weight.mean()

tensor(1.7881e-07, device='cuda:0', dtype=torch.float16,
       grad_fn=<MeanBackward0>)

In [None]:
# Mean of the merged model's layer-0 value-projection weights.
peft_model.model.layers[0].self_attn.v_proj.weight.mean()

tensor(1.7881e-07, device='cuda:0', dtype=torch.float16)

In [None]:
# Display the module tree of the merged model.
peft_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [None]:
# Only let CRITICAL messages from the library through
logging.set_verbosity(logging.CRITICAL)

# Generate a completion for a free-form prompt with the merged (fine-tuned) model
prompt = "3M supports the long-term goal of"
pipe = pipeline(
    task="text-generation",
    model=peft_model,
    tokenizer=peft_tokenizer,
    max_length=200,
    device=0,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] 3M supports the long-term goal of [/INST]  "3M supports the long-term goal of limiting global warming to 1.
5°C above pre-industrial levels and pursuing efforts to limit it to 1.
0°C, as set out in the Paris Agreement. We are committed to reducing our own greenhouse gas emissions in line with this goal, and to supporting our customers and partners in their efforts to reduce their emissions as well. We are also committed to supporting the transition to a low-carbon economy, and to ensuring that our products and services are aligned with the goals of the Paris Agreement.

To achieve these goals, we have set ourselves a number of targets, including:

* Reducing our own greenhouse gas emissions by 30% by 2030, compared to 2015 levels.



### Before fine-tuning

In [None]:
# Load a fresh FP16 copy of the base checkpoint on GPU 0 (rebinds `base_model`).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Tokenizer for the base model: right-padded, with EOS reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Only let CRITICAL messages from the library through
logging.set_verbosity(logging.CRITICAL)

# Generate a completion for the same prompt with the freshly loaded base model
prompt = "3M supports the long-term goal of"
pipe = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] 3M supports the long-term goal of [/INST]  Here are some examples of 3M's support for long-term goals:

1. Sustainability: 3M has set a goal to reduce its greenhouse gas emissions by 30% by 2025 and to achieve net-zero emissions by 2050. The company is also committed to using 100% renewable energy in its operations.
2. Diversity and Inclusion: 3M has a long-term goal to increase diversity and inclusion in its workforce, with a target to have at least 30% of its global workforce comprised of women and underrepresented minorities by 2025.
3. Education: 3M has a long-term goal to support education and innovation through its 3M Foundation, with a focus on science


In [None]:
# Only let CRITICAL messages from the library through
logging.set_verbosity(logging.CRITICAL)

# Generate a completion for the same prompt with `model` (the object that was handed to the trainer)
prompt = "3M supports the long-term goal of"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] 3M supports the long-term goal of [/INST]  reducing greenhouse gas emissions and mitigating climate change. everybody. 3M has set a long-term goal to reduce its own greenhouse gas emissions by 30% by 2025, relative to its 2019 baseline. The company is working towards this goal through a variety of measures, including:

1. Increasing energy efficiency: 3M is investing in energy-efficient technologies and processes to reduce energy consumption and lower emissions.
2. Renewable energy: 3M is exploring the use of renewable energy sources, such as wind and solar power, to reduce its reliance on fossil fuels and lower emissions.
3. Sustainable transportation: 3M is working to reduce its transportation-related emissions by investing in alternative modes of transport


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; required before pushing to the Hub below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Hub repository to publish to. `new_model` holds a local Drive directory path
# ("/content/drive/MyDrive/Fine tuning Llama"), which is not a valid Hub repo id, so the
# target repository is named explicitly here.
hub_repo_id = "DhirajKumarSahu/ClimateChangeLlama"

model.push_to_hub(hub_repo_id, use_temp_dir=False)
tokenizer.push_to_hub(hub_repo_id, use_temp_dir=False)

pytorch_model.bin:   0%|          | 0.00/5.52G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DhirajKumarSahu/ClimateChangeLlama/commit/2ea6aa19986d885fb4f1db30db6fe8cdb88249c8', commit_message='Upload tokenizer', commit_description='', oid='2ea6aa19986d885fb4f1db30db6fe8cdb88249c8', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the published checkpoint through the sequence-classification auto class.
# NOTE(review): the model above was fine-tuned as a causal LM; presumably the
# classification head is newly initialised when loaded this way — confirm this is intended.
m = AutoModelForSequenceClassification.from_pretrained('DhirajKumarSahu/ClimateChangeLlama')

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.52G [00:00<?, ?B/s]