In [None]:
# !pip install -U torch datasets peft bitsandbytes trl wandb git+https://github.com/huggingface/transformers

In [1]:
import json
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer

In [5]:
secret_value_0 = "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"
secret_value_1 = "a43c62a469d697c9f9421f82431442a901734b5c"

In [6]:
!git config --global credential.helper store

!huggingface-cli login --token $secret_value_0 --add-to-git-credential

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


In [7]:
import wandb
wandb.login(key = secret_value_1)
run = wandb.init(
    project='llm-Apr162',
    job_type="training",
    anonymous="allow"
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mankit-hinge[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ubuntu/.netrc


In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [9]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        quantization_config=bnb_config, 
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side = "right",
    add_eos_token = True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [11]:
dataset = load_dataset("hingeankit/train_v4", split="train")
datasettokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [12]:
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,Question,Answer
0,What is Trichoderma and how does it help in ri...,Trichoderma is a genus of fungi that is benefi...
1,How should Trichoderma be used for treating ri...,Trichoderma should be used at the rate of 125 ...
2,What are the main diseases in rice that Tricho...,Trichoderma helps in controlling several key d...
3,"Can Trichoderma be used directly on soil, or i...",The advisory specifically mentions using Trich...
4,What should I do before sowing paddy seeds to ...,"Before sowing, treat the paddy seeds with 4gm ..."
5,Are there any precautions I should take when u...,"Yes, always contact local agricultural extensi..."
6,Is there a biological method to control bacter...,"Yes, as a biological method, you can soak the ..."
7,Which is more effective in controlling bacteri...,Both methods are effective in controlling bact...
8,How should I handle chemicals safely when trea...,"Handle chemicals with care to prevent spills, ..."
9,What causes the whitening of paddy crops in th...,The whitening of paddy crops can be caused by ...


In [13]:
def generate_prompt(sample):
    full_prompt =f"""<s>[INST] {sample['Question']} [/INST] {sample['Answer']} </s>"""
    return {"text": full_prompt}

In [14]:
dataset[0]

{'Question': 'What is Trichoderma and how does it help in rice cultivation?',
 'Answer': 'Trichoderma is a genus of fungi that is beneficial for rice crops as it has an effect on controlling soil-borne pathogens such as Rhizoctonia solani, which causes rice sheath blight, and Sclerotium oryzae, which causes Sclerotium rot. It also indirectly helps in controlling Fusarium spp., making it a valuable tool for managing diseases in rice cultivation.'}

In [15]:
generate_prompt(dataset[0])

{'text': '<s>[INST] What is Trichoderma and how does it help in rice cultivation? [/INST] Trichoderma is a genus of fungi that is beneficial for rice crops as it has an effect on controlling soil-borne pathogens such as Rhizoctonia solani, which causes rice sheath blight, and Sclerotium oryzae, which causes Sclerotium rot. It also indirectly helps in controlling Fusarium spp., making it a valuable tool for managing diseases in rice cultivation. </s>'}

In [16]:
generated_train_dataset = dataset.map(
    generate_prompt, remove_columns=list(dataset.features))

In [17]:
generated_train_dataset

Dataset({
    features: ['text'],
    num_rows: 2427
})

In [18]:
from peft import prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [19]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [20]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [21]:
modules = find_all_linear_names(model)
print(modules)

['k_proj', 'q_proj', 'o_proj', 'down_proj', 'up_proj', 'v_proj', 'gate_proj']


In [22]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

In [23]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [24]:
from peft import get_peft_model

model = get_peft_model(model, lora_config)

print_trainable_parameters(model)

trainable params: 170082304 || all params: 3922153472 || trainable%: 4.336452033664837


In [25]:
from transformers import TrainingArguments

training_arguments = TrainingArguments(
    output_dir="./e4_finetune",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="steps",
    # save_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    do_eval=True,
    report_to="wandb",
    resume_from_checkpoint=True
    )

In [26]:
import transformers

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=generated_train_dataset,
    # eval_dataset=generated_val_dataset,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=None,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [27]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



Step,Training Loss
10,2.3525
20,1.4366
30,1.2691
40,1.3766
50,1.3744
60,1.3216
70,1.3492
80,1.1835
90,1.0748
100,1.3642




TrainOutput(global_step=607, training_loss=1.149848144568957, metrics={'train_runtime': 4296.5985, 'train_samples_per_second': 0.565, 'train_steps_per_second': 0.141, 'total_flos': 1.20044443846656e+16, 'train_loss': 1.149848144568957, 'epoch': 1.0})

In [28]:
new_model = "adapter-e2Apr162"

In [29]:

trainer.model.save_pretrained(new_model)
wandb.finish()
model.config.use_cache = True



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▇█▇▄▃▄▅▁▄▃▄▄▄▃▄▃▃▁▃▃▅▂▆▃▂▅▃▄▁▂▃▃▂▂▂▃▄▃▃▁
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▄▃▃▃▃▃▂▂▃▃▃▂▃▂▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▂▂▂▂▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,1.0
train/global_step,607.0
train/grad_norm,1.55644
train/learning_rate,0.0
train/loss,0.8447
train/total_flos,1.20044443846656e+16
train/train_loss,1.14985
train/train_runtime,4296.5985
train/train_samples_per_second,0.565
train/train_steps_per_second,0.141


In [30]:
trainer.model.push_to_hub(new_model)

adapter_model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hingeankit/adapter-e2Apr162/commit/9b2d1680689704e94ee898e8d1ea2fb1edd63d9f', commit_message='Upload model', commit_description='', oid='9b2d1680689704e94ee898e8d1ea2fb1edd63d9f', pr_url=None, pr_revision=None, pr_num=None)

In [41]:
merged_model= PeftModel.from_pretrained(model, new_model)


In [44]:
def get_completion_merged(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """<s>[INST] {query} [/INST] </s>"""
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)

  generated_ids = merged_model.generate(**model_inputs, max_new_tokens=256, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.batch_decode(generated_ids)
  return (decoded[0])

In [47]:
result = get_completion_merged(query="What treatment should be used for paddy seeds to prevent Bacterial Leaf Blight?", model=model, tokenizer=tokenizer)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s><s> [INST] What treatment should be used for paddy seeds to prevent Bacterial Leaf Blight? [/INST] </s></s>imanaging Bacterial Leaf Blight (BLB) in paddy seeds predominantly involves preventive measures during the growing season. However, since you have asked about the treatment for paddy seeds specifically, I will provide some information on seed treatments that can help reduce the risk of BLB infection.

Seed treatments are not a common practice for BLB management in paddy fields, as the disease typically spreads through rice foliage and water in the environment. However, some seed treatments have shown promise in reducing the initial infection. Here are some seed treatments that can be considered:

1. Seed dressing with antibiotics: Seeds can be treated with antibiotics like Streptomycin, Oxytetracycline, or Tetracycline to provide protection against bacterial infections, including BLB. However, using antibiotics as seed treatments for rice is only effective if the seedling emerg