In [None]:
import pickle
from dotenv import load_dotenv
import transformers
import torch
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [None]:
import os
from google.colab import userdata

# Copy the Hugging Face token from Colab's `userdata` store into the process
# environment; the upload cell at the end of the notebook reads it back from
# os.environ['HF_TOKEN'].
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [None]:
# Report GPU availability. The device name is only queried when CUDA is
# present, because torch.cuda.get_device_name raises on a CPU-only runtime and
# would abort a "Run All" with a less helpful error.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; later cells in this notebook target 'cuda:0'.")

In [None]:
def loadData(file):
    """Load and return the object stored in a pickle file.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled Python object.

    Note:
        pickle.load can execute arbitrary code embedded in the file, so only
        call this on files that come from a trusted source.
    """
    # The context manager closes the handle even if unpickling raises.
    with open(file, 'rb') as dbfile:
        return pickle.load(dbfile)

In [None]:
# Load the pickled documentation corpus; later cells iterate it as a mapping
# of topic name -> full documentation text.
dataset = loadData('LangDatasetBetter.pickle')

In [None]:
# List the topics available in the corpus.
dataset.keys()

In [None]:
# Inspect the raw documentation stored for one topic.
# NOTE(review): this displays the whole entry, which may be very long —
# consider showing only a slice before sharing the notebook.
dataset['google_genai']

In [None]:
import re

# Tokenizer for the same checkpoint that is fine-tuned later in the notebook;
# in this cell it is only used to measure chunk lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
# Upper bound, in tokens, for a section to be kept as a single chunk.
max_tokens = 4096

def tokenize_len(text):
    """Return how many tokens the notebook-level `tokenizer` produces for `text`."""
    pieces = tokenizer.tokenize(text)
    return len(pieces)

def split_by_class_sections(text):
    """Split a documentation string into (section name, section body) pairs.

    A section starts at a "## Class Objects: <name>" heading that is preceded
    by at least two newlines. Anything before the first heading is discarded.

    Args:
        text: Full documentation text for one topic.

    Returns:
        A list of (name, body) tuples in document order; empty when the text
        contains no matching heading.
    """
    # Because the pattern has one capture group, re.split returns
    # [preamble, name1, body1, name2, body2, ...].
    pieces = re.split(r'\n{2,}## Class Objects: (.+?)\n', text)
    names = pieces[1::2]
    bodies = pieces[2::2]
    return list(zip(names, bodies))

# Accumulates one {"text": ...} record per training chunk.
final_chunks = []

# Walk every topic, split its documentation into class sections, and turn each
# section into one or more chunks prefixed with an instruction-style header.
for i, (topic, full_doc) in enumerate(dataset.items(), start=1):
    for j, (module, content) in enumerate(split_by_class_sections(full_doc), start=1):
        header = f"### Instruction: Learn about the {topic} LangChain API.\n\n### Part {i} - Module:{module}(chunk{j})\n\n"
        full_text = header + content.strip()

        # Sections that fit the token budget are stored unchanged.
        if tokenize_len(full_text) <= max_tokens:
            final_chunks.append({"text": full_text})
            continue

        # Oversized sections are rebuilt word by word (whitespace collapses to
        # single spaces) and flushed each time the running piece reaches
        # 3000 tokens; only the first piece carries the header.
        buffer = ""
        for word in full_text.split():
            buffer += word + " "
            if tokenize_len(buffer) >= 3000:
                final_chunks.append({"text": buffer.strip()})
                buffer = ""
        # Flush whatever is left after the last full piece.
        if buffer:
            final_chunks.append({"text": buffer.strip()})


In [None]:
print(len(final_chunks))

# Spot-check one chunk. Index 80 is an arbitrary sample position, so clamp it
# to the last element to avoid an IndexError on a smaller corpus, and skip the
# preview entirely when no chunks were produced.
sample_index = 80
if final_chunks:
    print(final_chunks[min(sample_index, len(final_chunks) - 1)])

In [None]:
# Persist the chunk list to disk. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('LangDatasetChunked.pickle', 'wb') as data:
    pickle.dump(final_chunks, data)

In [None]:
from datasets import Dataset

# Wrap the list of {"text": ...} records in a `datasets.Dataset`.
documentation = Dataset.from_list(final_chunks)

In [None]:
# Hub identifier of the base checkpoint to fine-tune.
model_id = "google/gemma-2-2b"

# Quantization settings handed to from_pretrained in the next cell: load the
# weights in 4-bit using the "nf4" quantization type, with bfloat16 as the
# compute dtype.
# NOTE(review): the training arguments later in this notebook set fp16=True
# while the compute dtype here is bfloat16 — confirm the two precision
# settings are meant to differ.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the base causal language model with the 4-bit quantization config.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # let the loader decide device placement
    trust_remote_code=False,  # do not run custom code shipped with the checkpoint
    attn_implementation='eager',
    use_cache=False,  # NOTE(review): presumably off because training uses gradient checkpointing — confirm
)

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 8, alpha 32, 5% dropout,
# no bias training, applied to the projection modules listed below.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)


def tokenize(example):
    """Tokenize a batch of chunk records for training.

    Args:
        example: Batch dict supplied by `Dataset.map(batched=True)`; its
            "text" entry holds the chunk strings.

    Returns:
        The tokenizer output for the batch, with each example truncated to at
        most 4096 tokens.
    """
    # No padding at this stage: padding every example to 4096 tokens up front
    # makes every training batch full length regardless of its content.
    # Leaving sequences at their natural length lets the trainer's data
    # collator pad each batch only to its own longest sequence.
    return tokenizer(example["text"], truncation=True, max_length=4096)


tokenized_dataset = documentation.map(tokenize, batched=True)

In [None]:
def print_trainable_parameters(model):
    """Print how many of a model's parameters are trainable.

    Args:
        model: Object exposing `parameters()`, whose items provide `numel()`
            and a `requires_grad` attribute (e.g. a torch.nn.Module).

    Prints the trainable count, the total count, and the trainable share as a
    percentage. A model without any parameters reports a ratio of 0 instead of
    raising ZeroDivisionError.
    """
    trainable = 0
    total = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    # Guard the division so an empty model does not crash the cell.
    ratio = 100 * trainable / total if total else 0.0
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Trainable ratio: {ratio:.4f}%")

# NOTE(review): at this point `model` is the quantized base model; the LoRA
# config is only handed to the trainer in a later cell, so these counts may
# not reflect the adapter parameters — confirm this is the intended check.
print_trainable_parameters(model)

In [None]:
# Training hyper-parameters, built as a separate object so they can be
# inspected before the trainer is constructed. Batch size 2 with 8
# accumulation steps gives 16 examples per optimizer step per device.
training_args = transformers.TrainingArguments(
    output_dir="outputs2",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    warmup_steps=50,
    logging_steps=5,
    fp16=True,
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
)

# Supervised fine-tuning trainer over the tokenized chunks; the LoRA settings
# are supplied through peft_config.
tuner = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    peft_config=lora_config,
)


In [None]:
# Set WANDB_DISABLED to "false", i.e. leave Weights & Biases logging enabled.
# NOTE(review): this runs after the trainer has already been constructed —
# confirm the variable is still read at the point where it matters.
os.environ['WANDB_DISABLED'] = "false"

In [None]:
import wandb

# Start a Weights & Biases run under the "lang-tuner" project.
wandb.init(project="lang-tuner")

In [None]:
# Run the fine-tuning loop with the trainer built earlier in the notebook.
tuner.train()

In [None]:
import torch

text = " What is langchain_google_genai"
device = 'cuda:0'

# Training leaves the module in train mode, and generate() does not switch it
# back, so dropout (including the LoRA dropout) would still be active while
# sampling. Put the model in eval mode for inference.
model.eval()

# Tokenizer output consists of integer tensors only, so the batch just needs
# to be moved to the target device; no dtype conversion is required.
inputs = tokenizer(text, return_tensors="pt").to(device)

with torch.amp.autocast('cuda'):
    outputs = model.generate(**inputs, max_new_tokens=300)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# Because peft_config was passed to SFTTrainer, the trainer holds the
# LoRA-wrapped model as `tuner.model`. Saving through that object writes the
# trained adapter (adapter config + adapter weights), whereas calling
# save_pretrained on the bare `model` variable goes through the base model's
# own serialization path instead.
tuner.model.save_pretrained("fine-tuned-gemma")
tokenizer.save_pretrained("fine-tuned-gemma")

In [None]:
from huggingface_hub import HfApi
api = HfApi()

# Upload the whole output directory in one commit. upload_folder walks the
# directory itself, which avoids mixing an absolute listing path with relative
# upload paths, and it also copes with nested files.
api.upload_folder(
    folder_path="fine-tuned-gemma",
    repo_id="Prince-Dastan/gemma-2-2b-langchain-finetuned",
    repo_type="model",
    token=os.environ['HF_TOKEN'],
)