In [3]:
import pickle
from dotenv import load_dotenv
import transformers
import torch
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [4]:
import os
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [5]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla T4


In [6]:
def loadData(file):
    dbfile = open(file, 'rb')
    db = pickle.load(dbfile)

    return db

In [7]:
dataset = loadData('LangDatasetBetter.pickle')

In [8]:
dataset.keys()

dict_keys(['Langchain', 'perplexity', 'google_genai', 'ollama', 'chroma', 'pinecone', 'reference.html', 'nvidia_ai_endpoints', 'upstage', 'aws', 'anthropic', 'fireworks', 'cerebras', 'sqlserver', 'redis', '_modules', 'prompty', 'text_splitters', 'standard_tests', 'mistralai', 'mongodb', 'together', 'groq', 'cohere', 'experimental', 'nomic', 'openai', 'azure_dynamic_sessions', 'postgres', 'milvus', 'snowflake', 'neo4j', 'xai', 'unstructured', 'qdrant', 'tavily', 'astradb', 'community', 'ibm', 'core', 'google_vertexai', 'azure_ai', 'huggingface', 'elasticsearch', 'google_community', 'langchain', 'weaviate', 'ai21', 'deepseek', 'exa', 'voyageai', 'sema4'])

In [12]:
dataset['google_genai']

'\n\n## Class Objects: index\nlangchain-google-genai: 2.1.4\n\nlangchain-google-genai: 2.1.4#\nLangChain Google Generative AI Integration\nThis module integrates Google’s Generative AI models, specifically the Gemini series, with the LangChain framework. It provides classes for interacting with chat models and generating embeddings, leveraging Google’s advanced AI capabilities.\nChat Models\nThe ChatGoogleGenerativeAI class is the primary interface for interacting with Google’s Gemini chat models. It allows users to send and receive messages using a specified Gemini model, suitable for various conversational AI applications.\nLLMs\nThe GoogleGenerativeAI class is the primary interface for interacting with Google’s Gemini LLMs. It allows users to generate text using a specified Gemini model.\nEmbeddings\nThe GoogleGenerativeAIEmbeddings class provides functionalities to generate embeddings using Google’s models.\nThese embeddings can be used for a range of NLP tasks, including semantic 

In [14]:
import re

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
max_tokens = 4096

def tokenize_len(text):
    return len(tokenizer.tokenize(text))

def split_by_class_sections(text):
    pattern = r'\n{2,}## Class Objects: (.+?)\n'
    parts = re.split(pattern, text)
    # parts = ['', first_module, first_text, second_module, second_text, ...]
    it = iter(parts[1:])  # Skip first empty string
    return list(zip(it, it))  # [(module1, text1), (module2, text2), ...]

final_chunks = []

for i, (topic, full_doc) in enumerate(dataset.items(), start=1):
    sections = split_by_class_sections(full_doc)
    for j, (module, content) in enumerate(sections, start=1):
        header = f"### Instruction: Learn about the {topic} LangChain API.\n\n### Part {i} - Module:{module}(chunk{j})\n\n"
        full_text = header + content.strip()
        if tokenize_len(full_text) <= max_tokens:
            final_chunks.append({"text": full_text})
        else:
            words = full_text.split()
            chunk = ""
            for word in words:
                chunk += word + " "
                if tokenize_len(chunk) >= 3000:
                    final_chunks.append({"text": chunk.strip()})
                    chunk = ""
            if chunk:
                final_chunks.append({"text": chunk.strip()})


In [22]:
print(len(final_chunks))
print(final_chunks[80])

1143
{'text': '### Instruction: Learn about the nvidia_ai_endpoints LangChain API.\n\n### Part 8 - Module:langchain_nvidia_ai_endpoints.callbacks.get_usage_callback(chunk11)\n\nlangchain-nvidia-ai-endpoints: 0.3.10\ncallbacks\nget_usage_callback\n\nget_usage_callback#\n\nlangchain_nvidia_ai_endpoints.callbacks.get_usage_callback(\n\nprice_map: dict = {},\ncallback: UsageCallbackHandler | None = None,\n\n) → Generator[UsageCallbackHandler, None, None][source]#\nGet the OpenAI callback handler in a context manager.\nwhich conveniently exposes token and cost information.\n\nReturns:\nThe OpenAI callback handler.\n\nReturn type:\nOpenAICallbackHandler\n\nParameters:\n\nprice_map (dict)\ncallback (UsageCallbackHandler | None)\n\nExample\n>>> with get_openai_callback() as cb:\n...     # Use the OpenAI callback handler\n\n On this page\n  \n\nget_usage_callback()'}


In [40]:
data= open('LangDatasetChunked.pickle', 'wb')
pickle.dump(final_chunks, data)
data.close()

In [38]:
from datasets import Dataset

documentation = Dataset.from_list(final_chunks)

In [36]:
model_id = "google/gemma-2-2b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [37]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=False,
    attn_implementation='eager',
    use_cache=False,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=['q_proj', "o_proj", "k_proj", "v_proj", 'gate_proj', 'up_proj', "down_proj"],
    task_type='CAUSAL_LM',

)

In [41]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=4096)

tokenized_dataset = documentation.map(tokenize, batched=True)

Map:   0%|          | 0/1143 [00:00<?, ? examples/s]

In [25]:
def print_trainable_parameters(model):
    trainable = 0
    total = 0
    for param in model.parameters():
        num_params = param.numel()
        total += num_params
        if param.requires_grad:
            trainable += num_params
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Trainable ratio: {100 * trainable / total:.4f}%")

print_trainable_parameters(model)

Trainable parameters: 590,065,920
Total parameters: 1,602,203,904
Trainable ratio: 36.8284%


In [42]:
tuner = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        learning_rate=5e-5,
        warmup_steps=50,
        logging_steps=5,
        fp16=True,
        optim="paged_adamw_8bit",
        gradient_checkpointing=True,
        output_dir="outputs2"
    ),
    peft_config=lora_config
)


Truncating train dataset:   0%|          | 0/1143 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [27]:
os.environ['WANDB_DISABLED'] = "false"

In [28]:
import wandb

wandb.init(project="lang-tuner")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprincedastan[0m ([33mprincedastan-mbm-university-jodhpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [43]:
tuner.train()

Step,Training Loss
5,27.2912
10,23.5623
15,13.4024
20,4.6116
25,3.9863
30,3.3991
35,2.8471
40,2.2377
45,1.5913
50,1.1079


TrainOutput(global_step=213, training_loss=2.2850887517973852, metrics={'train_runtime': 6157.4986, 'train_samples_per_second': 0.557, 'train_steps_per_second': 0.035, 'total_flos': 4.238324877754368e+16, 'train_loss': 2.2850887517973852})

In [61]:
import torch

text = " What is langchain_google_genai"
device = 'cuda:0'

inputs = tokenizer(text, return_tensors="pt")

for k, v in inputs.items():
    if v.dtype == torch.float:
        inputs[k] = v.half().to(device)
    else:
        inputs[k] = v.to(device)
with torch.amp.autocast('cuda'):
    outputs = model.generate(**inputs, max_new_tokens=300)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


 What is langchain_google_genai?

langchain_google_genai is a library for building large language models (LLMs) using the Google Cloud AI Platform. It provides a simple and easy-to-use interface for training and deploying LLMs, as well as a set of tools for managing and monitoring your LLMs.

What are the benefits of using langchain_google_genai?

langchain_google_genai offers a number of benefits, including:

* Easy to use: langchain_google_genai provides a simple and easy-to-use interface for training and deploying LLMs, making it accessible to developers of all skill levels.
* Scalable: langchain_google_genai can be scaled to handle large amounts of data and complex models, making it ideal for use in production environments.
* Cost-effective: langchain_google_genai is a cost-effective solution for training and deploying LLMs, as it uses the Google Cloud AI Platform to provide a low-cost, high-performance environment for training and deploying LLMs.
* Flexible: langchain_google_genai

In [46]:
model.save_pretrained("fine-tuned-gemma")
tokenizer.save_pretrained("fine-tuned-gemma")

('fine-tuned-gemma/tokenizer_config.json',
 'fine-tuned-gemma/special_tokens_map.json',
 'fine-tuned-gemma/tokenizer.model',
 'fine-tuned-gemma/added_tokens.json',
 'fine-tuned-gemma/tokenizer.json')

In [55]:
from huggingface_hub import HfApi
api = HfApi()

for file in os.listdir(r'/content/fine-tuned-gemma'):
  api.upload_file(path_or_fileobj=f"fine-tuned-gemma/{file}", path_in_repo=f"{file}", repo_id="Prince-Dastan/gemma-2-2b-langchain-finetuned", repo_type="model",token=os.environ['HF_TOKEN'])

model.safetensors:   0%|          | 0.00/3.54G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
