In [5]:
import pickle
from dotenv import load_dotenv
import transformers
import torch
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [9]:
import os
from google.colab import userdata

os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [10]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla T4


In [23]:
def loadData(file):
    dbfile = open(file, 'rb')
    db = pickle.load(dbfile)

    return db

In [24]:
dataset = loadData('LangDataset')

In [25]:
dataset.keys()

dict_keys(['langchain_python_api_reference', 'langchain-ibm:_0.3.11', 'langchain-exa:_0.2.1', 'langchain-nomic:_0.1.4', 'langchain-weaviate:_0.0.4', 'pydata-sphinx-theme.js?digest=8878045cc6db502f8baf', 'pydata-sphinx-theme.css?digest=8878045cc6db502f8baf', 'langchain-tests:_0.3.19', 'search_-', 'langchain-google-community:_2.0.7', 'search', 'google_speech_to_text', 'pygments.css?v=8f2a1f02', 'langchain-neo4j:_0.4.0', 'langchain-groq:_0.3.2', 'langchain-anthropic:_0.3.13', 'langchain-redis:_0.2.1', 'vertex_ai_search', 'documentai_warehouse', 'gmail', 'langchain-huggingface:_0.2.0', 'langchain-ai21:_1.1.0', 'langchain-mistralai:_0.2.10', 'googlesearchrun', 'langchain-upstage:_0.6.0', 'custom.css?v=8e9fa5b3', 'langchain-elasticsearch:_0.3.2', 'bq_storage_vectorstores', 'langchain-azure-dynamic-sessions:_0.2.0', 'langchain:_0.3.25', 'langchain-chroma:_0.2.4', 'langchain-community:_0.3.24', 'langchain-xai:_0.2.3', 'langchain-ollama:_0.3.3', 'langchain-sqlserver:_0.1.2', 'vertex_check_groun

In [26]:
from datasets import Dataset

data = [{"text" : f'Topic:\n{k}\n\n Code and Description:\n{v}'} for k, v in dataset.items()]

dataset = Dataset.from_list(data)


In [11]:
model_id = "google/gemma-2-2b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [34]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=False,
    attn_implementation='eager',
    use_cache=False,
)

You have set `use_cache` to `False`, but cache_implementation is set to hybrid. cache_implementation will have no effect.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [39]:
def print_trainable_parameters(model):
    trainable = 0
    total = 0
    for param in model.parameters():
        num_params = param.numel()
        total += num_params
        if param.requires_grad:
            trainable += num_params
    print(f"Trainable parameters: {trainable:,}")
    print(f"Total parameters: {total:,}")
    print(f"Trainable ratio: {100 * trainable / total:.4f}%")

print_trainable_parameters(model)

Trainable parameters: 10,383,360
Total parameters: 1,612,587,264
Trainable ratio: 0.6439%


In [17]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=['q_proj', "o_proj", "k_proj", "v_proj", 'gate_proj', 'up_proj', "down_proj"],
    task_type='CAUSAL_LM',

)

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=4096)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/4938 [00:00<?, ? examples/s]

In [28]:
def formatting_func():
    pass

In [35]:
tuner = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=50,
        learning_rate=1e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        gradient_checkpointing=True,
    ),
    peft_config=lora_config)

Truncating train dataset:   0%|          | 0/4938 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [32]:
os.environ['WANDB_DISABLED'] = "false"

In [37]:
import wandb

wandb.init(project="lang-tuner")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprincedastan[0m ([33mprincedastan-mbm-university-jodhpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [38]:
tuner.train()

Step,Training Loss
1,14.7026
2,27.413
3,25.2057
4,27.413
5,27.413
6,5.9602
7,3.2481
8,3.5607
9,2.3732
10,3.1404


TrainOutput(global_step=50, training_loss=3.9109704804420473, metrics={'train_runtime': 392.2581, 'train_samples_per_second': 0.51, 'train_steps_per_second': 0.127, 'total_flos': 2500486653542400.0, 'train_loss': 3.9109704804420473})

In [44]:
import torch

text = " What are TextSplitters in langchain"
device = 'cuda:0'

inputs = tokenizer(text, return_tensors="pt")

for k, v in inputs.items():
    if v.dtype == torch.float:
        inputs[k] = v.half().to(device)
    else:
        inputs[k] = v.to(device)
with torch.cuda.amp.autocast():
    outputs = model.generate(**inputs, max_new_tokens=200)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


  with torch.cuda.amp.autocast():


 What are TextSplitters in langchain?

TextSplitters are a type of transformer in LangChain that can be used to split a text into smaller pieces. This can be useful for tasks such as tokenization, segmentation, or chunking.

TextSplitters can be used in a variety of ways, such as:

* Tokenizing a text into individual words or tokens.
* Segmenting a text into paragraphs or sentences.
* Chunking a text into smaller pieces, such as paragraphs or sentences.

TextSplitters can be created using the LangChain library, which provides a variety of tools for working with text.

Here is an example of how to use a TextSplitter to split a text into individual words:

import langchain3
from langchain3.transformers import TextSplitter

text = "This is a sample text."

splitter = TextSplitter()
words = splitter.split(text)

print(words)
# ['This', 'is', 'a', 'sample',
