In [None]:
%%capture
# Install Unsloth (with its "colab-new" extra) straight from the GitHub repository.
# The %%capture magic on the first line hides the pip output of this whole cell.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Choose the xformers requirement from the installed Torch version:
# Torch older than 2.4.0 gets the pinned xformers 0.0.27, anything newer gets the latest release.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# --no-deps installs exactly the listed packages without resolving their dependencies.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel
import torch
# Context length used for loading and training.
# Choose any! We auto support RoPE Scaling internally!
max_seq_length = 2048

# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None

# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# This list is informational only; the checkpoint that is actually loaded is
# named in the from_pretrained call below.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # New Mistral v3 2x faster!
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    # Phi-3 2x faster!
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    # Gemma 2.2x faster!
    "unsloth/gemma-7b-bnb-4bit",
]

# Load the base checkpoint together with its tokenizer, using the
# sequence-length / dtype / 4-bit settings chosen above.
base_model_options = dict(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

In [None]:
# Projection layers that receive LoRA adapters (attention + MLP projections).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters. Note that `model` is rebound to the
# wrapped model, so this cell is meant to be run once per loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,      # Supports any, but = 0 is optimized
    bias="none",         # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,    # We support rank stabilized LoRA
    loftq_config=None,   # And LoftQ
)

In [None]:
# Mount Google Drive at /content/drive so files under it can be read by path.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
from datasets import load_dataset
# GIVE YOUR FILE PATH HERE
trainPath = "/content/drive/MyDrive/Data.json"

# Read the JSON file as the single "train" split.
dataset = load_dataset("json", data_files=trainPath, split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

# Prompt template with exactly two positional slots, in this order:
#   1. the text placed under the "### इनपुट:" header
#   2. the text placed under the "### प्रतिक्रिया:" header
alpaca_prompt = (
    "नीचे एक निर्देश है जो एक कार्य का वर्णन करता है, इसके साथ एक इनपुट है जो आगे संदर्भ प्रदान करता है। एक प्रतिक्रिया लिखें जो अनुरोध को उपयुक्त रूप से पूरा करती है।\n"
    "\n"
    "### इनपुट:\n"
    "{}\n"
    "\n"
    "### प्रतिक्रिया:\n"
    "{}"
)


EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    """Render a batch of question/answer records into training prompts.

    Args:
        examples: Batch mapping of column name -> list of values, as supplied
            by ``dataset.map(..., batched=True)``. Must contain "question" and
            "answer". If an "an,swer" column is present it is used as a
            fallback for rows whose "answer" is missing.

    Returns:
        A dict ``{"text": [...]}`` holding one formatted prompt per input row,
        each terminated with ``EOS_TOKEN``.
    """
    questions = examples["question"]
    answers = examples["answer"]

    # NOTE(review): the loaded dataset reports a stray "an,swer" column, which
    # presumably means some records carry their answer under a misspelled key
    # and therefore have answer=None. Confirm against Data.json; fixing the
    # file itself is the cleaner long-term remedy.
    if "an,swer" in examples:
        fallback_answers = examples["an,swer"]
    else:
        fallback_answers = [None] * len(answers)

    texts = []
    for question, answer, fallback in zip(questions, answers, fallback_answers):
        if answer is None:
            # Without this, str.format would write the literal text "None"
            # into the training example.
            answer = fallback if fallback is not None else ""
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        texts.append(alpaca_prompt.format(question, answer) + EOS_TOKEN)
    return {"text": texts}


dataset = dataset.map(formatting_prompts_func, batched=True)

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Print the dataset summary (column names and row count) to confirm the "text" column was added.
print(dataset)

Dataset({
    features: ['question', 'answer', 'an,swer', 'text'],
    num_rows: 3998
})


In [None]:
# Preview only the first few formatted prompts; echoing the whole column
# floods the notebook with thousands of long strings.
dataset[:3]["text"]

['नीचे एक निर्देश है जो एक कार्य का वर्णन करता है, इसके साथ एक इनपुट है जो आगे संदर्भ प्रदान करता है। एक प्रतिक्रिया लिखें जो अनुरोध को उपयुक्त रूप से पूरा करती है।\n\n### इनपुट:\nजीवों के लिए जनन क्यों अनिवार्य है?\n\n### प्रतिक्रिया:\nजनन जीवों का एक अति महत्त्वपूर्ण लक्षण है। यह एक अति आवश्यक जैविक प्रक्रिया है। जिसके द्वारा न सिर्फ जीवों की उत्तरजीविता में मदद मिलती है बल्कि इससे जीव-जाति की निरन्तरता भी बनी रहती है। जनन जीवों के अमरत्व में भी सहायक होता है। प्राकृतिक मृत्यु, वयता वे जीर्णता के कारण होने वाले जीव ह्रास की आपूर्ति, जनन द्वारा ही होती है। जनने से जीवों की संख्या बढ़ती है। जनन एक ऐसा माध्यम है जिसके द्वारा लाभदायक विभिन्नताएँ एक पीढ़ी से दूसरी पीढ़ी तक स्थानान्तरित होती हैं। अत: जनन जैव विकास में भी सहायक होता है। इन समस्त कारणों के आधार पर कहा जा सकता है कि जनन जीवों के लिए अनिवार्य है। <|end_of_text|>',
 'नीचे एक निर्देश है जो एक कार्य का वर्णन करता है, इसके साथ एक इनपुट है जो आगे संदर्भ प्रदान करता है। एक प्रतिक्रिया लिखें जो अनुरोध को उपयुक्त रूप से पूरा करती है।\

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when the hardware supports it,
# otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters. With a per-device batch of 2 and 4
# accumulation steps, each optimizer step sees 2 * 4 = 8 examples per device.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=500,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the pre-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/3998 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning; whatever trainer.train() returns is kept in trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,998 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 500
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbhilareaditi19[0m ([33mbhilareaditi19-centre-for-development-of-advanced-comput[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.1867
2,1.2921
3,1.3136
4,1.2658
5,1.0638
6,0.981
7,0.697
8,0.7937
9,0.5004
10,0.92


Inference

In [None]:
# Enable native 2x faster inference.
# The trailing semicolon keeps the notebook from echoing the call's return
# value, which otherwise prints the model's entire module tree.
FastLanguageModel.for_inference(model);

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [None]:
# Disabled by default: change the condition to True to load the model saved
# under "lora_model" instead of reusing the model already in memory.
if False:
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(model)

# alpaca_prompt = You MUST copy from above!

# The template has exactly two slots (input, response). Passing a third
# argument made str.format drop the blank and write the question into the
# response slot, so the model was asked to continue its own question.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "जीवों के लिए जनन क्यों अनिवार्य है?",  # input (the question)
            "",  # response - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>नीचे एक निर्देश है जो एक कार्य का वर्णन करता है, इसके साथ एक इनपुट है जो आगे संदर्भ प्रदान करता है। एक प्रतिक्रिया लिखें जो अनुरोध को उपयुक्त रूप से पूरा करती है।\n\n### इनपुट:\nजीवों के लिए जनन क्यों अनिवार्य है?\n\n### प्रतिक्रिया:\nजीवों के लिए जनन क्यों अनिवार्य है? जनन का अर्थ है प्रजनन, जो जीवों के जीवन चक्र में एक महत्वपूर्ण चरण है। जनन के माध्यम से, जीव अपने जीन को अगली पीढ़ी में स्थानांतर']

In [None]:
import os
from getpass import getpass

# Never hardcode access tokens in a notebook: read the token from the HF_TOKEN
# environment variable, or prompt for it without echoing it to the output.
# The token that was previously embedded in this cell has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload the merged 16-bit model and tokenizer to the Hub repository.
model.push_to_hub_merged(
    "hindi_lora_chat_model_Biology",
    tokenizer,
    save_method="merged_16bit",
    token=hf_token,
)

NameError: name 'model' is not defined

In [None]:
# Export the model and tokenizer to GGUF with the "f16" quantization method,
# writing into the local "project" directory.
model.save_pretrained_gguf(
    "project",
    tokenizer,
    quantization_method="f16",
)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.9 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 53%|█████▎    | 17/32 [00:02<00:01, 10.48it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:25<00:00,  2.66s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving project/pytorch_model-00001-of-00004.bin...
Unsloth: Saving project/pytorch_model-00002-of-00004.bin...
Unsloth: Saving project/pytorch_model-00003-of-00004.bin...
Unsloth: Saving project/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at project into f16 GGUF format.
The output location will be /content/project/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: project
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.

In [None]:
import os
from getpass import getpass

# Never hardcode access tokens in a notebook: read the token from the HF_TOKEN
# environment variable, or prompt for it without echoing it to the output.
# The token that was previously embedded in this cell has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload the GGUF export of the model to the Hub repository.
model.push_to_hub_gguf(
    "hindi_gguf_Chat_model_Biology",
    tokenizer,
    token=hf_token,
)

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.11.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.3 (from gradio)
  Downloading gradio_client-1.5.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [None]:
pip install gradio

Collecting gradio
  Downloading gradio-5.11.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.3 (from gradio)
  Downloading gradio_client-1.5.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [None]:
import gradio as gr
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Settings for loading the published fine-tuned model in the demo.
model_name = "Aditi1919/hindi_lora_chat_model_Biology"  # Hub repository to load

max_seq_length = 2048
load_in_4bit = True
dtype = None  # Auto-detect precision

# Load the model and tokenizer named by `model_name`, then switch the model
# into Unsloth's inference mode.
demo_model_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**demo_model_options)

# Enable faster inference
FastLanguageModel.for_inference(model)

# Alpaca prompt format: two positional slots, in this order:
#   1. the text placed under the "### इनपुट:" header
#   2. the text placed under the "### प्रतिक्रिया:" header
alpaca_prompt = (
    "नीचे एक निर्देश है जो एक कार्य का वर्णन करता है, इसके साथ एक इनपुट है जो आगे संदर्भ प्रदान करता है। एक प्रतिक्रिया लिखें जो अनुरोध को उपयुक्त रूप से पूरा करती है।\n"
    "\n"
    "### इनपुट:\n"
    "{}\n"
    "\n"
    "### प्रतिक्रिया:\n"
    "{}"
)

# Define prediction function
def predict(instruction, input_text):
    """Generate a response from the fine-tuned model for the given text.

    The prompt template has only two slots (input and response), so the
    instruction and the input text are joined into the single input slot and
    the response slot is left empty for the model to fill in.

    Args:
        instruction: Text from the instruction box; may be empty.
        input_text: Text from the input/question box; may be empty.

    Returns:
        The text that follows the last "### प्रतिक्रिया:" header in the decoded
        output, stripped of surrounding whitespace. Returns an empty string
        when both arguments are blank.
    """
    # Merge the two text boxes, skipping whichever one is blank.
    parts = [part.strip() for part in (instruction, input_text) if part and part.strip()]
    if not parts:
        return ""
    question = "\n".join(parts)

    # Fill the input slot and leave the response slot blank for generation.
    prompt = alpaca_prompt.format(question, "")

    # Tokenize the full prompt (not just the raw user text), so the model sees
    # the same template it was trained on.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Generate response
    outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)

    # Decode response
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Extract the response after "### प्रतिक्रिया:"
    return response.split("### प्रतिक्रिया:")[-1].strip()

# Gradio Interface
def _two_line_box(label, placeholder):
    """Create a two-line text box with the given label and placeholder."""
    return gr.Textbox(label=label, placeholder=placeholder, lines=2)


with gr.Blocks() as demo:
    # Page heading and one-line description.
    gr.Markdown("# हिंदी लघु चैट मॉडल\n#### यह मॉडल जैविकी संबंधित प्रश्नों के उत्तर प्रदान करता है।")

    # Two side-by-side text boxes whose values are passed to predict().
    with gr.Row():
        instruction_input = _two_line_box("निर्देश", "यहां निर्देश लिखें...")
        user_input = _two_line_box("इनपुट", "यहां प्रश्न लिखें...")

    output = gr.Textbox(
        label="मॉडल का उत्तर",
        placeholder="उत्तर यहां प्रदर्शित होगा।",
        lines=4,
    )
    generate_button = gr.Button("उत्तर उत्पन्न करें")

    # Link components: clicking the button sends both boxes to predict() and
    # shows its return value in the output box.
    generate_button.click(
        predict,
        inputs=[instruction_input, user_input],
        outputs=output,
    )

# Launch Gradio app
demo.launch()


ModuleNotFoundError: No module named 'gradio'