In [1]:
import torch

# Check PyTorch version
print("PyTorch version:", torch.__version__)

# Check CUDA version
print("CUDA version:", torch.version.cuda)

# Check if CUDA is available
print("CUDA is available:", torch.cuda.is_available())

PyTorch version: 2.5.1+cu121
CUDA version: 12.1
CUDA is available: True


In [2]:
pip install bitsandbytes



In [3]:
!pip install -q accelerate==0.21.0 peft==0.4.0 transformers==4.31.0 trl==0.4.7

In [4]:
!pip install bitsandbytes # requuires 0.44.1



In [5]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # Should return True if CUDA is available

2.5.1+cu121
True


In [4]:
!pip install tensorboard



In [6]:
import pandas as pd
from datasets import Dataset
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/neer_product_format_data.csv')
df.head(2)

X = df.drop(['formatted_text'], axis = 1)
y = df['formatted_text']

y

Unnamed: 0,formatted_text
0,<s>[INST] What is the NEER 4G Mobile DOL Start...
1,<s>[INST] What are the key features of the NEE...
2,<s>[INST] How much does the NEER 4G Mobile DOL...
3,<s>[INST] Does the NEER 4G Mobile DOL Starter ...
4,<s>[INST] How can I control my irrigation pump...
...,...
491,<s>[INST] How can I set up the Krishiverse app...
492,<s>[INST] Where can I find a video about Neer ...
493,<s>[INST] Where can I watch the Neer 4G connec...
494,<s>[INST] Where can I find information about t...


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)


y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [10]:
# Convert the DataFrame to Hugging Face Dataset
dataset1 = Dataset.from_pandas(y_train)
eval_dataset1 = Dataset.from_pandas(y_test)

# Inspect the Hugging Face dataset
print(dataset1)

Dataset({
    features: ['formatted_text', '__index_level_0__'],
    num_rows: 446
})


In [11]:
eval_dataset1

Dataset({
    features: ['formatted_text', '__index_level_0__'],
    num_rows: 50
})

In [12]:
# Hugging face login
from huggingface_hub import login

# Replace 'your_hf_token' with the token you copied from your Hugging Face account
login(token="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")

In [13]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
# dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "finetuned_llama"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.25

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "results-llama"

# Number of training epochs
num_train_epochs = 3

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 2

# Batch size per GPU for evaluation
per_device_eval_batch_size = 2

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 2

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [None]:
from transformers import EarlyStoppingCallback

# Create early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop after 3 evaluation steps without improvement
    early_stopping_threshold=0.01  # Minimum improvement to be considered a meaningful change
)

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)

In [None]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Set up 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16 (optional)
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1  # This config may vary depending on your setup

n = 3  # Reduce the number of transformer blocks to fine-tune

for name, param in model.named_parameters():
    if param.dtype in [torch.float16, torch.float32, torch.bfloat16, torch.float64]:
        layer_index = int(name.split('.')[2]) if "transformer.h." in name else None
        if layer_index is not None and layer_index >= model.config.num_hidden_layers - n:
            param.requires_grad = True
        else:
            param.requires_grad = False

# # Enable mixed precision and gradient checkpointing
model.gradient_checkpointing_enable()  # Try enabling this again


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Ensures compatibility with padding
tokenizer.padding_side = "right"  # Prevents issues with fp16 training

# Load LoRA configuration for QLoRA
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",  # Adaptation for causal language modeling
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,  # Enable mixed precision if supported
    bf16=bf16,  # Enable bf16 if supported
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",  # Logs data to Tensorboard for easy visualization

    # Save and evaluation strategy
    evaluation_strategy="steps",  # Also can be "epoch"
    save_strategy="steps",  # Save every few steps
    save_total_limit=3,  # Keep the last 3 checkpoints
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="eval_loss",  # Use validation loss as the metric for best model
    greater_is_better=False,  # Lower validation loss is better
)

# Set supervised fine-tuning trainer with early stopping
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset1,
    eval_dataset=eval_dataset1,  # Your evaluation dataset
    peft_config=peft_config,
    dataset_text_field="formatted_text",  # Field containing the text in your dataset
    max_seq_length=max_seq_length,  # Maximum sequence length for the inputs
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,  # Enable or disable sequence packing
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01)],  # Early stopping added here
    # compute_metrics=compute_metrics,  # compute
)

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]



Map:   0%|          | 0/446 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Train model
trainer.train()

  new_forward = torch.cuda.amp.autocast(dtype=torch.float16)(model_forward_func)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
25,3.1423,2.017045
50,1.588,1.199431
75,1.1878,1.030017
100,1.0489,0.959482
125,0.9978,0.908005
150,0.8946,0.888762
175,0.8461,0.874706
200,0.8075,0.84871
225,0.8129,0.840207


Step,Training Loss,Validation Loss
25,3.1423,2.017045
50,1.588,1.199431
75,1.1878,1.030017
100,1.0489,0.959482
125,0.9978,0.908005
150,0.8946,0.888762
175,0.8461,0.874706
200,0.8075,0.84871
225,0.8129,0.840207
250,0.7638,0.836294


TrainOutput(global_step=333, training_loss=1.0884088109563421, metrics={'train_runtime': 1092.3123, 'train_samples_per_second': 1.225, 'train_steps_per_second': 0.305, 'total_flos': 1851896489656320.0, 'train_loss': 1.0884088109563421, 'epoch': 2.99})

In [None]:
# saving the configurration.json into the output directory
# Save the model configuration
model.config.save_pretrained(output_dir)
# Explicitly save the model, tokenizer, and configuration
trainer.save_model(output_dir)  # Saves the model and configuration to `output_dir`
tokenizer.save_pretrained(output_dir)  # Saves the tokenizer

('results-llama/tokenizer_config.json',
 'results-llama/special_tokens_map.json',
 'results-llama/tokenizer.model',
 'results-llama/added_tokens.json',
 'results-llama/tokenizer.json')

In [14]:
!pip install ngrok



In [None]:
import ngrok

# Set the authtoken
ngrok.set_auth_token("2oW7UQWlwJGLMSNbsw6S5b4F0Jo_NMZ6J8vH5wjgFtRcgi9W")

# Start the tunnel
public_url = ngrok.connect(5000)
print(f"Public URL: {public_url}")

Public URL: <Task pending name='Task-1' coro=<wrap() running at ngrok_wrapper:6>>


In [15]:
pip install -i https://test.pypi.org/simple/ bitsandbytes

Looking in indexes: https://test.pypi.org/simple/


In [42]:
import threading
from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
from transformers import BitsAndBytesConfig
import torch

# Initialize Flask app
app = Flask(__name__)

# # Define the output directory for your model
output_dir = "/content/drive/MyDrive/LlamaResultsSaved"  # Replace with your actual output directory

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)

# Configure quantization settings for 4-bit model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False
)

# Load the base model with quantization
base_model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your base model if necessary
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    offload_folder="./offload"  # Ensure you have this folder created
)

# Load LoRA adapter
finetuned_model = PeftModel.from_pretrained(model, output_dir)

# Initialize pipeline for text generation
pipe = pipeline(
    task="text-generation",
    model=finetuned_model,
    tokenizer=tokenizer,
    max_length=500,
    device=0  # Ensure it uses GPU if available
)

# Flask route for text generation
@app.route('/generate', methods=['POST'])
def generate_response():
    """API endpoint for generating responses."""
    data = request.get_json()
    prompt = data.get('prompt', '')

    if not prompt:
        return jsonify({"error": "Prompt is required"}), 400

    formatted_prompt = f"[INST] {prompt} [/INST]"
    result = pipe(formatted_prompt)

    response_text = result[0]['generated_text'][len(prompt) + 15:]
    return jsonify({"response": response_text})

# Run the Flask app
def run_flask():
    app.run(port=5000, use_reloader=False, debug=True)

# Start Flask app in a separate thread
flask_thread = threading.Thread(target=run_flask)
flask_thread.start()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

 * Serving Flask app '__main__'
 * Debug mode: on


In [43]:
import ngrok

# Set the authtoken for Ngrok
ngrok.set_auth_token("2oW7UQWlwJGLMSNbsw6S5b4F0Jo_NMZ6J8vH5wjgFtRcgi9W")

# Start the Ngrok tunnel
public_url = ngrok.connect(5000)  # Expose the local Flask app running on port 5010
print(f"Public URL: {public_url}")

Public URL: <Task pending name='Task-6' coro=<wrap() running at ngrok_wrapper:6>>


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-5' coro=<wrap() done, defined at ngrok_wrapper:6> exception=ValueError('failed to start listener', 'Your account may not run more than 3 tunnels over a single ngrok agent session.\nThe tunnels already running on this session are:\ntn_2qh7pNnH9tSC6GY67qUggsOtUu6, tn_2qh82CMFY6xS2sl3L81FnJExi81, tn_2qh8HQYVOKk1Rjy6t8W6Cd6u8ng', 'ERR_NGROK_324')>
Traceback (most recent call last):
  File "ngrok_wrapper", line 7, in wrap
ValueError: ('failed to start listener', 'Your account may not run more than 3 tunnels over a single ngrok agent session.\nThe tunnels already running on this session are:\ntn_2qh7pNnH9tSC6GY67qUggsOtUu6, tn_2qh82CMFY6xS2sl3L81FnJExi81, tn_2qh8HQYVOKk1Rjy6t8W6Cd6u8ng', 'ERR_NGROK_324')


In [44]:
!lsof -i:5000  # Replace 5001 with the port number you want to stop

COMMAND  PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
python3 3255 root   86u  IPv4 370937      0t0  TCP localhost:5000 (LISTEN)


In [29]:
!kill -9 1228 # Replace <PID> with the actual process ID

/bin/bash: line 1: kill: (1228) - No such process


In [45]:
import requests

# Define the local URL where Flask app is running
url = "http://127.0.0.1:5000/generate"

# Example prompt
data = {
    "prompt": "How to grow rice crop ?"
}

# Send POST requestcurl -X POST http://127.0.0.1:5000/generate -H "Content-Type: application/json" -d '{"prompt": "What is Krishiverse ?"}'

response = requests.post(url, json=data)

# Print the response
if response.status_code == 200:
    print("query : ", data['prompt'])
    print("Response:", response.json()["response"])
else:
    print("Else Part")
    print("Error:", response.status_code, response.text)

query :  How to grow rice crop ?
Response:  Growing rice requires specific conditions, including water, sunlight, and soil. everybody can grow rice with the right tools and techniques. Here are some steps to help you grow rice:

1. Choose the right location: Rice needs a lot of water, so it's important to choose a location with good drainage. Avoid areas with standing water or where water tends to collect.

2. Prepare the soil: Rice needs well-draining soil that is rich in organic matter. Test your soil to determine its pH level and nutrient content. If necessary, add amendments to adjust the pH and improve soil fertility.

3. Plant the rice: Rice is typically planted in the spring when the weather is warm and the soil is dry. Plant the rice seeds 1-2 inches deep and 6-8 inches apart. Water the soil gently but thoroughly after planting.

4. Manage irrigation: Rice needs consistent moisture throughout its growth cycle. Monitor soil moisture levels and adjust irrigation accordingly. Avoi