In [None]:
# Standard-library helpers, one import per line.
import os
import shutil
import subprocess

from google.colab import drive

# Mount Google Drive; the training CSV is later read from under /content/drive.
drive.mount('/content/drive')

In [None]:
%%capture
# Upgrade PyTorch to 2.3.0 with CUDA 12.1
!pip install torch==2.3.0+cu121 torchvision==0.15.2+cu121 torchaudio==2.3.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html

# Install Xformers compatible with PyTorch 2.3.0 and CUDA 12.1
!pip install xformers -f https://download.pytorch.org/whl/cu121

# Install Unsloth with the colab-new setup
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Upgrade Triton
!pip install -U triton
!pip install bitsandbytes
!pip install trl
!pip install peft

In [None]:
# Import order kept as in the original cell: unsloth first, then torch.
from unsloth import FastLanguageModel
import torch

# Loading options. `max_seq_length` is reused when the trainer is configured.
max_seq_length = 2048  # Any value works; per the original note, RoPE scaling is handled internally.
dtype = None           # None = auto-detect. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization to cut memory use; may be set to False.

# Fetch the Gemma-2 2B checkpoint and its tokenizer with the options above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2-2b",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Attach LoRA adapters to the already-loaded model through Unsloth's
# get_peft_model; `model` is rebound to the adapter-wrapped result.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank; any value > 0 (8, 16, 32, 64, 128 are suggested)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is accepted, 0 is the optimized path
    bias="none",                           # any value is accepted, "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is switched off
    loftq_config=None,                     # no LoftQ configuration
)

In [None]:
import pandas as pd
from pprint import pprint

# Read the raw training rows from the mounted Drive and keep the `prompt`
# column as a plain Python list for the parsing step further down.
df = pd.read_csv('/content/drive/MyDrive/train_instructions.csv')
list_ds = df['prompt'].tolist()

# Show the row count and a short preview only. Printing the entire list floods
# the cell output, bloats the saved notebook and exposes the full training set
# to anyone the notebook is shared with.
PREVIEW_ROWS = 3
print(f"{len(list_ds)} prompts loaded; showing the first {min(PREVIEW_ROWS, len(list_ds))}:")
pprint(list_ds[:PREVIEW_ROWS])

In [None]:
# Prompt template used to render every training example.
# It has three positional `{}` slots which formatting_prompts_func fills, in
# order, with the example's instruction, input (product description) and
# output (response).
# NOTE(review): the wording contains typos ("a entity", "..", "pattern.you",
# "dont"). It is intentionally left untouched here because this text is the
# literal prompt the model is trained on; change it only together with the
# prompt used at inference time.
alpaca_prompt = """You are a entity value extraction assistant.. Your task is to extract specific entities from the product description using the provided regex pattern.you have to extract the entity value from the product description as per the entity name given. go through the description completely and find the appropriate entity value using the given regex pattern.
for example if the product description is "NAVITAS BARLEY GRASS POWDER HOW TOUSE 3. TAKEIBAGOF Pour about 30 ml of Thaw, add ice if you PRODUCT,POUR IT boiling water want and enjoy INTOA CUP" and you have to find the item volume, look into the regex pattern and extract the entity value accordingly. dont just assume the first number you encounter to be the entity value

### Instruction:
{}

### Product Description:
{}

### Response:
{}"""

# End-of-sequence marker appended to every rendered example; per the original
# note, without it generation would go on forever.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of examples into training strings.

    Reads the "instruction", "input" and "output" columns of ``examples``,
    fills ``alpaca_prompt`` with each row in that order and appends
    ``EOS_TOKEN``.

    Returns:
        A dict with a single "text" key holding the list of rendered strings.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, description, response) + EOS_TOKEN
            for instruction, description, response in rows
        ],
    }
import datasets
import pandas as pd
import ast
# Parse every prompt string as a Python literal; per the original comments the
# entries are expected to be dictionaries.
list_ds_converted = list(map(ast.literal_eval, list_ds))

# One row per parsed entry.
df = pd.DataFrame(list_ds_converted)

# Convert to a `datasets.Dataset` and add the rendered "text" column in batches.
dataset = datasets.Dataset.from_pandas(df).map(formatting_prompts_func, batched=True)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Exactly one of bf16/fp16 is enabled, chosen by the bfloat16 support probe.
use_bf16 = is_bfloat16_supported()

# Optimizer, schedule and logging settings for a single epoch of fine-tuning.
training_args = TrainingArguments(
    per_device_train_batch_size=3,
    gradient_accumulation_steps=4,
    warmup_steps=30,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning over the rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # per the original note, packing can make training 5x faster for short sequences
    args=training_args,
)

In [None]:
#@title Show current memory stats
# Baseline snapshot taken before training; `start_gpu_memory` and `max_memory`
# are read again by the post-training statistics cell.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning. The returned object is kept because the statistics cell
# below reads its `metrics` dict (the 'train_runtime' entry).
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
def _percent_of_max(gigabytes):
    """Return ``gigabytes`` as a percentage of ``max_memory``, rounded to 3 places."""
    return round(gigabytes / max_memory * 100, 3)


# Peak reserved memory after training, and its increase over the baseline
# recorded in `start_gpu_memory` before training started.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = _percent_of_max(used_memory)
lora_percentage = _percent_of_max(used_memory_for_lora)

# Runtime as reported in the trainer's metrics.
train_seconds = trainer_stats.metrics['train_runtime']

print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
# Upload the fine-tuned model and its tokenizer to the Hugging Face Hub.
# The access token must never be written into the notebook: it is read from
# the HF_TOKEN environment variable, and only if that is unset or empty is the
# user prompted for it (getpass keeps the value out of the cell output).
import os
from getpass import getpass

HF_REPO_ID = "Ariadaikalam/Entity_extraction"
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

model.push_to_hub(HF_REPO_ID, token=hf_token)
tokenizer.push_to_hub(HF_REPO_ID, token=hf_token)