In [None]:
# Install the Triton library
!pip install triton

# Uninstall the xformers library if it's already installed
!pip install xformers

# Install the unsloth library from the GitHub repository with Colab-specific dependencies
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

Collecting triton
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.0.0
Collecting xformers
  Downloading xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Downloading xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl (20.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers
Successfully installed xformers-0.0.27.post2
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/u

In [None]:
!pip install xformers

Collecting torch==2.2.2 (from xformers)
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2->xformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.2->xformers)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.2->xformers)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.2->xformers)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.2->xformers)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch=

In [None]:
from unsloth import FastLanguageModel
import torch

# Define maximum sequence length for the model
max_seq_length = 2048

# Data type for model parameters (set to None to use default)
dtype = None

# Set to True for loading 4-bit quantized models to avoid out-of-memory issues and speed up downloading
load_in_4bit = True

# List of supported 4-bit pre-quantized models for efficient downloading and reduced memory usage
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Load the pre-trained model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct",  # Specify the model name to load
    max_seq_length = max_seq_length,  # Set maximum sequence length
    dtype = dtype,  # Set data type for model parameters
    load_in_4bit = load_in_4bit,  # Load model with 4-bit quantization
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [None]:
# Configure the model with PEFT (Parameter Efficient Fine-Tuning) settings
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # Rank for the low-rank adaptation
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],  # Modules to apply PEFT to
    lora_alpha = 16,  # Scaling factor for LoRA (Low-Rank Adaptation) updates
    lora_dropout = 0,  # Dropout rate for LoRA; set to 0 for no dropout
    bias = "none",  # Bias handling; 'none' is optimized for performance
    use_gradient_checkpointing = "unsloth",  # Gradient checkpointing mode; "unsloth" reduces VRAM usage and allows larger batch sizes
    random_state = 3407,  # Seed for random number generation
    use_rslora = False,  # Option to use rank stabilized LoRA; False disables it
    loftq_config = None,  # Configuration for LoftQ, if used; None means it's not applied
)


below cell is for storing larger training data set.


In [None]:
from datasets import Dataset

# Create a small custom dataset with 2-3 examples for pattern recognation
data = {
    "instruction": [
        "understand the pattern and then genrate  6 similar data",
        "understand the pattern and then genrate  2 similar data",
        "understand the pattern and then genrate  3 similar data",
        "understand the pattern and then genrate  1 similar data",
        "understand the pattern and then genrate 3  similar data",
         "understand the pattern and then genrate  6 similar data",
        "understand the pattern and then genrate  5 similar data",
        "understand the pattern and then genrate  3 similar data",
        "understand the pattern and then genrate  5 similar data",
        "understand the pattern and then genrate 4  similar data"




    ],
    "input": [
         """{a4b-7c_9d-2e+3f,x2y-5z_1a-6b+7c,8p-q1_3r-4s+9t,9w-6x_2y-5z+1a,3k-j7_8l-4m+2n,5r-t1_6u-3v+9w}""",
        "{n4p+7q-1r_2s+6t,m8r-3l+4q_9p-2u,n5t+8v-1w_7x+3y,j1k-2m_4n+6o_9p}",
        "{d2g_7k-3m+8r,4n-9q_1p+5t,j8r_3s-7l+2k,6p-4m+9t_1q,7k_5r-2n+8t,3m-6p+4q+9r,c5n_2q-8r+7k}",
        "{u7b-2d_3c+9k,t5p-1r_6m+4q,8w_9s-2k+3r,4n+6m_1q-5t,j3r_7p-8k+2d}",
        "{v2q_8r-1k+3p,j9t-4m_6l+2n,5s+7x_1r-3p,8k_2n-4d+6t}",
        "{John, Sarah, Michael, Olivia, Ethan, Sophia}",
        "{emily.jones@company.com, david.smith@gmail.com, sophia.martinez@service.net, noah.brown@business.org, olivia.williams@mail.com}",
        "{a7hto+7hj+7h_tto, m8wyu-4pl+6k_wxy, f3nrp+9vtr-1z_vlk, q5pk+2nr-7s_pqr, j9xay+5ty-3n_dyz}",
        "{New York, London, Tokyo, Los Angeles, Chicago}",
        "{Liam Davis, Isabella Miller, Daniel Moore, Amelia Clark, Harper Lee}"

    ],
    "output": [
        "data: {b1x-3y_7z+2r,t4m-8n_5k+9p,6r-q2_1s-7t+4x,p3v-9w_2y+6k,7x-4n_1r+8m+5q,a2b-9c_6k+3t}",
        "data : {m9q+1p-3r_4s+7x,j6r-2n_8k+5t}",
        "data: {f2n-6r_8q+4p,1m-5k+9t_3r,7x_2p-4d+8n}",
        "data: {x5r-2p_9k+4t}",
        "data: {4q_7p-2k+9r,1m-6t+3n_8v,5x_9r-4p+2k}",
        "data: {James, Emma, Lucas, Mia, Benjamin}",
        "data: {alex.walker@company.org, sarah.jones@mail.com, emma.davis@business.net, henry.wilson@service.com, sophie.green@enterprise.io}",
        "data: {b8zkr+4lmn-5p_qrs, t3vwl-8pxr+2y_mnp, w6plq+9rst-1k_xyz}",
        "data: {San Francisco, Sydney, Toronto, Berlin, Mumbai}",
        "data: {Grace Mitchell, Henry Harris, Victoria Scott, Oliver Perez}"
    ]
}
# Convert the dictionary to a Hugging Face Dataset
dataset = Dataset.from_dict(data)

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request.

### Instruction:
{}

### Input:
{}



### Output:
{}"""



# Retrieve the end-of-sequence token from the tokenizer
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN to terminate the sequence

def formatting_prompts_func(examples):
    """
    Function to format examples into a consistent prompt structure.

    Args:
        examples (dict): Dictionary containing 'instruction', 'input', and 'output' keys.

    Returns:
        dict: Dictionary with formatted 'text' prompts.
    """
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Format the text using the Alpaca prompt and add EOS_TOKEN
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply formatting to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)

# Display the formatted dataset
print(dataset["text"])


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

['Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request.\n\n### Instruction:\nunderstand the pattern and then genrate  6 similar data\n\n### Input:\n{a4b-7c_9d-2e+3f,x2y-5z_1a-6b+7c,8p-q1_3r-4s+9t,9w-6x_2y-5z+1a,3k-j7_8l-4m+2n,5r-t1_6u-3v+9w}\n\n\n\n### Output:\ndata: {b1x-3y_7z+2r,t4m-8n_5k+9p,6r-q2_1s-7t+4x,p3v-9w_2y+6k,7x-4n_1r+8m+5q,a2b-9c_6k+3t}<|endoftext|>', 'Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request.\n\n### Instruction:\nunderstand the pattern and then genrate  2 similar data\n\n### Input:\n{n4p+7q-1r_2s+6t,m8r-3l+4q_9p-2u,n5t+8v-1w_7x+3y,j1k-2m_4n+6o_9p}\n\n\n\n### Output:\ndata : {m9q+1p-3r_4s+7x,j6r-2n_8k+5t}<|endoftext|>', 'Below is an instruction that describes a task, paired with an input that provi

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

['Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request.In the input section i have provided a json data it is data of table with 3 columns converted from table to json inside json there are n elements represent n rows and inside each element i have given values of columns as a:1 means for nth row value of column a is 1 and columns values inside a row is shaprated by commas\n\n### Instruction:\nunderstand the pattern of columns and relationship between them and then explain it.\n\n### Input:\n{"a":1,"b":1,"c":"john"},{"a":1,"b":24,"c":"smith"},{"a":1,"b":98,"c":"joe"},{"a":2,"b":57,"c":"bill"},{"a":2,"b":35,"c":"elon"}\n\n\n\n### Output:\nThe table organizes data with three columns: a, b, and c, where a serves as a grouping identifier, b contains associated numerical values, and c holds first names. For each unique value in column a, there are typically 2

In [None]:
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 10
})

In [None]:
print(dataset['text'][0])

Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request.

### Instruction:
understand the pattern and then genrate  6 similar data

### Input:
{a4b-7c_9d-2e+3f,x2y-5z_1a-6b+7c,8p-q1_3r-4s+9t,9w-6x_2y-5z+1a,3k-j7_8l-4m+2n,5r-t1_6u-3v+9w}



### Output:
data: {b1x-3y_7z+2r,t4m-8n_5k+9p,6r-q2_1s-7t+4x,p3v-9w_2y+6k,7x-4n_1r+8m+5q,a2b-9c_6k+3t}<|endoftext|>


In [None]:
print(dataset['text'][4])

Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request.

### Instruction:
understand the pattern and then genrate 3  similar data

### Input:
{v2q_8r-1k+3p,j9t-4m_6l+2n,5s+7x_1r-3p,8k_2n-4d+6t}



### Output:
data: {4q_7p-2k+9r,1m-6t+3n_8v,5x_9r-4p+2k}<|endoftext|>


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Show current GPU memory statistics
import torch

# Retrieve properties of the GPU device (device index 0)
gpu_stats = torch.cuda.get_device_properties(0)

# Calculate the maximum reserved GPU memory in GB (converted from bytes)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)

# Calculate the total GPU memory in GB (converted from bytes)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

# Print GPU information and memory statistics
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = Tesla T4. Max memory = 14.748 GB.
2.283 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10 | Num Epochs = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss
1,2.6365
2,2.3953
3,2.6521
4,2.4586
5,2.3927
6,2.2354
7,1.8406
8,1.6435
9,1.561
10,1.4245


In [None]:
trainer_stats.metrics

{'train_runtime': 116.7443,
 'train_samples_per_second': 2.056,
 'train_steps_per_second': 0.257,
 'total_flos': 1030816694845440.0,
 'train_loss': 1.2925478716691334,
 'epoch': 24.0}

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "understand the pattern and then genrate 6 similar data points", # instruction
        "{ankit, amit, raj}}", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 700, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request.

### Instruction:
understand the pattern and then genrate 6 similar data points

### Input:
{ankit, amit, raj}}



### Output:
data: {suresh, vikas, nikhil, priya, prabha, anita}<|endoftext|>


In [None]:
model.save_pretrained("drive/MyDrive/Pattern_Model") # Local saving
tokenizer.save_pretrained("drive/MyDrive/Pattern_Model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('drive/MyDrive/Pattern_Model/tokenizer_config.json',
 'drive/MyDrive/Pattern_Model/special_tokens_map.json',
 'drive/MyDrive/Pattern_Model/tokenizer.model',
 'drive/MyDrive/Pattern_Model/added_tokens.json',
 'drive/MyDrive/Pattern_Model/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import Dataset

# Create a small custom dataset with 2-3 examples for pattern recognition
data = {
    "instruction": [
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 8 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 10 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 4 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 4 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 4 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 6 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 5 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 5 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 4 rows.",
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 5 rows."
    ],
    "input": [
        """{("model_type":"A","model_id":"A1","device_id":"device-001","status":"active"),("model_type":"A","model_id":"A2","device_id":"device-002","status":"inactive"),("model_type":"B","model_id":"B1","device_id":"device-003","status":"active"),("model_type":"B","model_id":"B2","device_id":"device-004","status":"inactive"),("model_type":"C","model_id":"C1","device_id":"device-005","status":"active"),("model_type":"C","model_id":"C2","device_id":"device-006","status":"inactive"),("model_type":"D","model_id":"D1","device_id":"device-007","status":"active"),("model_type":"D","model_id":"D2","device_id":"device-008","status":"inactive")}""",
        """{("category":"X","item_id":"X1","location":"loc-001","price":100,"available":true),("category":"X","item_id":"X2","location":"loc-002","price":150,"available":false),("category":"Y","item_id":"Y1","location":"loc-003","price":200,"available":true),("category":"Y","item_id":"Y2","location":"loc-004","price":250,"available":false),("category":"Z","item_id":"Z1","location":"loc-005","price":300,"available":true),("category":"Z","item_id":"Z2","location":"loc-006","price":350,"available":false),("category":"A","item_id":"A1","location":"loc-007","price":400,"available":true),("category":"A","item_id":"A2","location":"loc-008","price":450,"available":false)}""",
        """{("user_id":"U1","name":"Alice","email":"alice@example.com","age":30),("user_id":"U2","name":"Bob","email":"bob@example.com","age":25),("user_id":"U3","name":"Charlie","email":"charlie@example.com","age":35),("user_id":"U4","name":"Diana","email":"diana@example.com","age":40)}""",
        """{("product_id":"P1","name":"Product A","quantity":100,"price":29.99),("product_id":"P2","name":"Product B","quantity":200,"price":39.99),("product_id":"P3","name":"Product C","quantity":150,"price":19.99)}""",
        """{("code":"A001","description":"Desc A","discount":10),("code":"A002","description":"Desc B","discount":15),("code":"A003","description":"Desc C","discount":20),("code":"A004","description":"Desc D","discount":25)}""",
        """{("category":"M","item_id":"M1","stock":50,"price":12.99,"token_id":"qwe1_asd2_tto"),("category":"M","item_id":"M2","stock":60,"price":14.99,"token_id":"rty2_fgh3_tto"),("category":"N","item_id":"N1","stock":70,"price":16.99,"token_id":"uio3_jkl4_tto"),("category":"N","item_id":"N2","stock":80,"price":18.99,"token_id":"vbn4_mno5_tto"),("category":"O","item_id":"O1","stock":90,"price":20.99,"token_id":"zxc5_pqr6_tto"),("category":"O","item_id":"O2","stock":100,"price":22.99,"token_id":"asd6_stu7_tto")}""",
        """{("dept":"HR","emp_id":"HR001","salary":50000,"location":"office-01"),("dept":"HR","emp_id":"HR002","salary":55000,"location":"office-02"),("dept":"IT","emp_id":"IT001","salary":60000,"location":"office-03"),("dept":"IT","emp_id":"IT002","salary":65000,"location":"office-04"),("dept":"Finance","emp_id":"FN001","salary":70000,"location":"office-05")}""",
        """{("genre":"Fiction","book_id":"F001","title":"Book A","author":"Author X"),("genre":"Fiction","book_id":"F002","title":"Book B","author":"Author Y"),("genre":"Non-Fiction","book_id":"NF001","title":"Book C","author":"Author Z"),("genre":"Non-Fiction","book_id":"NF002","title":"Book D","author":"Author W"),("genre":"Science","book_id":"S001","title":"Book E","author":"Author V"),("genre":"Science","book_id":"S002","title":"Book F","author":"Author U")}""",
        """{("team":"Dev","member_id":"D001","role":"Lead","status":"Active"),("team":"Dev","member_id":"D002","role":"Member","status":"Inactive"),("team":"Design","member_id":"DS001","role":"Lead","status":"Active"),("team":"Design","member_id":"DS002","role":"Member","status":"Inactive")}""",
        """{("region":"North","store_id":"N001","sales":150000,"manager":"Alice"),("region":"North","store_id":"N002","sales":200000,"manager":"Bob"),("region":"South","store_id":"S001","sales":250000,"manager":"Charlie"),("region":"South","store_id":"S002","sales":300000,"manager":"Diana"),("region":"East","store_id":"E001","sales":350000,"manager":"Eve")}"""
    ],
    "output": [
        """{("model_type":"A","model_id":"A1","device_id":"device-101","status":"active"),("model_type":"A","model_id":"A2","device_id":"device-102","status":"inactive"),("model_type":"B","model_id":"B1","device_id":"device-103","status":"active"),("model_type":"B","model_id":"B2","device_id":"device-104","status":"inactive"),("model_type":"C","model_id":"C1","device_id":"device-105","status":"active"),("model_type":"C","model_id":"C2","device_id":"device-106","status":"inactive"),("model_type":"D","model_id":"D1","device_id":"device-107","status":"active"),("model_type":"D","model_id":"D2","device_id":"device-108","status":"inactive")}""",
        """{("category":"X","item_id":"X1","location":"loc-009","price":120,"available":true,"token_id":"ytj6_plm4_tto"),("category":"X","item_id":"X2","location":"loc-010","price":170,"available":false,"token_id":"hgf7_ijk8_tto"),("category":"Y","item_id":"Y1","location":"loc-011","price":220,"available":true,"token_id":"bvc8_wxy5_tto"),("category":"Y","item_id":"Y2","location":"loc-012","price":270,"available":false,"token_id":"nmb9_vla2_tto"),("category":"Z","item_id":"Z1","location":"loc-013","price":320,"available":true,"token_id":"pqr0_lmn3_tto"),("category":"Z","item_id":"Z2","location":"loc-014","price":370,"available":false,"token_id":"qwe1_ytr6_tto"),("category":"A","item_id":"A1","location":"loc-015","price":420,"available":true,"token_id":"rty2_hjv7_tto"),("category":"A","item_id":"A2","location":"loc-016","price":470,"available":false,"token_id":"tyu3_opq8_tto")}""",
        """{("user_id":"U5","name":"Emily","email":"emily@example.com","age":29,"token_id":"klo8_nop1_tto"),("user_id":"U6","name":"Frank","email":"frank@example.com","age":32,"token_id":"mno9_pqr2_tto"),("user_id":"U7","name":"Grace","email":"grace@example.com","age":27,"token_id":"pqr0_stu3_tto"),("user_id":"U8","name":"Hannah","email":"hannah@example.com","age":31,"token_id":"stu1_vwx4_tto")}""",
        """{("product_id":"P4","name":"Product D","quantity":120,"price":32.99,"token_id":"btr5_lpm6_tto"),("product_id":"P5","name":"Product E","quantity":130,"price":35.99,"token_id":"nkj7_wqp8_tto"),("product_id":"P6","name":"Product F","quantity":140,"price":38.99,"token_id":"xop9_yzr0_tto"),("product_id":"P7","name":"Product G","quantity":150,"price":41.99,"token_id":"zsv1_tla2_tto")}""",
        """{("code":"B001","description":"Desc E","discount":30,"token_id":"wty2_frt6_tto"),("code":"B002","description":"Desc F","discount":35,"token_id":"rty3_ghu7_tto"),("code":"B003","description":"Desc G","discount":40,"token_id":"iop4_yhu8_tto"),("code":"B004","description":"Desc H","discount":45,"token_id":"lmn5_zxc9_tto")}""",
        """{("category":"P","item_id":"P1","stock":110,"price":25.99,"token_id":"mnp1_qwe2_tto"),("category":"P","item_id":"P2","stock":120,"price":27.99,"token_id":"qwe3_asd4_tto"),("category":"Q","item_id":"Q1","stock":130,"price":29.99,"token_id":"rty4_dfg5_tto"),("category":"Q","item_id":"Q2","stock":140,"price":31.99,"token_id":"uio5_hjk6_tto"),("category":"R","item_id":"R1","stock":150,"price":33.99,"token_id":"vbn6_lop7_tto"),("category":"R","item_id":"R2","stock":160,"price":35.99,"token_id":"cde7_mn8_tto")}""",
        """{("dept":"HR","emp_id":"HR003","salary":52000,"location":"office-06","token_id":"bqz4_ewr5_tto"),("dept":"HR","emp_id":"HR004","salary":57000,"location":"office-07","token_id":"cde5_rty6_tto"),("dept":"IT","emp_id":"IT003","salary":62000,"location":"office-08","token_id":"fgh6_asd7_tto"),("dept":"IT","emp_id":"IT004","salary":67000,"location":"office-09","token_id":"ijk7_vbn8_tto"),("dept":"Finance","emp_id":"FN002","salary":72000,"location":"office-10","token_id":"lmn8_yza9_tto")}""",
        """{("genre":"Historical","book_id":"H001","title":"Book G","author":"Author R","token_id":"aop3_mkl4_tto"),("genre":"Historical","book_id":"H002","title":"Book H","author":"Author S","token_id":"bpr4_nmo5_tto"),("genre":"Fantasy","book_id":"F001","title":"Book I","author":"Author T","token_id":"cqs5_opq6_tto"),("genre":"Fantasy","book_id":"F002","title":"Book J","author":"Author U","token_id":"dtr6_pqr7_tto"),("genre":"Biography","book_id":"B001","title":"Book K","author":"Author V","token_id":"esu7_qrs8_tto")}""",
        """{("team":"Marketing","member_id":"M001","role":"Lead","status":"Active","token_id":"iop6_rty7_tto"),("team":"Marketing","member_id":"M002","role":"Member","status":"Inactive","token_id":"jkl7_opq8_tto"),("team":"Sales","member_id":"S001","role":"Lead","status":"Active","token_id":"mno8_pqr9_tto"),("team":"Sales","member_id":"S002","role":"Member","status":"Inactive","token_id":"nop9_qrs0_tto")}""",
        """{("region":"West","store_id":"W001","sales":160000,"manager":"Frank","token_id":"ert5_uyt6_tto"),("region":"West","store_id":"W002","sales":210000,"manager":"Grace","token_id":"tyu6_vbn7_tto"),("region":"Central","store_id":"C001","sales":260000,"manager":"Helen","token_id":"zxc7_opq8_tto"),("region":"Central","store_id":"C002","sales":310000,"manager":"Ivy","token_id":"asd8_rty9_tto"),("region":"South","store_id":"S003","sales":360000,"manager":"Jack","token_id":"fgh9_yza0_tto")}"""
    ]
}

# Convert the dictionary to a Hugging Face Dataset
dataset = Dataset.from_dict(data)

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request. In the Input section a table is given in form of json format. ( (col1: 1,col2: 2), (col1: 3, col2: 4)) here (col1: 1,col2: 2) is row 1 and (col1: 3, col2: 4)) is row 2 in row 1 col 1 has value 1 and col 2 has value 2.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

# Retrieve the end-of-sequence token from the tokenizer
# Make sure to define `tokenizer` before this line in your code
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN to terminate the sequence

def formatting_prompts_func(examples):
    """
    Function to format examples into a consistent prompt structure.

    Args:
        examples (dict): Dictionary containing 'instruction', 'input', and 'output' keys.

    Returns:
        dict: Dictionary with formatted 'text' prompts.
    """
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Format the text using the Alpaca prompt and add EOS_TOKEN
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply formatting to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)

# Display the formatted dataset
print(dataset["text"])


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

['Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request. In the Input section a table is given in form of json format. ( (col1: 1,col2: 2), (col1: 3, col2: 4)) here (col1: 1,col2: 2) is row 1 and (col1: 3, col2: 4)) is row 2 in row 1 col 1 has value 1 and col 2 has value 2.\n\n### Instruction:\nunderstand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 8 rows.\n\n### Input:\n{("model_type":"A","model_id":"A1","device_id":"device-001","status":"active"),("model_type":"A","model_id":"A2","device_id":"device-002","status":"inactive"),("model_type":"B","model_id":"B1","device_id":"device-003","status":"active"),("model_type":"B","model_id":"B2","device_id":"device-004","status":"inactive"),("model_type":"C","model_id":"C1","device_id":"device-005","status":"active"),("model_type":"C","model_id

In [None]:
print(dataset['text'][4])

Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request. In the Input section a table is given in form of json format. ( (col1: 1,col2: 2), (col1: 3, col2: 4)) here (col1: 1,col2: 2) is row 1 and (col1: 3, col2: 4)) is row 2 in row 1 col 1 has value 1 and col 2 has value 2.

### Instruction:
understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 4 rows.

### Input:
{("code":"A001","description":"Desc A","discount":10),("code":"A002","description":"Desc B","discount":15),("code":"A003","description":"Desc C","discount":20),("code":"A004","description":"Desc D","discount":25)}

### Output:
{("code":"B001","description":"Desc E","discount":30,"token_id":"wty2_frt6_tto"),("code":"B002","description":"Desc F","discount":35,"token_id":"rty3_ghu7_tto"),("code":"B003","description":"Desc G","dis

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Show current GPU memory statistics
import torch

# Retrieve properties of the GPU device (device index 0)
gpu_stats = torch.cuda.get_device_properties(0)

# Calculate the maximum reserved GPU memory in GB (converted from bytes)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)

# Calculate the total GPU memory in GB (converted from bytes)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

# Print GPU information and memory statistics
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = Tesla T4. Max memory = 14.748 GB.
2.871 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10 | Num Epochs = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 30
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss
1,1.2843
2,1.222
3,1.3295
4,1.0162
5,1.0479
6,1.0096
7,0.8209
8,0.7837
9,0.7535
10,0.6417


In [None]:
trainer_stats.metrics

{'train_runtime': 161.0233,
 'train_samples_per_second': 1.49,
 'train_steps_per_second': 0.186,
 'total_flos': 3345651302215680.0,
 'train_loss': 0.5535890857378641,
 'epoch': 24.0}

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 5 rows.", # instruction
        """{("category":"A","item_id":"A1","location":"loc-001","price":100,"available":true),("category":"A","item_id":"A2","location":"loc-002","price":150,"available":false")},{("category":"B","item_id":"B1","location":"loc-001","price":100,"available":true),("category":"B","item_id":"B2","location":"loc-002","price":150,"available":false")},{("category":"C","item_id":"C1","location":"loc-001","price":100,"available":true),("category":"B","item_id":"B3","location":"loc-002","price":150,"available":false")}""", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 700, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request. In the Input section a table is given in form of json format. ( (col1: 1,col2: 2), (col1: 3, col2: 4)) here (col1: 1,col2: 2) is row 1 and (col1: 3, col2: 4)) is row 2 in row 1 col 1 has value 1 and col 2 has value 2.

### Instruction:
understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 5 rows.

### Input:
{("category":"A","item_id":"A1","location":"loc-001","price":100,"available":true),("category":"A","item_id":"A2","location":"loc-002","price":150,"available":false")},{("category":"B","item_id":"B1","location":"loc-001","price":100,"available":true),("category":"B","item_id":"B2","location":"loc-002","price":150,"available":false")},{("category":"C","item_id":"C1","location":"loc-001","price":100,"available":true),("category"

In [None]:
model.save_pretrained("drive/MyDrive/final_project_pattern") # Local saving
tokenizer.save_pretrained("drive/MyDrive/final_project_pattern")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('drive/MyDrive/final_project_pattern/tokenizer_config.json',
 'drive/MyDrive/final_project_pattern/special_tokens_map.json',
 'drive/MyDrive/final_project_pattern/tokenizer.model',
 'drive/MyDrive/final_project_pattern/added_tokens.json',
 'drive/MyDrive/final_project_pattern/tokenizer.json')

In [None]:
from huggingface_hub import login
login("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")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request. In the Input section a table is given in form of json format. ( (col1: 1,col2: 2), (col1: 3, col2: 4)) here (col1: 1,col2: 2) is row 1 and (col1: 3, col2: 4)) is row 2 in row 1 col 1 has value 1 and col 2 has value 2.

### Instruction:
{}

### Input:
{}

### Output:
{}"""

In [None]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/final_project_pattern", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 5 rows.", # instruction
        """{("category":"A","item_id":"A1","location":"loc-001","price":100,"available":true),("category":"A","item_id":"A2","location":"loc-002","price":150,"available":false")},{("category":"B","item_id":"B1","location":"loc-001","price":100,"available":true),("category":"B","item_id":"B2","location":"loc-002","price":150,"available":false")},{("category":"C","item_id":"C1","location":"loc-001","price":100,"available":true),("category":"B","item_id":"B3","location":"loc-002","price":150,"available":false")}""", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 700, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request. In the Input section a table is given in form of json format. ( (col1: 1,col2: 2), (col1: 3, col2: 4)) here (col1: 1,col2: 2) is row 1 and (col1: 3, col2: 4)) is row 2 in row 1 col 1 has value 1 and col 2 has value 2.

### Instruction:
understand the pattern and functional dependencies in the table given in json format in Input and generate similar table with 5 ro

In [None]:
print(tokenizer.batch_decode(outputs)[0])

Below is an instruction that describes a task, paired with an input that provides data mentioned in instruction. Write a response and explanation that appropriately completes the request. In the Input section a table is given in form of json format. ( (col1: 1,col2: 2), (col1: 3, col2: 4)) here (col1: 1,col2: 2) is row 1 and (col1: 3, col2: 4)) is row 2 in row 1 col 1 has value 1 and col 2 has value 2.

### Instruction:
What is a famous tall tower in Paris?

### Input:


### Output:
row 1: col 1: tower_name col 2: Eiffel_Tower col 3: Paris col 4: France col 5: 324m col 6: 1887
row 2: col 1: tower_name col 2
