In [1]:
!pip install --upgrade transformers peft bitsandbytes

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hDownloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:

from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig
import torch
import numpy as np
import pandas as pd

import bitsandbytes as bnb
import torch.nn as nn


In [3]:
bnbConfig = BitsAndBytesConfig(
    load_in_4bit =True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
#some parameters or things that will be used 
class Config():
    model_id = "/kaggle/input/gemma-2/transformers/gemma-2-2b-it/2"
    tensorflow_batch=32
    token_limit = 256
    num_data_limit = 1000
    lora_name = "instructional"
    lora_rank = 16
    lora_alpha=32
    lr_value = 1e-4 #best learning_rate
    train_epoch = 7
    max_steps = 100
    hf_data_path = "merve/turkish_instructions"#input and output about daily topics
    weight_decay=0.01,
    epsilon=1e-6
    adapter_name="instructional"
    device_map="auto"
    epoch=15
    
    adam_beta1 = 0.9
    adam_beta2 = 0.995
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    
    

In [6]:
if device.type =="cuda":
    model = AutoModelForCausalLM.from_pretrained(
        Config.model_id,
        quantization_config=bnbConfig,
        device_map= "auto",
        trust_remote_code=True, 
        )
    print("& cuda")
else:
    model = AutoModelForCausalLM.from_pretrained(
        Config.model_id,
        
        device_map="auto",
        trust_remote_code=True,
        )
    print("not cuda")

tokenizer = AutoTokenizer.from_pretrained(Config.model_id) 
tokenizer.pad_token = tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

& cuda


In [7]:

def text_generator(prompt):
    input_text = f"Gemma !{prompt} "
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda:0")

    outputs = model.generate(**input_ids,top_k = 40,top_p=1.0)
    print("model output : \n")
    print(tokenizer.decode(outputs[0]))

In [8]:
text_generator("Beni seviyor musun? .")

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


model output : 

<bos>Beni seviyor musun? .

I'm not sure what you mean by "seviyor musun?"  Could you


In [9]:
from datasets import load_dataset

# Türkçe bir örnek: "turkic-xwmt,turkish_instructions" veri kümesi
dataset = load_dataset(Config.hf_data_path)

train_data = dataset["train"]
train_df = pd.DataFrame(train_data)
train_df

def merge_instruct_input(rows):
    if(rows[" giriş"]!=None):
        rows["talimat"]=rows["talimat"]+"\n"+rows[" giriş"]
    return rows

merged_train=train_df.apply(merge_instruct_input,axis=1)[:500].drop(" giriş",axis=1)

README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

instructions.csv:   0%|          | 0.00/21.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51563 [00:00<?, ? examples/s]

In [10]:
merged_train.drop("Unnamed: 0",inplace=True,axis=1)

In [11]:
merged_train["full Context"]=f"Gemma ! \n {merged_train['talimat']} \n {merged_train[' çıktı']}"
merged_train["label"]=merged_train[" çıktı"]
merged_train

Unnamed: 0,talimat,çıktı,full Context,label
0,Aşağıdaki bağlamda orijinal bir şey söyleyin:...,Ekibe katkıda bulunmaya başlamak ve bu alanda...,Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,Ekibe katkıda bulunmaya başlamak ve bu alanda...
1,Aşağıdaki makale için bir başlık önerin: Bu m...,"""Dijital İletişimin Gücü: İnternet İş Ortamın...",Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,"""Dijital İletişimin Gücü: İnternet İş Ortamın..."
2,Şu konuyla ilgili 5 anahtar kelimeden oluşan ...,1. Karbon emisyonları 2. Yenilenebilir enerji...,Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,1. Karbon emisyonları 2. Yenilenebilir enerji...
3,Bu cümleyi pasiften aktife değiştirin: Yeni p...,Belediye başkanı yeni politikayı açıkladı.,Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,Belediye başkanı yeni politikayı açıkladı.
4,Aşağıdaki fenomeni tanımlayan üç bilimsel teri...,"Yerçekimi kuvveti, Newton'un evrensel çekim y...",Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,"Yerçekimi kuvveti, Newton'un evrensel çekim y..."
...,...,...,...,...
495,Verilen aralıktaki tüm asal sayıları yazdıran...,# Verilen aralıktaki tüm asal sayıları yazdır...,Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,# Verilen aralıktaki tüm asal sayıları yazdır...
496,Aşağıdaki üç şehri ziyaret eden bir tatil güz...,1. Gün: - Hindistan'ın Delhi kentine varış ve ...,Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,1. Gün: - Hindistan'ın Delhi kentine varış ve ...
497,Araba kazalarının sayısını azaltmanın 5 yolun...,1. Daha iyi aydınlatma veya daha iyi tabelalar...,Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,1. Daha iyi aydınlatma veya daha iyi tabelalar...
498,Bir bilgisayar sistemi için bileşenlerin bir ...,"1. Anakart: Sistemin belleği, işlemcileri ve d...",Gemma ! \n 0 Aşağıdaki bağlamda orijinal...,"1. Anakart: Sistemin belleği, işlemcileri ve d..."


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train,valid=train_test_split(merged_train,test_size=0.2)

In [14]:
from datasets import load_dataset,DatasetDict,Dataset
train_dataset = Dataset.from_pandas(train)
valid_dataset = Dataset.from_pandas(valid)

# DatasetDict oluşturma
datasetDict = DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset
})

datasetDict

DatasetDict({
    train: Dataset({
        features: ['talimat', ' çıktı', 'full Context', 'label', '__index_level_0__'],
        num_rows: 400
    })
    valid: Dataset({
        features: ['talimat', ' çıktı', 'full Context', 'label', '__index_level_0__'],
        num_rows: 100
    })
})

In [20]:
DatasetDict["__index_level_0__"][0]

TypeError: There are no type variables left in datasets.dataset_dict.DatasetDict['__index_level_0__']

In [21]:
def tokenize_function(examples):
    """
    Tokenizes text input for model training.

    Args:
        examples (Dict): A dictionary containing text inputs.

    Returns:
        Dict: Tokenized text input with truncation applied.
    """
    return tokenizer(examples["full Context"], max_length = Config.token_limit, truncation = True,padding=True).to("cuda")

In [22]:
datasetDict = datasetDict.map(tokenize_function, batched=True) # generate token value

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [23]:
datasetDict

DatasetDict({
    train: Dataset({
        features: ['talimat', ' çıktı', 'full Context', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
    valid: Dataset({
        features: ['talimat', ' çıktı', 'full Context', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [None]:
"""from typing import Dict
text=[]


# Iterate over the rows in the filtered DataFrame
for i, row in merged_train.iterrows():
    prompt = row['talimat']  # Question
    output = row[' çıktı']  # Answer

    # Construct the conversation
    conversation = (
        f"<start_of_turn>user\n{prompt}<end_of_turn>\n"
        f"<start_of_turn>model\n{output}<end_of_turn>"
    )

    # Tokenize and check the length
    tokenized=tokenizer(conversation)
    
    # Skip data if the token length is longer than our limit
    if len(tokenized["input_ids"]) < Config.token_limit:
        text.append(tokenized)
        if len(text) >= Config.num_data_limit:
            break

print(f"Number of training samples: {len(text)}")
print("\nSample conversation:")
print(text)"""

In [None]:
print("Available modules in the Gemma model:")
for name, module in model.named_modules():
    print(name)

# Adapter Tuning

### i prefer adapter tuning since it is fast and soft.After ı applied adapter tuning.i apply lora fine tune

In [None]:
#freezing the original weight of model
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [24]:
from peft import LoraConfig, get_peft_model, PeftModel
lora_config = LoraConfig(
    r = 16, # Rank
    lora_alpha = 32, # Adjusting Coefficient
    
    target_modules=[
        "q_proj","v_proj",
    ],
    bias = "none",
    task_type = "CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [25]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 3194880 || all params: 1605398784 || trainable%: 0.19900849756716896


In [None]:
model.to("cuda")

In [26]:
import transformers

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,  # GPU için artırabilirsiniz
    per_device_eval_batch_size=16,  # GPU için artırabilirsiniz
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,  # Yarı hassasiyet (GPU için)
    dataloader_num_workers=4,  # Daha hızlı veri yükleme
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasetDict["train"],
    eval_dataset=datasetDict["valid"],
    tokenizer=tokenizer,
)

history=trainer.train()
import matplotlib.pyplot as plt
plt.plot(history.history['loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

In [None]:
model.save_pretrained("/kaggle/working/lora_model_altan_altaniye.h5")