In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# Each entry is split on ':' and the URL part is percent-decoded before download.
# The URL carries X-Goog-Date / X-Goog-Expires query parameters, i.e. it is a
# time-limited signed link; the download loop reports HTTP errors as "likely expired".
DATA_SOURCE_MAPPING = 'clean-pdf-df:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5538368%2F9166022%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241011%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241011T201141Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8aaf247266351d3425d400948d57a4301cee2f6d4e86ea37f196005a0caebb89553b892c09bd2e7e3ca3de73d7d8776843fa4c04aa8dd3ffe53f6e6877af4bd46895fd6bb42062e913cfe0783abe0ac40de82a4f1241dbcf13d1373bc8e6d5d7f8c73c485576c2270084d9a8b40d41be936c4f104b2661a81f6e9a56c140f2451e2e58075c1f798666289a59cb325b927c17824dcb51aa1a7d2185221568fc74ee4b42e0604091c62cd9d53470f75c914d1a9d2666f346123fc19b38b9a3afa293a125ceb0b204d3364e072ae11c22acf0de47716bf2bbcd872b58f794e38005ce17f112a9230a6cd690d8a6a10c07e604deeb9f79be499e8602186f0351452c'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it under
# KAGGLE_INPUT_PATH/<directory>. A failed source is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>"; split only on the first
    # ':' so an unexpected colon in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar is skipped
            # instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else None
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the temp file by name, so buffered bytes
            # must reach the file before the archive is read.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break the next tar data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -U bitsandbytes transformers peft accelerate trl datasets sentencepiece wandb
!pip install flash-attn --no-build-isolation

In [None]:
# Hugging Face model id used for both the model and the tokenizer.
MODEL_NAME = "CohereForAI/aya-23-8b"

# You may want to change the following parameters depending on your GPU configuration.

# Settings for a free T4 instance
# Load the model through a 4-bit (NF4, double-quant) BitsAndBytesConfig when True.
QUANTIZE_4BIT = True
# Passed to TrainingArguments(gradient_checkpointing=...).
USE_GRAD_CHECKPOINTING = True
# Passed to TrainingArguments(per_device_train_batch_size=...).
TRAIN_BATCH_SIZE = 2
# Passed to SFTTrainer(max_seq_length=...).
TRAIN_MAX_SEQ_LENGTH = 512
# When True the model is loaded with attn_implementation="flash_attention_2".
USE_FLASH_ATTENTION = False
# Passed to TrainingArguments(gradient_accumulation_steps=...); together with
# TRAIN_BATCH_SIZE this gives 2 * 16 = 32 samples per optimizer step per device.
GRAD_ACC_STEPS = 16

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
import bitsandbytes as bnb
from datasets import load_dataset
from trl import SFTTrainer
from datasets import Dataset
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
import re
import wandb

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub without storing the secret in the
# notebook: read the token from the HF_TOKEN environment variable and fall back
# to an interactive prompt when it is not set.
# NOTE(review): a token was previously hardcoded in this cell; treat it as
# leaked and revoke it in the Hugging Face account settings.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [None]:
# Load Model
# Build the 4-bit NF4 quantization settings only when QUANTIZE_4BIT is enabled;
# otherwise the model is loaded without a quantization config.
quantization_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    if QUANTIZE_4BIT
    else None
)

# None leaves the attention implementation at the library default.
attn_implementation = "flash_attention_2" if USE_FLASH_ATTENTION else None

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    attn_implementation=attn_implementation,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Load the tokenizer for MODEL_NAME. It is kept as a module-level name because
# generate_aya_23 reads `tokenizer` from the notebook namespace and the
# SFTTrainer is constructed with tokenizer=tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def get_message_format(prompts):
  """Wrap each prompt in a single-turn chat conversation.

  Args:
    prompts: iterable of prompt values.

  Returns:
    A list with one conversation per prompt, each conversation being a
    one-element list ``[{"role": "user", "content": prompt}]``.
  """
  return [[{"role": "user", "content": prompt}] for prompt in prompts]

def generate_aya_23(
      prompts,
      model,
      temperature=0.3,
      top_p=0.75,
      top_k=0,
      max_new_tokens=1024
    ):
  """Sample one completion per prompt and return the decoded text.

  Uses the module-level ``tokenizer`` to apply the chat template and to decode.

  Args:
    prompts: iterable of user prompts; each becomes a single-turn chat.
    model: object exposing ``device`` and ``generate``.
    temperature, top_p, top_k, max_new_tokens: forwarded to ``model.generate``
      (sampling is always enabled via ``do_sample=True``).

  Returns:
    The result of ``tokenizer.batch_decode`` on the newly generated tokens
    only (the padded prompt prefix is cut off), special tokens skipped.
  """
  chat_batch = get_message_format(prompts)

  template_options = dict(
      tokenize=True,
      add_generation_prompt=True,
      padding=True,
      return_tensors="pt",
  )
  prompt_ids = tokenizer.apply_chat_template(chat_batch, **template_options)
  prompt_ids = prompt_ids.to(model.device)
  # All prompts are padded to a common length, so the first row's length is
  # the offset at which generated tokens start in every output sequence.
  prompt_width = len(prompt_ids[0])

  sampling_options = dict(
      temperature=temperature,
      top_p=top_p,
      top_k=top_k,
      max_new_tokens=max_new_tokens,
      do_sample=True,
  )
  sequences = model.generate(prompt_ids, **sampling_options)

  # Keep only the generated continuation of each sequence.
  completions = [sequence[prompt_width:] for sequence in sequences]
  return tokenizer.batch_decode(completions, skip_special_tokens=True)

In [None]:
# Read the CSV from the Kaggle input directory.
dataframe = pd.read_csv("/kaggle/input/clean-pdf-df/clean_deniz_pdf.csv")

# Take the first 100 rows, then drop the rows that have no 'Complete_Text' value.
dataframe_first_100 = dataframe.head(100).dropna(subset=['Complete_Text'])

# Convert the filtered frame into a Hugging Face Dataset for the trainer.
dataset = Dataset.from_pandas(dataframe_first_100)

# Alternative data source (disabled): load a templated dataset from the Aya Collection and keep only the Turkish ('tr') rows
# dataset = load_dataset("CohereForAI/aya_collection", "templated_indic_sentiment")['train']
# dataset = dataset.filter(lambda example: example['language']=='tr')

def formatting_prompts_func(example):
    """Return the 'Complete_Text' entries of a batch as a list of strings.

    Args:
        example: mapping whose 'Complete_Text' value is an iterable of entries.

    Returns:
        A list holding ``str(entry)`` for every entry, in the original order.
    """
    return list(map(str, example['Complete_Text']))


In [None]:
# Training Arguments
# Batch size, gradient accumulation and gradient checkpointing come from the
# configuration constants defined earlier in the notebook.
training_arguments = TrainingArguments(
    output_dir="results",
    num_train_epochs=20,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    gradient_checkpointing=USE_GRAD_CHECKPOINTING,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=10,
    learning_rate=1e-3, # 0.001
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    warmup_ratio=0.05,
    group_by_length=True,
    # The plain "constant" schedule has no warmup phase, so warmup_ratio above
    # was silently ignored; "constant_with_warmup" ramps the learning rate up
    # linearly over the warmup steps and then holds it constant.
    lr_scheduler_type="constant_with_warmup",
    report_to="none"
)

# LoRA adapter configuration: rank-32 adapters on the attention projections.
peft_config = LoraConfig(
    lora_alpha=32,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
)

# Supervised fine-tuning trainer; formatting_prompts_func turns each batch of
# rows into the list of texts to train on.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=TRAIN_MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=formatting_prompts_func
)

In [None]:
# Run fine-tuning with the SFTTrainer configured in the previous cell.
trainer.train()

In [None]:
# 1. Choose the directory to save into
save_directory = '/kaggle/working/aya-qlora'

# 2. Save the trained model
trainer.model.save_pretrained(save_directory)

# 3. Set the model configuration and mode: turn use_cache back on and switch
#    the model to evaluation mode.
model.config.use_cache = True
model.eval()

# 4. (Optional) Also store the saved model as a zip archive.
#    NOTE: the paths below are hardcoded and must be kept in sync with save_directory.
!zip -r /kaggle/working/aya-qlora.zip /kaggle/working/aya-qlora


In [None]:
# Test inference on the reloaded base model with the fine-tuned LoRA adapter attached

# Absolute path of the adapter directory written by the save cell. A relative
# "aya-qlora" only resolves when the working directory is /kaggle/working.
ADAPTER_DIR = "/kaggle/working/aya-qlora"

# Load Model and LoRA Adapter
quantization_config = None
if QUANTIZE_4BIT:
  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
  )

attn_implementation = None
if USE_FLASH_ATTENTION:
  attn_implementation="flash_attention_2"

loaded_model = AutoModelForCausalLM.from_pretrained(
          MODEL_NAME,
          quantization_config=quantization_config,
          attn_implementation=attn_implementation,
          torch_dtype=torch.bfloat16,
          device_map="auto",
        )
# generate_aya_23 reads the module-level `tokenizer`, so it is (re)loaded here too.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
loaded_model.load_adapter(ADAPTER_DIR)


# Sample prompt(s) to run through the fine-tuned model.
prompts = [
  'Translate from English to Bengali: "Rates are competitive, almost always the best in the market"'
]

generations = generate_aya_23(prompts, loaded_model)

# Print each prompt next to the text generated for it.
for p, g in zip(prompts, generations):
  print(
      "PROMPT", p ,"RESPONSE", g, "\n", sep="\n"
    )
