In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Installs


In [None]:
%pip install -U datasets transformers trl accelerate peft bitsandbytes

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting trl
  Downloading trl-0.12.2-py3-none-any.whl.metadata (11 kB)
Collecting accelerate
  Downloading accelerate-1.2.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting rich (from trl)
  Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting markdown-it-py>=2.2.0 (from rich->trl)
  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich->trl)
  Downloading mdurl-0.1.2-py3-none-any.whl.meta

## Imports


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Model SetUp


In [None]:

model_name = "meta-llama/Llama-3.2-1B-Instruct"

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=bnb_config, device_map={"": 0})
model = prepare_model_for_kbit_training(model)

## Tokenizer


In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, add_eos_token=True)
# tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

## Load Model


In [4]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_from_disk, load_dataset,DatasetDict
from transformers import TrainingArguments


sample_size_training = 100
val_dataset = load_from_disk("data/fake_targets/flores_devtest_arrow").select([i for i in range(sample_size_training)])
train_dataset = load_from_disk("data/fake_targets/NC_LUX.arrow").select_columns(["subsentence", "translated_text"]).rename_columns({
    "subsentence": "sentence_ltz_Latn",  # Renaming 'subsentence' to 'sentence_eng_Latn'
    "translated_text": "sentence_eng_Latn"  # Renaming 'translated_text' to 'sentence_ltz_Latn'
}).select([i for i in range(sample_size_training)])

# Convert datasets to dictionaries
dataset = DatasetDict({ 'train': train_dataset, 'val': val_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence_ltz_Latn', 'sentence_eng_Latn'],
        num_rows: 100
    })
    val: Dataset({
        features: ['sentence_eng_Latn', 'sentence_ltz_Latn'],
        num_rows: 100
    })
})

# LoRA Configuration


In [5]:
from peft import LoraConfig

peft_config = LoraConfig(
            lora_alpha=16, 
            lora_dropout=0.05,
            r=16,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules= ["down_proj","up_proj","gate_proj"]
)

In [6]:
# peft_config = LoraConfig(
#             lora_alpha=16,
#             lora_dropout=0.05,
#             r=64,
#             bias="none",
#             task_type="CAUSAL_LM",
#             target_modules= ["q_proj","up_proj","o_proj","k_proj","down_proj","gate_proj","v_proj"]
# )

# Training Hyperparameters


In [7]:
training_arguments = TrainingArguments(
        output_dir="results/",
        evaluation_strategy="steps",
        save_steps=100,
        log_level="debug",
        logging_steps=100,
        learning_rate=1e-4,
        eval_steps=100,
        fp16=True,
        do_eval=True,
        per_device_train_batch_size=48,
        per_device_eval_batch_size=48,
        gradient_accumulation_steps=2,
        warmup_steps=50,
        max_steps=500,
        lr_scheduler_type="linear"
)



In [8]:
test_dataset = load_dataset("musfiqdehan/preprocessed-BanglaNMT-sm")

In [9]:
test_dataset['train']['translations'][0]

'এবার খামটা দিন মি কব ###>Now the envelope Mr Cobb\n'

# Training with TRL


In [None]:
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        peft_config=peft_config,
        dataset_text_field="translations",
        max_seq_length=48,
        tokenizer=tokenizer,
        args=training_arguments
)

trainer.train()

Map:   0%|          | 0/164084 [00:00<?, ? examples/s]

Map:   0%|          | 0/20511 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 48
***** Running training *****
  Num examples = 164,084
  Num Epochs = 1
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 2
  Total optimization steps = 500
  Number of trainable parameters = 23,199,744
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
100,1.8825,1.345326
200,1.2985,1.265929
300,1.2481,1.237133
400,1.2295,1.219609
500,1.2129,1.210467


***** Running Evaluation *****
  Num examples = 20511
  Batch size = 48
Saving model checkpoint to working/results/tmp-checkpoint-100
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8cca527612d856d7d32bd94f8103728d614eb852/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  

TrainOutput(global_step=500, training_loss=1.3743009338378906, metrics={'train_runtime': 23339.8701, 'train_samples_per_second': 2.057, 'train_steps_per_second': 0.021, 'total_flos': 9.166063140864e+16, 'train_loss': 1.3743009338378906, 'epoch': 0.29})

# Inference: Translate with Llama 2


## Base Model SetUp


In [None]:
base_model = "meta-llama/Llama-2-7b-hf"
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
        base_model, device_map={"": 0}, quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8cca527612d856d7d32bd94f8103728d614eb852/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8cca527612d856d7d32bd94f8103728d614eb852/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/8cca527612d856d7d32bd94f8103728d614eb852/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-

## Initialize Adapter (Fine-Tuned-Model)


In [None]:
# Fetched from Kaggle Output
model = PeftModel.from_pretrained(model, "working/results/checkpoint-500/")

In [None]:
# Uploaded to Hugging Face Model Hub
# model = PeftModel.from_pretrained(model, "musfiqdehan/Llama-2-7b-ft-mt-Bengali-to-English-sm")

# Testing Manually


In [None]:
my_text = "আমি স্কুলে যাচ্ছি ।"

prompt = my_text+" ###>"

tokenized_input = tokenizer(prompt, return_tensors="pt")
input_ids = tokenized_input["input_ids"].cuda()

generation_output = model.generate(
        input_ids=input_ids,
        num_beams=6,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=130
)
for seq in generation_output.sequences:
    output = tokenizer.decode(seq, skip_special_tokens=True)
    print(output.split("###>")[1].strip()) 

I am going to school
