In [1]:
!pip install sentencepiece
!pip install -q -U bitsandbytes
!pip install transformers
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install datasets
!pip install einops

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import transformers
from datasets import load_dataset

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/m2m100_1.2B", torch_dtype="auto", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_1.2B", padding_side='left')

In [4]:
train_dataset = load_dataset(path='findnitai/english-to-hinglish', split='train')

In [5]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

def generate_and_tokenize_prompt(example):
    text = f"{example['translation']['en']}" + tokenizer.eos_token
    model_inputs = tokenizer(text, max_length=256, truncation=True, padding=True)

    hien_text = f"{example['translation']['hi_ng']}" + tokenizer.eos_token
    labels = tokenizer(hien_text, max_length=256, truncation=True, padding=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [6]:
tokenized_dataset = train_dataset.map(generate_and_tokenize_prompt)

In [7]:
# eval_prompt = tokenizer.bos_token + ''''en_XX
#                   ### en:
#                   \'Who stars in the movie?\'

#                   ### hi_ng:
#                   ''' + tokenizer.pad_token
input_text = "What is the name of the movie"
eval_prompt = tokenizer.bos_token + input_text + tokenizer.eos_token
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")


model.eval()
with torch.inference_mode():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True))

O que é o nome do filme


In [8]:
model

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-23): 24 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm

In [9]:
# from peft import prepare_model_for_kbit_training

# model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)

In [10]:
# def print_trainable_parameters(model):
#     """
#     Prints the number of trainable parameters in the model.
#     """
#     trainable_params = 0
#     all_param = 0
#     for _, param in model.named_parameters():
#         all_param += param.numel()
#         if param.requires_grad:
#             trainable_params += param.numel()
#     print(
#         f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
#     )

In [11]:
# from peft import LoraConfig, get_peft_model

# config = LoraConfig(
#     r=4,
#     lora_alpha=16,
#     target_modules=['k_proj', 'q_proj', 'v_proj', 'out_proj','fc1','fc2'],
#     lora_dropout=0.01,
#     bias="none",
#     task_type="SEQ2SEQ_LM"
# )

# model = get_peft_model(model, config)
# print_trainable_parameters(model)

In [12]:
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
training_args = transformers.Seq2SeqTrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=400,
        learning_rate=1e-4,
        fp16=True,
        logging_steps=50,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    )
model.train()

trainer = transformers.Seq2SeqTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForSeq2Seq(tokenizer),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss
50,2.885
100,1.7384
150,1.2845
200,1.242
250,1.1848
300,1.0798
350,0.9677
400,0.961


TrainOutput(global_step=400, training_loss=1.4178974533081055, metrics={'train_runtime': 701.281, 'train_samples_per_second': 9.126, 'train_steps_per_second': 0.57, 'total_flos': 835654543835136.0, 'train_loss': 1.4178974533081055, 'epoch': 0.03})

In [14]:
training_args = transformers.Seq2SeqTrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=150,
        learning_rate=1e-5,
        fp16=True,
        logging_steps=50,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    )
model.train()

trainer = transformers.Seq2SeqTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForSeq2Seq(tokenizer),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss
50,0.5608
100,0.5156
150,0.3561


TrainOutput(global_step=150, training_loss=0.4775059509277344, metrics={'train_runtime': 261.024, 'train_samples_per_second': 9.195, 'train_steps_per_second': 0.575, 'total_flos': 311498563289088.0, 'train_loss': 0.4775059509277344, 'epoch': 0.01})

In [None]:
model.eval()

save_directory = '/content/drive/MyDrive/my_model'

# Save the model and tokenizer to the specified directory
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)