In [1]:
import transformers
import accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
tokenizer = AutoTokenizer.from_pretrained("E:\\cache\\hugging face\\llama3.2-1b")
model = AutoModelForCausalLM.from_pretrained("E:\\cache\\hugging face\\llama3.2-1b").to(device)

In [4]:
import pandas as pd
from datasets import load_dataset

data_path = "E:\\cache\\hugging face\\dataset\\train_en.json"
ds = load_dataset('json', data_files=data_path, split='train')
print(ds)

Dataset({
    features: ['id', 'cate', 'text', 'relation', 'entity'],
    num_rows: 186706
})


In [5]:
import numpy as np

num_samples = 500
seed = 42
rng = np.random.default_rng(seed)

shuffled_dataset = ds.shuffle(seed=rng)
new_ds = shuffled_dataset.select(range(num_samples))

print(new_ds)

Dataset({
    features: ['id', 'cate', 'text', 'relation', 'entity'],
    num_rows: 500
})


In [6]:
first_five = new_ds['text'][:5]
print(first_five)

['The Sun Also Rises is a 1957 American drama film adaptation of the 1926 Ernest Hemingway novel of the same name directed by Henry King. The screenplay was written by Peter Viertel and it starred Tyrone Power, Ava Gardner, Mel Ferrer, and Errol Flynn.  Much of it was filmed on location in France and Spain as well as Mexico in Cinemascope and color by Deluxe.  A highlight of the film is the famous "running of the bulls" in Pamplona, Spain and two bullfights.', 'The M606 is a 3-mile  stretch of motorway in West Yorkshire, England. Called the Bradford Spur motorway, the M606 leaves the M62 motorway at junction 26, near Cleckheaton, and heads into Bradford, to join the A6177 Bradford Ring Road. It is officially named the "Bradford South Radial Motorway" and was opened in 1973.', 'Wolfram syndrome, also called DIDMOAD (diabetes insipidus, diabetes mellitus, optic atrophy, and deafness), is a rare autosomal-recessive genetic disorder that causes childhood-onset diabetes mellitus, optic atro

In [7]:
model = model.eval()
tokenizer.pad_token = tokenizer.eos_token
input_text = "Wolfram syndrome, also called DIDMOAD"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)


generate_input = {
    "input_ids": input_ids,
    "max_new_tokens": 256,
    "do_sample": True,
    "top_k": 0,
    "top_p": 1,
    "temperature": 0.1,
    "repetition_penalty": 1.2,
    "eos_token_id": tokenizer.eos_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
}

outputs = model.generate(**generate_input)
result = tokenizer.decode(outputs[0])

print(result)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|begin_of_text|>Wolfram syndrome, also called DIDMOAD (Dysmorphic Dysplastic Ocular Anomalies), is a rare genetic disorder that affects the development of various body parts. It's characterized by multiple congenital anomalies and developmental delays.

**Causes:**

The exact cause of Wolfram syndrome remains unknown, but it's believed to be related to mutations in genes involved in cell signaling pathways. These abnormalities can lead to abnormal growth patterns during embryonic development.

**Symptoms:**

Individuals with Wolfram syndrome may experience:

* Congenital heart defects
* Cataracts or other eye problems
* Hearing loss or deafness
* Developmental delays or intellectual disability
* Seizures or epilepsy

**Diagnosis:**

A diagnosis of Wolfram syndrome typically involves a combination of clinical evaluation, imaging studies (e.g., MRI or CT scans), and laboratory tests to rule out other conditions. Genetic testing for specific gene mutations associated with the condition i

In [8]:
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments

peft_config = LoraConfig(
    r = 8,
    lora_alpha = 8,
    target_modules = ['gate_proj', 'down_proj', 'up_proj', 'q_proj', 'v_proj'], #(微调的目标模块）
    #target_modules = ['q_proj', 'v_proj'],
    lora_dropout = 0.1,
    bias = 'none',
    task_type = 'CAUSAL_LM'
)

In [11]:
training_arguments = TrainingArguments(
    output_dir = "E:\\cache\\hugging face\\llama-tune",
    per_device_train_batch_size = 2,
    optim = 'adamw_torch',
    learning_rate = 2e-5,
    #eval_steps = 10,
    #save_steps = 200,
    logging_steps = 1000,
    #eval_strategy = 'steps',
    group_by_length = False,
    #max_steps = 2000,
    num_train_epochs = 100,
    gradient_accumulation_steps = 1,
    gradient_checkpointing = True,
    max_grad_norm = 0.3,
    bf16 = True,
    lr_scheduler_type = 'cosine',
    warmup_steps = 100
)

In [12]:
model.enable_input_require_grads()
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.config.use_cache = False

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689




In [13]:
tokenizer.pad_token_id = 0
tokenizer.padding_side = 'right'

In [14]:
from trl import SFTTrainer, SFTConfig


trainer = SFTTrainer(
    model = model,
    args = training_arguments,
    train_dataset = new_ds,
    #eval_dataset = test_data,
    peft_config = peft_config,
    max_seq_length = 1024,
    tokenizer = tokenizer,
    dataset_text_field = 'text',
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [15]:
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1000,2.3783
2000,2.2939
3000,2.2431
4000,2.2069
5000,2.1733
6000,2.1414
7000,2.1176
8000,2.0833
9000,2.0614
10000,2.0382


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

TrainOutput(global_step=25000, training_loss=2.031919990234375, metrics={'train_runtime': 29173.4449, 'train_samples_per_second': 1.714, 'train_steps_per_second': 0.857, 'total_flos': 5.988121105477632e+16, 'train_loss': 2.031919990234375, 'epoch': 100.0})

In [14]:
output_dir = "E:\\cache\\hugging face\\llama-tune1"
save_dir = f'{output_dir}'

trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Saved model to:", save_dir)

Saved model to: E:\cache\hugging face\llama-tune1
