<a href="https://colab.research.google.com/github/Excergic/Git_Tutorial/blob/main/Fine_Tuning_for_Acupuncture_Test_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
from datasets import Dataset

# Load the raw corpus. The encoding is stated explicitly so the read does not
# depend on the platform's locale default.
with open("nano_acupuncture_data.txt", "r", encoding="utf-8") as f:
    # One sample per line: strip surrounding whitespace and drop blank lines,
    # which would otherwise become empty-string training samples.
    stripped_lines = (line.strip() for line in f)
    samples = [text for text in stripped_lines if text]

# Simple format: a single "text" column holding one sample per row
data = {"text": samples}
dataset = Dataset.from_dict(data)

# Save as dataset so it can be reloaded later with load_from_disk
dataset.save_to_disk("acupuncture_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/169 [00:00<?, ? examples/s]

In [None]:
# Turn off Weights & Biases and quiet Hugging Face warnings via environment
# variables, set ahead of the library imports that follow.
import os

os.environ.update(
    {
        "WANDB_MODE": "disabled",
        "WANDB_DISABLED": "true",
        "HF_HUB_DISABLE_SYMLINKS_WARNING": "true",
        "TRANSFORMERS_NO_ADVISORY_WARNINGS": "true",
    }
)

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_from_disk

# Load the base model and its tokenizer
model_name = "deepseek-ai/deepseek-coder-1.3b-base"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Load dataset from the same relative path it was saved to
# ("acupuncture_dataset") instead of an absolute /content/... path, so the
# notebook does not depend on the working directory being /content.
dataset = load_from_disk("acupuncture_dataset")

# Tokenize data with labels for causal LM
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of texts to fixed length and build causal-LM labels.

    Args:
        examples: Batch mapping with a "text" key holding a list of strings
            (the form ``Dataset.map(..., batched=True)`` passes in).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer's encodings with an added "labels" entry: a copy of
        "input_ids" in which every padded position (attention_mask == 0)
        is replaced by -100.
    """
    encodings = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # Labels are the input ids themselves; the causal-LM head performs the
    # one-token shift internally. Padded positions are set to -100, the index
    # the loss ignores, so training is not dominated by predicting padding.
    encodings["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"])
    ]
    return encodings

# Apply the tokenizer batch-wise; the raw "text" column is dropped from the result
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# LoRA config: adapters on the q_proj / v_proj modules only
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    # Declare the task so PEFT wraps the model with its causal-LM wrapper
    # instead of the generic one.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# One location for training checkpoints and the final saved model/tokenizer
output_dir = "/content/lora_finetuned_acupuncture"

# Training hyperparameters (fp16 enabled, external experiment reporting off)
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    report_to="none",
)

# Wire the model, arguments and tokenized samples into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model and the tokenizer side by side
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/793 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Map:   0%|          | 0/169 [00:00<?, ? examples/s]

Step,Training Loss
10,2.3776
20,0.7331
30,0.6
40,0.6723
50,0.5693
60,0.7054
70,0.6457
80,0.6035
90,0.3898
100,0.4611


('/content/lora_finetuned_acupuncture/tokenizer_config.json',
 '/content/lora_finetuned_acupuncture/special_tokens_map.json',
 '/content/lora_finetuned_acupuncture/tokenizer.json')