In [1]:
!pip install -U -q accelerate
!pip install -U -q peft
!pip install -U -q bitsandbytes
!pip install -U -q evaluate

In [2]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

In [3]:
import os
import json
from datasets import load_dataset

os.environ['DATASET_NAME'] = 'aisuko/diverse_calculation'
os.environ['MODEL_NAME'] = 'microsoft/Phi-3.5-mini-instruct'
os.environ['FT_MODEL_NAME'] = 'aisuko/calculator'

ds=load_dataset(os.getenv('DATASET_NAME'))
ds

README.md:   0%|          | 0.00/609 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/228k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['output', 'structured_input', 'direct_input', 'conversational_input'],
        num_rows: 5000
    })
})

In [4]:
# pre-processing the dataset

import json

def preprocess_function(example):
    return {
        'input_text': example['structured_input'],
        'target_text': example['output']
    }

# Apply the preprocessing function to the dataset
processed_dataset = ds.map(preprocess_function)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [5]:
structured_dataset=processed_dataset.remove_columns(['output','structured_input','direct_input','conversational_input'])

In [6]:
structured_dataset['train'][0]

{'input_text': '{"A":74,"op":"*","B":70}', 'target_text': '{"result": "5180"}'}

# Tokenize the Data

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [8]:
input_string = '{"A":74,"op":"*","B":70}'

# Tokenize the input string
encoded_input = tokenizer.encode(input_string, add_special_tokens=True)

token_length = len(encoded_input)
print(f'The input string is tokenized into {token_length} tokens.')

The input string is tokenized into 15 tokens.


In [9]:
def tokenize_function(example):
    inputs = tokenizer(example['input_text'], padding='max_length', truncation=True, max_length=50)
    targets = tokenizer(example['target_text'], padding='max_length', truncation=True,max_length=50)
    inputs['labels'] = targets['input_ids']
    return inputs

# Apply the tokenization function
tokenized_dataset = structured_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

# Spliting dataset to 80%-20%

In [10]:
# Split the dataset: 80% for training, 20% for testing
split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.2, seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

## Fit lower Computing Resources

In [11]:
from datasets import DatasetDict

pre_processed_ds_train_low=split_dataset['train'].shuffle(seed=42).select(range(400))
pre_processed_ds_test_low=split_dataset['test'].shuffle(seed=42).select(range(100))


ds_low=DatasetDict({
    'train': pre_processed_ds_train_low,
    'test': pre_processed_ds_test_low
})
ds_low

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 400
    })
    test: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

# Load the model

In [12]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig

bnb_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the pre-trained model
model = AutoModelForCausalLM.from_pretrained(
    os.getenv('MODEL_NAME'),
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True, # for the Phi family
)
model

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_fe

In [13]:
from peft import prepare_model_for_kbit_training

prepared_model=prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=True
)

In [14]:
from peft import LoraConfig, TaskType, get_peft_model

lora_config=LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["lm_head","embed_tokens"], # we added new tokens to tokenizer, this is necesarry
    task_type=TaskType.CAUSAL_LM
)

lora_model=get_peft_model(prepared_model, lora_config)
lora_model.config.use_cache=False

# Batch the data

In [15]:
from transformers import DataCollatorWithPadding

data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
import numpy as np
# from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, hamming_loss

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     # Convert logits to predicted class labels
#     predictions = np.argmax(logits, axis=-1)
    
#     # Initialize lists to store metric values for each output
#     precision_list = []
#     recall_list = []
#     f1_list = []
#     accuracies = []
#     hamming_losses = []
    
#     # Calculate metrics for each output
#     for i in range(labels.shape[1]):
#         precision = precision_score(labels[:, i], predictions[:, i], average='macro', zero_division=1)
#         recall = recall_score(labels[:, i], predictions[:, i], average='macro', zero_division=1)
#         f1 = f1_score(labels[:, i], predictions[:, i], average='macro', zero_division=1)
#         acc = accuracy_score(labels[:,i], predictions[:,i])
#         ham_loss = hamming_loss(labels[:, i], predictions[:, i])
        
#         precision_list.append(precision)
#         recall_list.append(recall)
#         f1_list.append(f1)
#         accuracies.append(acc)
#         hamming_losses.append(ham_loss)
    
#     # Calculate mean metrics across all outputs
#     mean_precision = np.mean(precision_list)
#     mean_recall = np.mean(recall_list)
#     mean_f1 = np.mean(f1_list)
#     mean_accuracy = np.mean(accuracies)
#     mean_hamming_loss = np.mean(hamming_losses)
    
#     return {
#         'mean_precision': mean_precision,
#         'mean_recall': mean_recall,
#         'mean_f1': mean_f1,
#         'mean_accuracy': mean_accuracy,
#         'mean_hamming_loss': mean_hamming_loss,
#     }


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to predicted class labels
    predictions = np.argmax(logits, axis=-1)

    # Calculate Exact Match Ratio
    exact_matches = np.all(predictions == labels, axis=1)
    exact_match_ratio = np.mean(exact_matches)
    
    # Calculate Sequence-Level Accuracy
    sequence_accuracy = np.mean([np.array_equal(p, l) for p, l in zip(predictions, labels)])

    return {
        'exact_match_ratio': exact_match_ratio,
        'sequence_accuracy': sequence_accuracy,
    }

In [17]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=os.getenv('FT_MODEL_NAME'),
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="tensorboard",
)

trainer = Trainer(
    model=lora_model,
    args=training_args,  
    # train_dataset=split_dataset["train"],
    # eval_dataset=split_dataset["test"],
    train_dataset=ds_low['train'],
    eval_dataset=ds_low['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics # I have not implemented it
)

trainer.train()

  trainer = Trainer(
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Exact Match Ratio,Sequence Accuracy
1,11.7855,8.903116,0.0,0.0
2,6.224,3.551179,0.0,0.0
3,2.4789,1.957049,0.0,0.0


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=75, training_loss=6.8294610595703125, metrics={'train_runtime': 621.2348, 'train_samples_per_second': 1.932, 'train_steps_per_second': 0.121, 'total_flos': 1384648335360000.0, 'train_loss': 6.8294610595703125, 'epoch': 3.0})

# Push to HF

In [18]:
kwargs={
    'model_name': os.getenv('DATASET_NAME')+'_structured',
    'finetuned_from': 'microsoft/Phi-3.5-mini-instruct',
    'tasks': 'Text-Generation',
    'dataset': os.getenv('DATASET_NAME')
}

tokenizer.push_to_hub(os.getenv('DATASET_NAME')+'_structured')

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/aisuko/diverse_calculation_structured/commit/909135a8b00503b2a32d176b7313a48bbf063e3e', commit_message='Upload tokenizer', commit_description='', oid='909135a8b00503b2a32d176b7313a48bbf063e3e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aisuko/diverse_calculation_structured', endpoint='https://huggingface.co', repo_type='model', repo_id='aisuko/diverse_calculation_structured'), pr_revision=None, pr_num=None)

In [19]:
trainer.push_to_hub('**kwargs')

adapter_model.safetensors:   0%|          | 0.00/889M [00:00<?, ?B/s]

events.out.tfevents.1733470487.45a68344396a.23.0:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/aisuko/calculator/commit/6737ce82b4c58f43fe76f78b010bc0dfc9addb7d', commit_message='**kwargs', commit_description='', oid='6737ce82b4c58f43fe76f78b010bc0dfc9addb7d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aisuko/calculator', endpoint='https://huggingface.co', repo_type='model', repo_id='aisuko/calculator'), pr_revision=None, pr_num=None)

In [20]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.957048773765564, 'eval_exact_match_ratio': 0.0, 'eval_sequence_accuracy': 0.0, 'eval_runtime': 16.4779, 'eval_samples_per_second': 6.069, 'eval_steps_per_second': 0.425, 'epoch': 3.0}
