In [2]:
!pip install torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate einops


Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.4.7
  Downloading trl-0.4.7-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.0-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31

In [3]:
!pip install tqdm scipy




In [4]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer

In [5]:
from huggingface_hub import interpreter_login


In [6]:
interpreter_login()



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: ··········
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
dataset = load_dataset("BevenRozario/job_desc_5k", split="train_dataset")


In [10]:
import pandas as pd
df = pd.DataFrame(dataset)

Unnamed: 0,Instruction,Response
0,Generate a job description for a Social Media ...,Job Description: Social Media Manager (Pharmac...
1,Generate a job description for a Software Engi...,Job Description: Software Engineer (Retail - H...


In [15]:
# Function to transform the row into desired format
def format_row(row):
    question = row['Instruction']
    answer = row['Response']
    formatted_string = f"[INST] {question} [/INST] {answer} "
    return formatted_string

# Apply the function to each row of the dataframe
df['Text'] = df.apply(format_row, axis=1)

# Display the formatted column
df['Text']
df1 = df['Text']

In [17]:
df1.to_csv('/content/drive/MyDrive/Datasets/text_formatted_data.csv', index=False)

In [18]:
final_df = pd.read_csv("/content/drive/MyDrive/Datasets/text_formatted_data.csv")

In [19]:
final_df.head()

Unnamed: 0,Text
0,[INST] Generate a job description for a Social...
1,[INST] Generate a job description for a Softwa...
2,[INST] Generate a job description for a Qualit...
3,[INST] Generate a job description for a Web De...
4,[INST] Generate a job description for a Data A...


In [21]:
training_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/Datasets/text_formatted_data.csv", split="train")


Generating train split: 0 examples [00:00, ? examples/s]

In [25]:
base_model = "microsoft/phi-2"
new_model = "phi-2-job-desp"

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # use_flash_attention_2=True, # Phi does not support yet.
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)

model.config.use_cache = False
model.config.pretraining_tp = 1

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

training_arguments = TrainingArguments(
    output_dir="./jdPhi",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    optim="paged_adamw_8bit",
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    save_steps=500,
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_steps=-1,
    push_to_hub=True,  # Add this line to push the model to the hub,
    hub_model_id="BevenRozario/Phi2_JobDesp_5K"
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=4,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["Wqkv", "fc1", "fc2" ] # ["Wqkv", "out_proj", "fc1", "fc2" ], - 41M params
    # modules_to_save=["embed_tokens","lm_head"]
)

trainer = SFTTrainer(
    model=model,
    train_dataset=training_dataset,
    peft_config=peft_config,
    dataset_text_field="Text",
    max_seq_length=600,
    tokenizer=tokenizer,
    args=training_arguments,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
trainer.push_to_hub()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/BevenRozario/Phi2_JobDesp_5K into local empty directory.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Upload file adapter_model.bin:   0%|          | 1.00/70.1M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/4.30k [00:00<?, ?B/s]

Upload file runs/Feb12_17-21-26_1acb3246534f/events.out.tfevents.1707758517.1acb3246534f.468.0:   0%|         …

To https://huggingface.co/BevenRozario/Phi2_JobDesp_5K
   4875181..9c27016  main -> main

   4875181..9c27016  main -> main

To https://huggingface.co/BevenRozario/Phi2_JobDesp_5K
   9c27016..af2eb55  main -> main

   9c27016..af2eb55  main -> main



'https://huggingface.co/BevenRozario/Phi2_JobDesp_5K/commit/9c27016b8e5cca5377f051fecc17abf7d5f6d077'

In [26]:
from peft import PeftModel

ft_model = PeftModel.from_pretrained(base_model, "BevenRozario/Phi2_JobDesp_5K")
# ft_model = PeftModel.from_pretrained(base_model, "BevenRozario/mistral_v3_5k", use_rslora=False)

adapter_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

AttributeError: 'str' object has no attribute 'forward'

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = ""
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
)

eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)

In [27]:
from transformers import pipeline


In [29]:
prompt = "Generate a Job Description for a Data Scientist"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=5000)
result = pipe(f"[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


[INST] Generate a Job Description for a Data Scientist [/INST]

[INST] Create a Job Description for a Data Scientist [/INST]

[INST] Design a Job Description for a Data Scientist [/INST]

[INST] Develop a Job Description for a Data Scientist [/INST]

[INST] Construct a Job Description for a Data Scientist [/INST]

[INST] Produce a Job Description for a Data Scientist [/INST]

[INST] Craft a Job Description for a Data Scientist [/INST]

[INST] Formulate a Job Description for a Data Scientist [/INST]

[INST] Create a Job Description for a Data Scientist [/INST]

[INST] Develop a Job Description for a Data Scientist [/INST]

[INST] Design a Job Description for a Data Scientist [/INST]

[INST] Construct a Job Description for a Data Scientist [/INST]

[INST] Produce a Job Description for a Data Scientist [/INST]

[INST] Craft a Job Description for a Data Scientist [/INST]

[INST] Formulate a Job Description for a Data Scientist [/INST]

[INST] Create a Job Description for a Data Scientist [