- Install the required packages

In [None]:
!pip install "pyarrow==14.0.2" "datasets==2.19.2" "transformers==4.56.2" "sagemaker==2.256.1" --quiet

print("Package installation completed.")

- Log in to Hugging Face

In [None]:
# Log in with a Hugging Face access token; it is used to download the gated model and dataset.
# The token needs at least read permission.
from huggingface_hub import login
login() 

- Get sagemaker session, role and default bucket

In [None]:
#Get sagemaker session
import sagemaker
import boto3

# IAM role looked up when the execution role cannot be resolved automatically
# (e.g. when the notebook runs outside SageMaker). Update it to the role defined in IAM.
FALLBACK_ROLE_NAME = 'AmazonSageMaker-ExecutionRole-20260121T185431'

# SageMaker session and its default S3 bucket
sess = sagemaker.Session()
default_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except Exception as err:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt/SystemExit
    # still propagate, and report why the fallback is being used.
    print(f"Could not resolve the execution role automatically ({err}); using IAM role '{FALLBACK_ROLE_NAME}'")
    iam = boto3.client('iam')
    role = iam.get_role(RoleName=FALLBACK_ROLE_NAME)['Role']['Arn']


print(f"Sagemaker session {sess}")
print(f"Role: {role}")
print(f"Default bucket: {default_bucket}")

- Load the dataset. "TeleQnA" is a gated dataset, so access must be requested on Hugging Face before it can be downloaded.

In [None]:
from datasets import load_dataset, Dataset

# Download the TeleQnA dataset
print("Loading netop/TeleQnA dataset...")
dataset_TeleQnA = load_dataset("netop/TeleQnA")

# Collect the figures for the summary: split names and the row count across all splits
split_names = list(dataset_TeleQnA.keys())
total_rows = sum(dataset_TeleQnA[name].num_rows for name in dataset_TeleQnA)

# Print the summary
print("Summarizing TeleQnA dataset:\n")
print(f" Total splits: {len(dataset_TeleQnA)}")
print(f" Available splits: {split_names}")
print(f" Number of total rows: {total_rows}")
print(f" dataset_TeleQnA structure: \n {dataset_TeleQnA}")

- Define reshape dataset function

In [None]:
def reshape_dataset(input_datadict):
  """Convert question/answer rows into instruction-format records.

  Every row is read through the keys 'question', 'choices', 'answer' and
  'explaination' (spelled as the code reads it) and becomes one dict:
  1. "instruction": a fixed system prompt asking the model to act as a telecom expert
  2. "input": the row's question
  3. "output": the choice selected by row['answer'], followed by the explanation

  Example of a resulting record:
      {
        "instruction": "You are a telecom domain expert. Answer the question with correct technical reasoning.",
        "input": "Why does low SINR reduce LTE throughput even when RSRP is good?",
        "output": "Low SINR indicates high interference or noise..."
      }

  Args:
    input_datadict: iterable of rows (dict-like) with the keys listed above.

  Returns:
    list of dicts, one per input row, in input order.
  """
  system_prompt = "You are a telecom domain expert. Answer the question with correct technical reasoning."

  return [
      {
          "instruction": system_prompt,
          "input": record['question'],
          # row['answer'] is used as the index into row['choices']
          "output": record['choices'][record['answer']] + ". Explaination is: " + record['explaination'],
      }
      for record in input_datadict
  ]

print("Function defined")

- Reshape dataset

In [None]:
# Keep every row of the 'test' split except the lexicon-style Q&A
filtered_teleQnA = [row for row in dataset_TeleQnA['test'] if row['subject'] != 'Lexicon']

# Convert the remaining rows to the instruction format
reshaped_dataset = reshape_dataset(filtered_teleQnA)

# Row count after reshaping
print(f"Total rows in reshaped dataset are: {len(reshaped_dataset)}")

# Show the first record as a sample
first_record = reshaped_dataset[0]
print(f"Instruction: {first_record['instruction']}")
print(f"Input: {first_record['input']}")
print(f"Output: {first_record['output']}")

- Tokenize the dataset. "meta-llama/Llama-2-7b-hf" is a gated model, so access must be obtained first.

In [None]:
#Tokenize the dataset using model's tokenizer
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the model being fine-tuned
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pad on the right-hand side of each sequence
tokenizer.padding_side = "right"

# Some tokenizers ship without a pad token; only in that case reuse the EOS token
# for padding, so an existing pad token is never overwritten.
if tokenizer.pad_token_id is None:
  tokenizer.pad_token = tokenizer.eos_token

#define a tokenize function to tokenize the dataset
#as data is reshaped into instruction format hence
#we need examples["instruction"] + examples["input"] + examples["output"]
def tokenize_function(examples):
  """Tokenize a batch of instruction-format records for causal-LM fine-tuning.

  Args:
    examples: batch passed by Dataset.map(batched=True): a dict whose
      "instruction", "input" and "output" values are equally long lists of strings.

  Returns:
    The tokenizer output for the batch (truncated/padded to 1024 tokens) with an
    extra "labels" entry: a copy of "input_ids" in which every padded position
    (attention_mask == 0) is replaced by -100.
  """
  # Build one prompt per record; the EOS token marks the end of the response
  texts = [
      (
          "### Instruction:\n"
          f"{instruction}\n\n"
          "### Input:\n"
          f"{question}\n\n"
          "### Response:\n"
          f"{answer}{tokenizer.eos_token}"
      )
      for instruction, question, answer in zip(
          examples["instruction"], examples["input"], examples["output"]
      )
  ]
  tokens = tokenizer(texts, truncation=True, padding="max_length", max_length=1024)

  # Padding must not contribute to the loss. When the pad token is the EOS token,
  # a plain copy of input_ids would make the model learn to emit EOS at every padded position.
  # -100 is the label value the Transformers loss ignores; the real EOS
  # (attention_mask == 1) stays in the labels.
  # NOTE(review): assumes the training script does not rebuild the labels in its data collator - confirm.
  tokens["labels"] = [
      [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
      for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
  ]
  return tokens


# reshaped_dataset is a plain Python list, so convert it to a Hugging Face Dataset first
input_dataset = Dataset.from_list(reshaped_dataset)
# batched=True hands tokenize_function a dict of lists (one list per column).
# No remove_columns is passed: the sample test at the end of the notebook reads the
# instruction/input/output fields back from the saved validation set.
tokenized_dataset = input_dataset.map(tokenize_function, batched=True)
print("Dataset is tokenized")

- Split the dataset into training and validation set. Ratio 80:20

In [None]:
# Split the tokenized data into training and validation sets (80:20).
# The fixed seed makes the split reproducible; 42 is an arbitrary, conventional choice.
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]
print("Datasets splitted")

- Upload the dataset to default S3 bucket

In [None]:
# Key prefix under which both datasets are stored in the default bucket
s3_prefix = 'fine-tune_llama2/datasets/teleqna'

# Destination URIs for the training and validation sets
training_ds_path = f's3://{default_bucket}/{s3_prefix}/train'
eval_ds_path = f's3://{default_bucket}/{s3_prefix}/test'

# Write each dataset to its S3 location (training set first, then validation set)
for ds_to_save, s3_uri in ((train_dataset, training_ds_path), (eval_dataset, eval_ds_path)):
    ds_to_save.save_to_disk(s3_uri)

print(f"datasets are saved to following S3 path: \n {training_ds_path} \n {eval_ds_path}")

In [None]:
# Display the training script (with syntax highlighting) that the estimator below uses as its entry point
!pygmentize "./scripts/finetune_lora.py"

- Set training hyperparameters and create estimator

In [None]:
from sagemaker.huggingface import HuggingFace

from huggingface_hub import get_token

# Read the Hugging Face token stored by the login() cell (or the HF_TOKEN environment variable)
# instead of pasting the secret into the notebook source.
hf_token = get_token()
if hf_token is None:
    raise RuntimeError(
        "No Hugging Face token found. Run the login cell (or set HF_TOKEN) before creating the estimator."
    )

#Set hyperparameters (passed to the training script as command-line arguments)
hyperparameters = {
    "model-name" : 'meta-llama/Llama-2-7b-hf',    
    "batch-size" : 1,
    "learning-rate" : 5e-5,
    "epochs": 1,
    "max-steps" : 200,
    "eval-range" : 10
}

#initialize the sagemaker estimator

job_name = 'llama-7b-hf-finetune'  # base name of the training job

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'finetune_lora.py',  # train script
    source_dir           = './scripts',         # directory which includes all the files needed for training
    instance_type        = 'ml.g5.2xlarge',     # instance type for training. g5.2xlarge, g5.4xlarge
    instance_count       = 1,                   # the number of instances used for training
    max_run              = 2*3600,              # maximum runtime in seconds 
    use_spot_instances   = True,                # to use spot instances for training  
    max_wait             = 3*3600,              # must be >= max_run, max wait time for instance
    base_job_name        = job_name,            # base name of the training job
    role                 = role,                # IAM role used in training job to access AWS resources, e.g. S3
    volume_size          = 300,                 # the size of the EBS volume in GB
    transformers_version = '4.56.2',            # the transformers version used in the training job
    pytorch_version      = '2.8.0',             # the pytorch version used in the training job
    py_version           = 'py312',             # the python version used in the training job
    hyperparameters      =  hyperparameters,    # the hyperparameters passed to the training job
    environment          = {
                            "HUGGINGFACE_HUB_CACHE": "/tmp/.cache", # set env variable to cache model in /tmp
                            "HF_TOKEN": hf_token                    # huggingface token to access gated model
                            }, 
)

print("Huggingface estimator created.")



- Start the training

In [None]:
print("Starting training...")

# Input channels for the job: channel name -> S3 URI of the dataset uploaded earlier
data = dict(training=training_ds_path, validation=eval_ds_path)
# Launch the training job with those channels; wait=True is passed explicitly
huggingface_estimator.fit(data, wait=True)

- After training is completed, print the S3 location where the model is uploaded

In [None]:
# Print the estimator's model_data attribute: the S3 location of the trained model artifact
print(huggingface_estimator.model_data)

- Deployment and Inference

In [None]:
#deploy the model to an endpoint and interactively test the model

#Using huggingface LLM Inference DLC inference Container
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the container image URI of the Hugging Face LLM inference DLC (version 1.4.0)
llm_image = get_huggingface_llm_image_uri("huggingface", session=sess, version="1.4.0")

# Show the resolved ECR image URI
print(f"llm image uri: {llm_image}")

- Define the SageMaker model resource

In [None]:
#create a HuggingFaceModel using the container uri and the S3 path to our model

import json
from sagemaker.huggingface import HuggingFaceModel

# Endpoint instance settings (used by the deploy cell)
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300

# S3 location of the trained model artifact produced by the training job
model_s3_path = huggingface_estimator.model_data

# Environment variables for the inference container; every value is passed as a string
config = dict(
  HF_MODEL_ID="/opt/ml/model",              # path to where sagemaker stores the model
  SM_NUM_GPUS=json.dumps(number_of_gpu),    # Number of GPU used per replica
  MAX_INPUT_LENGTH=json.dumps(1024),        # Max length of input text
  MAX_TOTAL_TOKENS=json.dumps(2048),        # Max length of the generation (including input text)
)

# Model resource that combines the container image, the model artifact and the environment
llm_model = HuggingFaceModel(
  name="llama2-custom-sft-model",  # AWS model name
  role=role,
  image_uri=llm_image,
  model_data=model_s3_path,
  env=config,
)

print("Sagemaker model resources are defined.")

- Deploy the endpoint

In [None]:
# Deploy the model to a real-time endpoint on a single instance
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # health_check_timeout is 300 (5 minutes if the unit is seconds - TODO confirm), not 10 minutes; gives SageMaker time to download the model
)

print("Endpoint is deployed.")

Sample test with one record 

In [None]:
from transformers import AutoTokenizer
from sagemaker.s3 import S3Downloader
from datasets import load_from_disk

# Download the validation dataset saved to S3 earlier into the local ./eval_data directory
S3Downloader.download(eval_ds_path, "./eval_data")

evaluation_dataset = load_from_disk("./eval_data")
# Fixed index 120: despite the name, this picks the same record on every run
random_sample = evaluation_dataset[120]

def build_prompt(sample):
  """Build the inference prompt for one record.

  The layout matches the template used to tokenize the training data
  ("### Instruction:\\n...\\n\\n### Input:\\n...\\n\\n### Response:\\n"), with no
  extra trailing spaces, so the model sees the same format at inference time
  as during fine-tuning.

  Args:
    sample: mapping with the keys 'instruction' and 'input'.

  Returns:
    str: the prompt, ending right after the "### Response:" header so the
    model generates the answer.
  """
  return (
      "### Instruction:\n"
      f"{sample['instruction']}\n\n"
      "### Input:\n"
      f"{sample['input']}\n\n"
      "### Response:\n"
  )

def request(sample):
    """Send one record to the deployed endpoint and return the model's reply.

    Args:
        sample: record with the keys 'instruction' and 'input' (passed to build_prompt).

    Returns:
        dict: {"role": "assistant", "content": <generated text with surrounding whitespace stripped>}.
    """
    # Build the prompt from the record that was passed in, not from the global random_sample
    prompt = build_prompt(sample)
    outputs = llm.predict({
      "inputs": prompt,
      "parameters": {
        "max_new_tokens": 512,
        "do_sample": False,
        "return_full_text": False,
        "repetition_penalty": 1.1
      }
    })

    # The generated text is read from the first element of the response
    return {"role": "assistant", "content": outputs[0]["generated_text"].strip()}


# Single quotes inside the f-strings: reusing the enclosing double quote is a
# SyntaxError on Python versions older than 3.12.
print(f"Instruction: {random_sample['instruction']}")
print(f"Input: {random_sample['input']}")
print(f"Actual answer: {random_sample['output']}")
print("#"*40)
print(f"Model's response: \n  {request(random_sample)}")

Delete the resources

In [None]:
# Clean up the deployed resources so they stop incurring cost.
# NOTE(review): the model is deleted before the endpoint; keep this order unless it is
# confirmed that delete_model() still works after the endpoint has been removed.

llm.delete_model()
llm.delete_endpoint()