In [None]:
%%capture
# Normally using pip install unsloth is enough

# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft "trl<0.15.0" triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
import torch
import json

from trl import SFTTrainer
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, TextStreamer, DataCollatorForSeq2Seq
from unsloth.chat_templates import get_chat_template, standardize_sharegpt
from unsloth import FastLanguageModel, is_bfloat16_supported

In [2]:
# Load model
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Prepare model for PEFT
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    bias="none",
    random_state = 3407
)
print(model.print_trainable_parameters())


==((====))==  Unsloth 2025.2.4: Fast Llama patching. Transformers: 4.48.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.2.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 2.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039
None


In [3]:
# Load and format the custom JSON data
def load_custom_dataset(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Convert the data into the format expected by the model
    formatted_data = {
        "conversations": [
            [
                {
                    "role": "user",
                    "content": item["inputCode"]
                },
                {
                    "role": "assistant",
                    "content": item["outputText"]
                }
            ]
            for item in data["trainingData"]
        ]
    }

    # Create a Dataset object
    dataset = Dataset.from_dict(formatted_data)
    return dataset

In [None]:
# Load your custom dataset
json_path = "data/teso_train_data.json"
dataset = load_custom_dataset(json_path)

In [5]:
# Set up the tokenizer with the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

In [6]:
dataset[0]

{'conversations': [{'content': 'import os\nfrom diagrams import Diagram, Cluster, Edge\nfrom diagrams.onprem.analytics import Spark\nfrom diagrams.onprem.compute import Server\nfrom diagrams.onprem.database import PostgreSQL\nfrom diagrams.onprem.inmemory import Redis\nfrom diagrams.onprem.aggregator import Fluentd\nfrom diagrams.onprem.monitoring import Grafana, Prometheus\nfrom diagrams.onprem.network import Nginx\nfrom diagrams.onprem.queue import Kafka\n\n# Create the Diagram instance with outformat="png"\ndiagram = Diagram(\n    "Advanced Web Services Open Source",\n    outformat="png",\n    show=not bool(os.environ.get(\'CI\', 0))\n)\n\nwith diagram:\n    nginx = Nginx("Nginx Ingress")\n\n    metrics = Prometheus("Promtheus metrics")\n    metrics << Edge(color="firebrick", style="dashed") << Grafana("Grafana monitoring")\n\n    with Cluster("Service Cluster"):\n        grpcsvc = [\n            Server("grpc1"),\n            Server("grpc2"),\n            Server("grpc3")\n        ]\

In [7]:
# Standardize and format the dataset
dataset = standardize_sharegpt(dataset)

dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=dataset.column_names
)

Standardizing format:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

In [8]:
dataset[0]

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nimport os\nfrom diagrams import Diagram, Cluster, Edge\nfrom diagrams.onprem.analytics import Spark\nfrom diagrams.onprem.compute import Server\nfrom diagrams.onprem.database import PostgreSQL\nfrom diagrams.onprem.inmemory import Redis\nfrom diagrams.onprem.aggregator import Fluentd\nfrom diagrams.onprem.monitoring import Grafana, Prometheus\nfrom diagrams.onprem.network import Nginx\nfrom diagrams.onprem.queue import Kafka\n\n# Create the Diagram instance with outformat="png"\ndiagram = Diagram(\n    "Advanced Web Services Open Source",\n    outformat="png",\n    show=not bool(os.environ.get(\'CI\', 0))\n)\n\nwith diagram:\n    nginx = Nginx("Nginx Ingress")\n\n    metrics = Prometheus("Promtheus metrics")\n    metrics << Edge(color="firebrick", style="dashed") << Grafana("Grafana monitoring")

In [9]:
trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing=False,
    args=TrainingArguments(
        learning_rate=2e-5,
        lr_scheduler_type="linear",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=10,
        output_dir="output",
        seed=3047,
        report_to="tensorboard",
    ),
)


Map (num_proc=2):   0%|          | 0/61 [00:00<?, ? examples/s]

In [10]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

In [11]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nimport os\nfrom diagrams import Diagram, Cluster, Edge\nfrom diagrams.onprem.analytics import Spark\nfrom diagrams.onprem.compute import Server\nfrom diagrams.onprem.database import PostgreSQL\nfrom diagrams.onprem.inmemory import Redis\nfrom diagrams.onprem.aggregator import Fluentd\nfrom diagrams.onprem.monitoring import Grafana, Prometheus\nfrom diagrams.onprem.network import Nginx\nfrom diagrams.onprem.queue import Kafka\n\n# Create the Diagram instance with outformat="png"\ndiagram = Diagram(\n    "Advanced Web Services Open Source",\n    outformat="png",\n    show=not bool(os.environ.get(\'CI\', 0))\n)\n\nwith diagram:\n    nginx = Nginx("Nginx Ingress")\n\n    metrics = Prometheus("Promtheus metrics")\n    metrics << Edge(color="firebrick", style="dashed") << Grafana("Grafana monitoring")\n\n    w

In [12]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[0]["labels"]])

'                                                                                                                                                                                                                                                                                                                                                                                                                                                                             \n\nConsider switching to AWS ElastiCache or Azure Cache instead of Redis, if you want to reduce operational overhead or have outgrown a self-managed Redis cluster.\n Consider switching to Traefik or Envoy if you are already in a Kubernetes ecosystem, because you might need a solution with native auto-discovery, simpler TLS management, or advanced routing features.\n Consider switching from PostgreSQL Database HA to Amazon Aurora (PostgreSQL), Azure Database, or Google Cloud SQL, if you want to simplify high availability and scaling

In [13]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
1.09 GB of memory reserved.


In [14]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 61 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 24
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1,1.2085
2,1.2705
3,1.3172
4,1.3148
5,1.3597
6,0.9903
7,0.9034
8,0.8595
9,0.5805
10,0.3911




In [16]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

58.3132 seconds used for training.
0.97 minutes used for training.
Peak reserved memory = 3.463 GB.
Peak reserved memory for training = 2.373 GB.
Peak reserved memory % of max memory = 23.481 %.
Peak reserved memory for training % of max memory = 16.09 %.


In [17]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

# Optimize for inference
FastLanguageModel.for_inference(model)

# Define input prompt
messages = [
    {"role": "system", "content": "You are an expert in analyzing system architecture written using code.You check the architecture and provide clear and detailed explanations regarding how the architecture can improved for better performance, scalability, maintainability and cost effectiveness. You also check for possible cyber security issues and if the components can be replaced with newer and better components."},
    {"role": "user", "content": """from diagrams import Diagram, Cluster, Edge
from diagrams.aws.compute import Lambda
from diagrams.aws.database import Dynamodb
from diagrams.aws.analytics import Kinesis
from diagrams.aws.ml import ElasticCache
from diagrams.aws.compute import ECS
from diagrams.aws.integration import ApplicationLoadBalancer
from diagrams.aws.devtools import ECR
from diagrams.onprem.client import Users

with Diagram("AWS Streaming Data Processing", show=False, direction="LR"):
    user = Users("Data Ingestion")
    ecr = ECR("Amazon ECR")
    ecs = ECS("Amazon ECS")
    alb = ApplicationLoadBalancer("Application Load Balancer")
    kinesis_stream = Kinesis("Kinesis Stream")
    lambda_function = Lambda("Lambda Function")
    dynamodb = Dynamodb("Amazon DynamoDB")

    with Cluster("Data Updates"):
        elasticache = ElasticCache("ElastiCache Cluster")
        lambda_update = Lambda("Lambda Function")
        kinesis_update = Kinesis("Kinesis Stream")
        core_data_update = Users("Core Data Update")

    user >> alb >> ecs >> kinesis_stream >> lambda_function >> dynamodb
    ecs >> ecr
    elasticache >> lambda_update >> kinesis_update >> core_data_update
"""}
]

# Tokenize input with chat template
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must be True for generation
    return_tensors="pt",
).to("cuda")

# Initialize text streamer for real-time output
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Generate response
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1
)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


It seems like you want to analyze the system architecture of AWS and provide explanations on how to improve it, as well as on how to use the components.

Here's an explanation on how to improve it:

1. **Use AWS API Gateway**: Instead of using EC2 or EC3, use API Gateway to expose your APIs. This will improve performance as it will allow for the handling of API calls.
2. **Use Amazon Elastic Load Balancing (ELB)**: Instead of using ELB or ELB1, use ELB or ELB1. This will improve scalability as it will allow for the handling of multiple instances.



In [18]:
model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 15.55 out of 30.89 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 48.31it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [22]:
model.push_to_hub_merged("gokul-pv/Llama-3.2-1B-Instruct-16bit-TeSO", tokenizer, save_method="merged_16bit", token = "")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 15.57 out of 30.89 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 46.42it/s]


Unsloth: Saving tokenizer...

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.


README.md:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/gokul-pv/Llama-3.2-1B-Instruct-16bit-TeSO
