# **Connect the runtime to the T4 GPU before running the Model's cells**

In [None]:
#model dependencies
#each line shells out to pip; --upgrade pulls the newest release unless a version is pinned
#NOTE(review): everything except transformers is unpinned, so a re-run may resolve different versions — consider pinning
!pip install --upgrade torch
#NOTE(review): confirm this exact transformers version exists on PyPI and supports the Llama 3.2 checkpoint used in this notebook
!pip install --upgrade transformers==4.41.3 #pin added while trying to fix a failing import of AutoModelForCausalLM from transformers
!pip install --upgrade datasets
!pip install --upgrade huggingface_hub
!pip install --upgrade accelerate
!pip install --upgrade bitsandbytes #for quantization
!pip install --upgrade peft #for low rank adapter
!pip install --upgrade trl #for SFT training
!pip install --upgrade colored

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# **Logging in**

In [None]:
#getting hf tokens to perform operations with their API
#the tokens are read from Colab's user secrets, so they never appear in the notebook itself
from google.colab import userdata
readToken = userdata.get('readToken')
#writeToken is not used in this cell — presumably needed later to push the trained model; verify
writeToken = userdata.get('writeToken')

from huggingface_hub import login

#authenticate this session with the read token
login(readToken)

# **Loader**

In [None]:
from datasets import get_dataset_split_names

def splitLoader(Repos, Split="train"):
  """Load one split from each dataset repo, asking for alternatives when it is missing.

  Used for model training and testing. For every repo id in ``Repos`` the
  requested ``Split`` is loaded with ``load_dataset``. If that fails, the
  exception is printed, the splits that do exist are listed, and the user is
  prompted for replacement split names until they enter nothing, enter a name
  that is not available, or have entered as many names as there are splits.

  Args:
    Repos: iterable of dataset repo ids.
    Split: split to load from every repo (the train split by default).

  Returns:
    ``(Names, dfs)``: two parallel lists. ``Names`` holds the repo id for a
    directly loaded split, or ``"<repo>_<split>"`` for a split chosen at the
    prompt; ``dfs`` holds the matching loaded datasets.
  """
  Names = []
  dfs = []
  for dataset in Repos:
    #keep the try body minimal so only load failures reach the fallback prompt
    try:
      loaded = load_dataset(dataset, split=Split)
    except Exception as e:
      print(f"Exception received: {e}")
    else:
      #append to both lists together so Names and dfs always stay aligned
      dfs.append(loaded)
      Names.append(dataset)
      continue

    #requested split is unavailable: show what exists before asking the user
    availableSplits = get_dataset_split_names(dataset)
    print(f"Available splits for {dataset}: {availableSplits}")
    splitLen = len(availableSplits)
    inp = input("Enter a different split: ").strip()
    c = 0
    while (inp != "") and (inp in availableSplits) and (c < splitLen):
      dfs.append(load_dataset(dataset, split=inp))
      Names.append(dataset + "_" + inp)
      inp = input("Enter another split for the same dataset [do not enter anything otherwise]: ").strip()
      c += 1
  return Names, dfs


# **All imports necessary**

In [None]:
import random
from textwrap import dedent
from typing import Dict, List
import matplotlib as mlp
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import torch
from colored import Back, Fore, Style
from datasets import Dataset, load_dataset
from matplotlib.ticker import PercentFormatter

from peft import (
    LoraConfig, PeftModel, TaskType, get_peft_model, prepare_model_for_kbit_training
    )

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm

from transformers import (
     AutoTokenizer, BitsAndBytesConfig, pipeline
)
from transformers import AutoModelForCausalLM
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer


In [None]:
#setting up important globals

def seedAll(seed):
  """Seed every RNG this notebook depends on with the same value.

  Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
  PyTorch, in that order.
  """
  for applySeed in (random.seed, np.random.seed, torch.manual_seed):
    applySeed(seed)

#fix the seed once, up front, so later cells start from a known RNG state
seedAll(47)

PAD_TOKEN = "<|pad|>" #the token used for padding so that batched tensors have sizes that are independent
#of string length variability

MODEL_NAME = "meta-llama/Llama-3.2-3B" #base model path on hf
#NOTE(review): "Humurous" looks like a misspelling of "Humorous" — confirm before a repo is published under this name
NEW_MODEL = "Llama-3.2-3B-Humurous-STEM-Tutor" #trained model's name


In [None]:

#sanity check that a GPU runtime is attached before any model is loaded
print(torch.cuda.is_available())
#`device` holds the name string of CUDA device 0, not a torch.device object
#NOTE(review): presumably this raises when no GPU is attached — confirm
device = torch.cuda.get_device_name(0)
print(device)

True
Tesla T4


# **Model Configs**

In [None]:
#configure the model to store its parameters in 4-bit representation using normalized float 4 (nf4)
#and when it actively uses the parameters in training/inference it extends them to 16-bit floats
#float16 is used instead of bfloat16 because it's natively supported on Tesla T4 GPUs; bfloat16 is preferable
#on TPUs and higher GPUs like the A100, but is not supported on the T4
quantizationConfig = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16,
)
#use original meta tokenizer for the Llama 3.2 3B model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

#adding pad token to the tokenizer
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.padding_side = "right" #refines model output and controls the issue of generated text repetition

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, quantization_config = quantizationConfig,
    #torch_dtype is a from_pretrained argument, not a BitsAndBytesConfig one, so it is set here;
    #it controls the dtype of the weights that are not quantized
    torch_dtype = torch.float16,
    #attn_implementation = "flash_attention_2",
    #attn_implementation = "sdpa", #different attention implementations for the transformer
    device_map = "auto", #defaults to the GPU if available
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
#resize the embedding matrix to cover the tokenizer's vocabulary, which now includes the added pad token;
#pad_to_multiple_of=8 rounds the new size up to a multiple of 8
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
#display the resulting model configuration
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"


Checking important sequence tokens in the tokenizer

In [None]:
#print each special token next to its id: begin-of-sequence, end-of-sequence, then padding
print(tokenizer.bos_token, tokenizer.bos_token_id, tokenizer.eos_token, tokenizer.eos_token_id, tokenizer.pad_token, tokenizer.pad_token_id)
#look the pad token up by its string as well; it should match the pad_token_id printed above
tokenizer.convert_tokens_to_ids(PAD_TOKEN)

<|begin_of_text|> 128000 <|end_of_text|> 128001 <|pad|> 128256


128256

In [None]:
#no split is requested, so `dataset` is indexed by split name in later cells (e.g. dataset["train"])
dataset = load_dataset("O047/prepSIFT_Code1") #testing the training pipeline and improvement on preprocessed magicoder-evol-instruct

README.md:   0%|          | 0.00/326 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/136M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/111183 [00:00<?, ? examples/s]

# **Converting to a pandas dataframe for training**

In [None]:
#enables more powerful preprocessing

def convertDatasetToDataframe(dataset, split="train"):
  """Copy one split of a dataset into a pandas DataFrame.

  Args:
    dataset: mapping from split name to a split object that exposes
      ``.features`` (a mapping keyed by column name) and yields one
      dict-like row per iteration.
    split: name of the split to convert (the train split by default).

  Returns:
    A DataFrame with one column per feature, in feature order. An empty
    split still yields a frame that carries the expected columns.

  Raises:
    ValueError: if the split declares no features.
  """
  splitData = dataset[split]
  datasetColumns = list(splitData.features.keys())
  if not datasetColumns:
    raise ValueError("dataset must have at least 1 feature.")

  rows = [{column : row[column] for column in datasetColumns} for row in splitData]
  #passing columns explicitly keeps the schema even when the split has no rows
  return pd.DataFrame(rows, columns=datasetColumns)

#convert the default ("train") split of the loaded dataset
df = convertDatasetToDataframe(dataset)

#preview the first rows
df.head()

Unnamed: 0,instruction,response
0,Please amend the subsequent Python script so t...,```python\n# Establish an integer list\narr = ...
1,"i've got this python code from an ocr tool, bu...",There are several issues with your code like f...
2,Create a recursive function in Java that predi...,You can achieve this using a combination of re...
3,Develop a program that uses natural language p...,This task requires writing of a significant vo...
4,I am trying to create a load testing script fo...,The correct approach would depend on the preci...


In [None]:
#total number of missing values across every column, as a plain Python int
df.isnull().sum().sum().item() #0 means the frame is clean

0

# **RAG preprocessor (dataset augmentation)**

In [None]:
#these formatters turn one dataset row into a single chat-formatted string for the model
#format_example_withCTX embeds a retrieved context in the user turn; format_example is for datasets without contexts
#both are very specific to the dataset at hand, hence very liable to experimentation and changes
#any prompts that are forwarded to the model should ideally be formatted here first too (used in both training and testing)

#layout used when the tokenizer ships without a chat template; it is filled in with str.format
#(it is NOT a Jinja template, so it must never be assigned to tokenizer.chat_template)
#NOTE(review): <s> and </s> differ from the bos/eos strings this notebook's tokenizer reports — confirm they are intended
_FALLBACK_CHAT_LAYOUT = """<s>[INST] <<SYS>>
{system_message}
<</SYS>>

{user_message} [/INST] {assistant_message}</s>
"""

#user-turn layout for context-augmented prompts; kept free of indentation so multi-line
#instructions/contexts cannot disturb it
_CTX_PROMPT_LAYOUT = "\n{instruction}\n\nInformation:\n\n```\n{context}\n```\n"


def _render_chat(messages, tok=None):
  """Render a system/user/assistant message list to one string without mutating the tokenizer."""
  tok = tokenizer if tok is None else tok
  if tok.chat_template is not None:
    return tok.apply_chat_template(messages, tokenize=False)
  content = {message["role"]: message["content"] for message in messages}
  return _FALLBACK_CHAT_LAYOUT.format(
      system_message=content["system"],
      user_message=content["user"],
      assistant_message=content["assistant"],
  )


def format_example_withCTX(row: dict, tok=None):
  """Format a row that has a retrieved context into one chat-formatted string.

  Args:
    row: mapping with "instruction", "context" and "response" entries.
    tok: tokenizer to render with; defaults to the notebook-level ``tokenizer``.

  Returns:
    The rendered conversation. The tokenizer's own chat template is used when
    it has one, otherwise the fallback layout defined above.
  """
  prompt = _CTX_PROMPT_LAYOUT.format(instruction=row["instruction"], context=row["context"])
  messages = [
      {
          "role": "system",
          "content": "You are a retrieval-based assistant. Only use the provided information to answer the question. Do not add any extra details or assumptions. If the information is unclear or incomplete, say so explicitly.",
      },
      {"role": "user", "content": prompt},
      {"role": "assistant", "content": row["response"]}
  ]
  return _render_chat(messages, tok)

def format_example(row: dict, tok=None):
  """Format a row without context into one chat-formatted string.

  Args:
    row: mapping with "instruction" and "response" entries.
    tok: tokenizer to render with; defaults to the notebook-level ``tokenizer``.

  Returns:
    The rendered conversation for this row, so every row gets its own text.
  """
  prompt = row["instruction"]
  messages = [
      {
          "role": "system",
          #NOTE(review): "informtation" is misspelled in this prompt; left untouched because it changes the training text
          "content": "You are a retrieval-based assistant. Use only the informtation to answer the question.",
      },
      {"role": "user", "content": prompt},
      {"role": "assistant", "content": row["response"]}
  ]
  return _render_chat(messages, tok)

In [None]:
#build the "text" column by formatting every row (axis=1 hands the formatter one row at a time)
df["text"] = df.apply(format_example, axis=1)
#preview the result, including the new column
df.head()

Unnamed: 0,instruction,response,text
0,Please amend the subsequent Python script so t...,```python\n# Establish an integer list\narr = ...,<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\...
1,"i've got this python code from an ocr tool, bu...",There are several issues with your code like f...,<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\...
2,Create a recursive function in Java that predi...,You can achieve this using a combination of re...,<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\...
3,Develop a program that uses natural language p...,This task requires writing of a significant vo...,<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\...
4,I am trying to create a load testing script fo...,The correct approach would depend on the preci...,<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>\...


In [None]:

#baseline text-generation pipeline on the untouched base model
#NOTE(review): this loads a second, non-quantized copy of the model alongside `model` — check GPU memory headroom
model_id = MODEL_NAME #reuse the notebook-wide model path instead of repeating the literal

pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.float16, #float16 to match the T4 dtype choice made in the quantization config cell
    device_map="auto"
)
pipe.model = torch.compile(pipe.model)

#this pipeline's tokenizer is loaded separately from `tokenizer`, so it gets its own pad token: eos is reused
pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id

pipe("generate cpp code that prints hello world")

In [None]:
#some pipe tests: show the first generation entry for a minimal prompt
pipe("Hello")[0]

In [None]:
#inspect the structure of a pipeline result: container type, element type, number of generations, raw value
resp = pipe("ye boi")
print(type(resp))
print(type(resp[0]))
print(len(resp))
print(resp)

In [None]:
#sampled generation with explicit decoding settings; only the newly generated text is kept
prompt = "explain matrices like I am 5"
response = pipe(
    prompt,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    eos_token_id=pipe.tokenizer.eos_token_id, #read from the tokenizer instead of hard-coding 128001
    return_full_text=False, #let the pipeline drop the prompt rather than slicing by character count
)[0]["generated_text"].strip()
print(response)