## **Installing dependencies** 

In [18]:
import torch
torch.cuda.empty_cache()

In [2]:
# we use the latest version of transformers, peft, and accelerate
!pip install -q accelerate peft transformers

# install bitsandbytes for quantization
!pip install -q bitsandbytes

# install trl for the SFT library
!pip install -q trl

# we need sentencepiece for the llama2 slow tokenizer (not necessary for mistral)
!pip install sentencepiece

# we need einops, used by falcon-7b, llama-2 mistral etc
# einops (einsteinops) is used to simplify tensorops by making them readable
!pip install -q -U einops

# we need to install datasets for our training dataset
!pip install -q datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[

Keeping track on GPU resources

In [1]:
import torch

def print_gpu_memory_usage():
    # Assuming you have at least one CUDA-capable GPU and want to check the first one
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0) 
    a = torch.cuda.memory_allocated(0)
    f = r-a  # free inside reserved

    print(f"Total GPU Memory: {t * 1e-9:.2f} GB")
    print(f"Reserved Memory: {r * 1e-9:.2f} GB")
    print(f"Allocated Memory: {a * 1e-9:.2f} GB")
    print(f"Free (inside reserved) Memory: {f * 1e-9:.2f} GB")

# Then, you can call this function at the end of each cell where you want to check the GPU memory usage
print_gpu_memory_usage()


Total GPU Memory: 34.07 GB
Reserved Memory: 0.00 GB
Allocated Memory: 0.00 GB
Free (inside reserved) Memory: 0.00 GB


## **Loading the model** 

In [1]:
# The model that we want to train from the Hugging Face hub
model_name = "codellama/CodeLlama-7b-hf"

# The instruction dataset to use found on HuggingFace
dataset_name = "MaamarM/SAS_training"

# Fine-tuned model name
new_model = "Mistral-7B-SAS"

# Output directory where the model predictions and checkpoints will be stored
#This is to push our fine tuned model in the Hugging Face platform
output_dir = "./Mistral_Instruct_SAS"

# Number of training epochs
num_train_epochs = 8

In [2]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    logging,
)

# load the quantized settings, we're doing 4 bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # use the gpu
    device_map={"": 0}
)

# don't use the cache
model.config.use_cache = False

# Load the tokenizer from the model (mistral)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

2024-04-11 16:42:52.305819: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-11 16:42:52.368940: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## **Testing on some prompts** 

In [15]:
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

parquet_file = r'test_data.parquet'
df=pd.read_parquet(parquet_file, engine = 'pyarrow')
df

Unnamed: 0,content,Annotation
4829,*SAS code;\nPROC REG DATA= REG_SERIES; MODEL y...,Perform a linear regression analysis on the da...
5081,%let name=sct4;\n\n/* \nSet your current-worki...,"Set a variable named ""name"" to ""sct4"", create ..."
5011,options bufsize=32768 pagesize=10000;\n\ndata ...,"Define the options ""bufsize"" and ""pagesize"" to..."
4625,"LIBNAME X ""/folders/myfolders/X"";\ndata brfss_...","""Set the library location to '/folders/myfolde..."
3838,*Ex15_macro_part_of_SAS_statement.sas;\noption...,Please convert the following SAS programming l...
...,...,...
3819,Parent child kinship problem chain\n\n WPS an...,"Create an intermediate dataset named ""int"" by ..."
10089,/*********************************************...,Create an LLM model that can replicate the fun...
8740,Cross Platform Unzip Medicare Point of Sevice ...,Instructions for an LLM model to match the giv...
7831,"%include ""HIDI_init.sas"" ; /* File path remove...","""Include the following SAS files in the order ..."


In [29]:
results = []
def generate_script(instruction):
    result = pipe(f"[INST] {instruction} [/INST]")
    generated_text = result[0]['generated_text']
    cleaned_output = generated_text.replace(f"[INST] {instruction} [/INST]", "").strip()
    return instruction, cleaned_output

# Use ThreadPoolExecutor to parallelize script generation
with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers based on your hardware
    future_to_inst = {executor.submit(generate_script, inst): inst for inst in df['Annotation'].iloc[:100]}
    for future in tqdm(as_completed(future_to_inst), total=len(future_to_inst), desc="Generating Scripts"):
        instruction, script = future.result()
        results.append({"instruction": instruction, "generated_script": script})

# Convert results to DataFrame and save to Parquet
df_results = pd.DataFrame(results)
df_results.to_parquet('generated_scripts.parquet')

Generating Scripts: 100%|██████████| 100/100 [2:57:47<00:00, 106.68s/it] 


In [30]:
parquet_file = r'generated_scripts.parquet'
df=pd.read_parquet(parquet_file, engine = 'pyarrow')
df

Unnamed: 0,instruction,generated_script
0,"""Set the library location to '/folders/myfolde...","To set the SAS library location, use the follo..."
1,Please convert the following SAS programming l...,To create an LLM model instruction for the giv...
2,Rename a file located in a specific path using...,To rename the files using the given file names...
3,Perform a linear regression analysis on the da...,"To perform a linear regression analysis, save ..."
4,"Set a variable named ""name"" to ""sct4"", create ...","```scss:scss\n\nname <- ""sct4""\nfile.edit(""."",..."
...,...,...
95,"Set the library named ""es"" to the directory ""/...","To accomplish the tasks you described, you can..."
96,Create SAS informats for missing values using ...,To create SAS informats for missing values usi...
97,Create an LLM model that can replicate the fun...,"To create an LLM model named ""lscmd"" version 1..."
98,Create a function that creates an Application ...,Here's a VBA function that creates an Applicat...


In [32]:
df['generated_script'].iloc[0]

'To set the SAS library location, use the following command:\n\n```sas\nOPTIONS NOXWAIT NOXSYNC;\nLIBNAME lib "folders/myfolders/X" SERVER;\n```\n\nReplace "X" with the name of your library.\n\nNow, load the dataset \'Chap6_1\' from the library \'X\' into a new dataset named \'brfss_a\':\n\n```sas\nDATA brfss_a;\n    SET lib.Chap6_1;\nRUN;\n```\n\nFinally, use the \'PROC CONTENTS\' procedure to display variable information for the dataset \'brfss_a\':\n\n```sas\nPROC CONTENTS DATA=brfss_a;\nRUN;\n```\n\nThis will display the variable names and their corresponding variable numbers in the SAS log.'

## **Fine tuning sur SAS**

In [None]:
from datasets import load_dataset

data_files = {"train":"formatted_training.jsonl"}

# Load the dataset
dataset = load_dataset(dataset_name, data_files = data_files , split="train[:50%]")

In [8]:
print_gpu_memory_usage()

Total GPU Memory: 34.07 GB
Reserved Memory: 10.23 GB
Allocated Memory: 10.04 GB
Free (inside reserved) Memory: 0.19 GB


In [7]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"


In [10]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,      # uses the number of epochs earlier
    per_device_train_batch_size=1,          # 4 seems reasonable
    gradient_accumulation_steps=1,          # 2 is fine, as we're a small batch
    optim="paged_adamw_32bit",              # default optimizer
    save_steps=0,                           # we're not gonna save
    logging_steps=10,                       # same value as used by Meta
    learning_rate=2e-4,                     # standard learning rate
    weight_decay=0.001,                     # standard weight decay 0.001
    fp16=False,                             # set to true for A100
    bf16=False,                             # set to true for A100
    max_grad_norm=0.3,                      # standard setting
    max_steps=-1,                           # needs to be -1, otherwise overrides epochs
    warmup_ratio=0.03,                      # standard warmup ratio
    group_by_length=True,                   # speeds up the training
    lr_scheduler_type="cosine",           # constant seems better than cosine
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,                # use our lora peft config
    dataset_text_field="text",
    max_seq_length=None,                    # no max sequence length
    tokenizer=tokenizer,                    # use the llama tokenizer
    args=training_arguments,                # use the training arguments
    packing=False,                          # don't need packing
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)



{'loss': 1.0357, 'grad_norm': 0.032106101512908936, 'learning_rate': 7.633587786259543e-06, 'epoch': 0.01}
{'loss': 0.9167, 'grad_norm': 0.04357542842626572, 'learning_rate': 1.5267175572519086e-05, 'epoch': 0.02}
{'loss': 0.9073, 'grad_norm': 0.04950644448399544, 'learning_rate': 2.2900763358778628e-05, 'epoch': 0.03}
{'loss': 1.137, 'grad_norm': 0.09392911195755005, 'learning_rate': 3.053435114503817e-05, 'epoch': 0.04}
{'loss': 1.9872, 'grad_norm': 0.6839531660079956, 'learning_rate': 3.816793893129771e-05, 'epoch': 0.05}
{'loss': 0.8592, 'grad_norm': 0.08970703184604645, 'learning_rate': 4.5801526717557256e-05, 'epoch': 0.05}
{'loss': 0.8754, 'grad_norm': 0.0978793352842331, 'learning_rate': 5.3435114503816794e-05, 'epoch': 0.06}
{'loss': 0.9345, 'grad_norm': 0.11809295415878296, 'learning_rate': 6.106870229007635e-05, 'epoch': 0.07}
{'loss': 1.0984, 'grad_norm': 0.18466901779174805, 'learning_rate': 6.870229007633588e-05, 'epoch': 0.08}
{'loss': 1.2895, 'grad_norm': 0.573104679584

In [11]:
trainer.model.push_to_hub("llama2_SAS_finetuned_50_test")
tokenizer.push_to_hub("llama2_SAS_finetuned_50_test")

adapter_model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ayman56/llama2_SAS_finetuned_50_test/commit/c00c4c68988fb8e2a24f12cedf218e711b152541', commit_message='Upload tokenizer', commit_description='', oid='c00c4c68988fb8e2a24f12cedf218e711b152541', pr_url=None, pr_revision=None, pr_num=None)