# Train Adapt-LLM on few examples

In [None]:
#pip install peft

In [None]:
#%pip uninstall peft -y
#%pip install git+https://github.com/huggingface/peft
#%pip install git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08

In [1]:
 !rm -rf /kaggle/working/*

In [None]:
#%pip install git+https://github.com/huggingface/peft
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl 
#%pip install https://pypi.org/simple/ bitsandbytes

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, BitsAndBytesConfig,HfArgumentParser,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from datasets import DatasetDict
from trl import SFTTrainer, SFTConfig
import accelerate
import pandas as pd

In [None]:
# Hugging Face Hub id of the base checkpoint that gets fine-tuned.
model = "AdaptLLM/finance-chat"

# Load base model
# `load_in_4bit=True` requests 4-bit loading explicitly (its documented default
# is False); the `bnb_4bit_*` options below only tune how that is done.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
model_upd = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model_upd.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model_upd.config.pretraining_tp = 1
model_upd.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
tokenizer.padding_side = 'right'
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Notebook display: show the resulting BOS / EOS flags.
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
### Attach the LoRA adapters to the loaded base model
# Adapter settings: rank and alpha are both 512, dropout is 0.1, no bias terms
# are trained, and the adapters target the four attention projections plus
# `gate_proj`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=512,
    lora_alpha=512,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

# Run PEFT's k-bit training preparation first, then wrap the result with the
# adapter configuration above.
model_upd = get_peft_model(prepare_model_for_kbit_training(model_upd), peft_config)

In [None]:
# Hyperparameters
# NOTE(review): `warmup_ratio` is set together with the plain "constant"
# scheduler -- confirm the warm-up is actually applied (transformers also
# offers "constant_with_warmup").
training_arguments = SFTConfig(
    output_dir="./results",
    # --- run length ---
    num_train_epochs=170,  # earlier runs: 100, 40
    max_steps=-1,
    # --- batching ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimiser / schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # --- mixed precision (both switched off) ---
    fp16=False,
    bf16=False,
    # --- logging / checkpoints ---
    logging_steps=25,
    save_steps=425,  # earlier runs: 840, 600, 240
)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a short report on how many of ``model``'s parameters are trainable.

    Despite the name, nothing is printed here: the report is returned and the
    caller decides what to do with it.

    Args:
        model: any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` offers ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable element count, the total
        element count and the trainable share as a percentage (two decimals).

    Raises:
        ZeroDivisionError: if the model exposes no parameter elements at all.
    """
    # One pass over the parameters, keeping just (size, trainable?) per tensor.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    return (
        f"trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {share:.2f}%"
    )

# Show the trainable / total parameter summary for the adapter-wrapped model.
print(print_number_of_trainable_model_parameters(model_upd))

## Tokenization

In [None]:
## GHOST
# Disabled dataset variant: the loader below sits inside a bare string literal,
# so running this cell does not assign `data`, `test_data` or the JSON strings.
# Remove the surrounding quotes to use the Ghost fraud CSVs instead.
'''
data=pd.read_csv("/kaggle/input/update-gh/Fraud_Ghost_end.csv")
test_data=pd.read_csv("/kaggle/input/ghossttt/Test_Fraud_Ghost_end.csv")

#data = data.drop(data.columns[0], axis=1)
#test_data = test_data.drop(test_data.columns[0], axis=1)

json_data = data.to_json(orient='records')
json_test_data = test_data.to_json(orient='records')
'''

In [None]:
## Misclassification
# Disabled dataset variant: the loader below sits inside a bare string literal,
# so running this cell does not assign `data`, `test_data` or the JSON strings.
# Remove the surrounding quotes to use the Misclassification CSVs instead.
# A stray `rch` token that would have raised NameError on re-enabling has been
# removed from the disabled code.
# NOTE(review): the test path ends in `.csv.csv` -- confirm that this matches
# the uploaded file name before re-enabling.
'''
data=pd.read_csv("/kaggle/input/misclasss/New_train_M.csv")
test_data=pd.read_csv("/kaggle/input/misclasss/New_test_M.csv.csv")

#data = data.drop(data.columns[0], axis=1)
#test_data = test_data.drop(test_data.columns[0], axis=1)

json_data = data.to_json(orient='records')
json_test_data = test_data.to_json(orient='records')
'''

In [2]:
## Timesheet

# Training split: read the CSV, then serialise it as one JSON array holding an
# object per row (orient='records').
data = pd.read_csv("/kaggle/input/latest-upd-tsh/Timesheet_100_input_anonym.csv")
json_data = data.to_json(orient='records')

# Evaluation split, handled the same way.
test_data = pd.read_csv("/kaggle/input/latest-upd-tsh/Timesheet_test_30_anonym.csv")
json_test_data = test_data.to_json(orient='records')

#data = data.drop(data.columns[0], axis=1)
#test_data = test_data.drop(test_data.columns[0], axis=1)


NameError: name 'pd' is not defined

In [None]:
def _record_bodies(records_json):
    """Cut a JSON records string into the inner text of each record.

    The string is split on the literal ``},{`` that separates neighbouring
    objects; the first two characters of the first piece and the last two
    characters of the last piece (the enclosing ``[{`` / ``}]``) are removed.
    """
    pieces = records_json.split('},{')
    pieces[0] = pieces[0][2:]
    pieces[-1] = pieces[-1][:-2]
    return pieces


L = _record_bodies(json_data)
L_test = _record_bodies(json_test_data)

# Notebook display: number of training records.
len(L)
#len(L_test)

In [None]:
def formatList(L):
    """Merge neighbouring records that share their first field into one string.

    Each item of ``L`` is the inner text of one JSON record (no surrounding
    braces), for example ``'"id":1,"hours":8,"Explanation":"..."'``. Adjacent
    items whose text before the first comma is identical form one group. For
    every group the result holds a single string made of ``{ <fields> },`` for
    each record -- with everything from ``"Explanation":`` onwards removed --
    followed by one `` "Explanation":<value>`` taken from the last record of
    the group.

    The first record of a group used to be copied in full, so its explanation
    showed up inside the record and again at the end of the string; it is now
    stripped exactly like the records that follow it.

    Args:
        L: list of record strings, ordered so that records belonging together
            are adjacent.

    Returns:
        A new list with one formatted string per group. ``L`` is not modified.

    Raises:
        IndexError: if the last record of a group has no ``"Explanation":`` key.
    """
    marker = '"Explanation":'
    L_res = []
    i = 0

    while i < len(L):
        # Grouping key: the text before the first comma, i.e. the first field.
        # Computed once per group instead of on every comparison.
        group_key = L[i].split(",")[0]
        j = i + 1
        while j < len(L) and L[j].split(",")[0] == group_key:
            j += 1

        # Every record of the group, the first one included, loses its
        # explanation; only the last record's explanation is appended.
        records = "".join("{ " + rec.split(marker)[0] + " }," for rec in L[i:j])
        L_res.append(records + " " + marker + L[j - 1].split(marker)[1])
        print(len(L_res))  # progress output: number of groups built so far
        i = j

    return L_res

In [None]:
L_format = formatList(L)
L_test_format = formatList(L_test)

In [None]:
def _explanation_on_new_line(sample):
    """Put a line break in front of the sample's ``"Explanation":"`` marker.

    Only the text before the first marker and the text between the first and
    the second marker (or up to the end when there is just one) are kept.
    Raises IndexError when the marker does not occur at all.
    """
    parts = sample.split('"Explanation":"')
    return parts[0] + '\n "Explanation":"' + parts[1]


# Rewrite both lists in place so existing references keep seeing the update.
L_format[:] = [_explanation_on_new_line(sample) for sample in L_format]
L_test_format[:] = [_explanation_on_new_line(sample) for sample in L_test_format]
    

In [None]:
L_format

In [None]:
# Timesheet

# Wrap every sample (training and evaluation alike) in the instruction prompt;
# both lists are updated in place.
L_format[:] = [
    "Generate a Timesheet Fraud Scenario : { " + sample + " },"
    for sample in L_format
]
L_test_format[:] = [
    "Generate a Timesheet Fraud Scenario : { " + sample + " },"
    for sample in L_test_format
]

# Notebook display: only the last expression (final training sample) is shown.
L_test_format[-1]
L_format[-1]

In [None]:
# GHOST
# Disabled prompt variant: the code below sits inside a bare string literal, so
# running this cell leaves `L_format` / `L_test_format` untouched. Remove the
# surrounding quotes (and disable the Timesheet cell) to use the Ghost prompt.

'''
for i in range(len(L_format)) : 
    L_format[i]="Generate a Ghost Fraud Scenario : { " + L_format[i] + " },"

for i in range(len(L_test_format)) : 
    L_test_format[i]="Generate a Ghost Fraud Scenario : { " + L_test_format[i] + " },"
    
L_test_format[2]
'''

In [None]:
# Misclassification
# Disabled prompt variant: the code below sits inside a bare string literal, so
# running this cell leaves `L_format` / `L_test_format` untouched. Remove the
# surrounding quotes (and disable the Timesheet cell) to use this prompt.

'''
for i in range(len(L_format)) : 
    L_format[i]="Generate a Misclassification Fraud Scenario : { " + L_format[i] + " },"

for i in range(len(L_test_format)) : 
    L_test_format[i]="Generate a Misclassification Fraud Scenario : { " + L_test_format[i] + " },"
    
L_test_format[2]
'''

### Prepare data format

In [None]:
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset

#dataset = ds.dataset(pa.Table.from_pandas(data).to_batches())

### convert to Huggingface dataset
#hg_dataset = Dataset(pa.Table.from_pandas(data))

In [None]:
def _to_datasets(samples):
    """Turn a list of text samples into the three objects this notebook keeps.

    Returns a tuple of the single-column DataFrame, a pyarrow dataset built
    from its record batches, and a Hugging Face ``Dataset`` wrapping the Arrow
    table of that same frame.
    """
    frame = pd.DataFrame(samples)
    arrow_dataset = ds.dataset(pa.Table.from_pandas(frame).to_batches())
    ### convert to Huggingface dataset
    hf_dataset = Dataset(pa.Table.from_pandas(frame))
    return frame, arrow_dataset, hf_dataset


# Training split
d, dataset, hg_dataset = _to_datasets(L_format)

# Evaluation split
test_d, test_dataset, test_hg_dataset = _to_datasets(L_test_format)

In [None]:
#test_hg_dataset

In [None]:
#hg_dataset

## Train model

In [None]:
# Disabled early-stopping setup: the code below sits inside a bare string
# literal, so no callback is created and `training_arguments` is not changed.
# NOTE(review): `load_best_model_at_end` looks like a training-arguments option
# rather than an `EarlyStoppingCallback` argument, and the callback is never
# handed to a trainer here -- confirm both before re-enabling.
'''
from transformers import EarlyStoppingCallback

# Define early stopping parameters
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of evaluations with no improvement before stopping
    load_best_model_at_end = True
)

training_arguments.evaluation_strategy = "steps"
training_arguments.eval_steps = 25  
'''

In [None]:
# Build the supervised fine-tuning trainer
# NOTE(review): which of these keywords SFTTrainer accepts directly (rather than
# through SFTConfig) appears to depend on the installed TRL release -- confirm
# against the version that `%pip install -U trl` resolves to.
trainer = SFTTrainer(
    # model side
    model=model_upd,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
    # data side
    train_dataset=hg_dataset,
    eval_dataset=test_hg_dataset,
    dataset_text_field="0",  # presumably the default column label of the single-column frame -- verify
    max_seq_length=None,
    packing=False,
)

In [None]:
import os

# The W&B API key is a credential and must not live in the notebook source.
# It is read from the WANDB_API_KEY environment variable; when that is unset
# `key` is None and wandb falls back to its own lookup / interactive prompt.
# The key that used to be hard-coded here should be treated as leaked and
# revoked in the W&B settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Initialize a new wandb run
wandb.init(project="adapt-timesheet-1", name="run-8")

In [None]:
#import torch
#torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop and keep whatever `trainer.train()` returns.
training_output = trainer.train()

____________________________________
# Notes

Include in a RAG file:
- How the attribute values are calculated
- Baseline logic for how posts should be given based on seniority (rule style)
- Information about what year we are currently in (reference)

## Pushing model to huggingface

In [None]:
# Access token
import os

import huggingface_hub

# The Hub token is a credential and must not live in the notebook source.
# It is read from the HF_TOKEN environment variable; when that is unset the
# value is None and `login` asks for the token interactively.
# The token that used to be hard-coded here should be treated as leaked and
# revoked in the Hugging Face account settings.
hg_token = os.environ.get("HF_TOKEN")
huggingface_hub.login(hg_token)

In [None]:
# Save the trained model into the local folder that is later uploaded to the
# Hub from `./adapt-llm-Timesheet-Fr-170xr512`.
trainer.save_model('adapt-llm-Timesheet-Fr-170xr512')

In [None]:
#model_upd.push_to_hub("adapt-llm-Misc-Fr")

In [None]:
#from huggingface_hub import login
#login()

In [None]:
from huggingface_hub import HfApi, create_repo
api = HfApi()

# Create the model repo under the same fully qualified id that the upload
# targets ("FO-UA/..."); without the namespace the repo is created under the
# logged-in account, which only matches when that account is FO-UA.
# `exist_ok=True` lets this cell be re-run once the repo exists.
api.create_repo(
    repo_id="FO-UA/adapt-llm-Timesheet-Fr-170xr512",
    repo_type="model",
    exist_ok=True,
)

In [None]:
# Upload the locally saved model folder to the `FO-UA` model repo on the Hub.
api.upload_folder(    
    folder_path="./adapt-llm-Timesheet-Fr-170xr512",
    repo_id="FO-UA/adapt-llm-Timesheet-Fr-170xr512",
    repo_type="model"
)