In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the fine-tuning dependencies with the %pip magic so they land in the
# environment of the kernel that runs this notebook (a `!pip` shell escape may not).
%pip install peft
# bitsandbytes is requested once, explicitly from the canonical PyPI index.
%pip install -i https://pypi.org/simple/ bitsandbytes
%pip install accelerate

In [None]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)

In [None]:
# Upgrade (-U) the `datasets` package inside the kernel's environment.
%pip install -U datasets

In [None]:
# Install trl (provides SFTTrainer) with %pip so it targets the kernel's environment,
# matching the way `datasets` is installed in this notebook.
%pip install trl

In [None]:
from datasets import load_dataset
from trl import SFTTrainer

In [None]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# State-dict settings for model weights and optimizer state: both use CPU offload
# and have rank0_only disabled.
model_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
optim_state_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

# Bundle both settings into the FSDP plugin and hand it to the Accelerator.
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=model_state_cfg,
    optim_state_dict_config=optim_state_cfg,
)
# NOTE(review): `accelerator` is not referenced again in the visible cells — confirm it is still needed.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
# Hub id of the base checkpoint to fine-tune.
model_name = "Hugofernandez/Mistral-7B-v0.1-colab-sharded"
# Name of the device to run on.
device = 'cuda'
# Load the fast tokenizer published with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Reuse the unknown token (string and id) as the padding token.
tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
# Quantization (see https://huggingface.co/docs/optimum/concept_guides/quantization)
# shrinks the model so that it fits on a single GPU.
# 4-bit NF4 weights with double quantization; computation runs in float16.
compute_dtype = torch.float16
print(compute_dtype)
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Explicit %pip magic instead of a bare `pip` line, which only works while IPython's
# automagic is enabled. The same install already appears in an earlier cell.
%pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
# Load the base checkpoint, applying the 4-bit quantization config at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},  # everything on device index 0; author's note: device_map="auto" causes a problem in training
    use_flash_attention_2=False,  # author's note: set to True if you are using an A100
    quantization_config=bnb_config,
)

In [None]:
# Print the module tree of the loaded model so its layer types can be inspected.
print(model)
# The quantized linear layers are expected to appear as Linear4bit in this output — verify when running.

In [None]:
# LoRA adapter settings for causal language modelling: rank 16, alpha 16, 5% dropout,
# no bias training, applied to the attention/MLP projections and the LM head listed below.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "k_proj",
        "q_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "down_proj",
        "up_proj",
        "lm_head",
    ],
)

In [None]:
# Prepare the quantized model for k-bit training (author's note: casts some modules to fp32).
model = prepare_model_for_kbit_training(model)
# Author's note: gradient checkpointing is used by default but is not compatible with caching.
model.config.use_cache = False
# Make the model config agree with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Hyperparameters handed to the trainer, grouped by concern.
training_arguments = TrainingArguments(
    # --- output / checkpoints ---
    output_dir="./results",  # directory in which checkpoints are saved
    save_steps=500,  # number of steps between checkpoints
    # --- evaluation / logging ---
    evaluation_strategy="epoch",  # can be set to 'steps' to evaluate every eval_steps
    logging_steps=20,  # steps between loss logs; adapt to the dataset size
    log_level="debug",  # alternatives: 'info', 'warning', 'error', 'critical'
    # --- batching ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Accumulating gradients changes the size of a "step": logging, evaluation and saving
    # then happen every gradient_accumulation_steps * xxx_steps batches.
    gradient_accumulation_steps=1,
    # --- optimisation ---
    optim="paged_adamw_8bit",  # author's note: used with QLoRA
    learning_rate=4e-4,  # a hyperparameter worth experimenting with
    num_train_epochs=1,
    lr_scheduler_type="constant",
    # NOTE(review): with lr_scheduler_type="constant" this warmup value appears to be unused;
    # transformers documents a separate "constant_with_warmup" schedule — confirm which was intended.
    warmup_steps=100,
)

In [None]:
import os

# Root of the Kaggle input tree.
root_dir = '/kaggle/input'

# Print the full path of every file shipped with the "company-slogans" dataset.
# Nothing is created here: the previous unused `dataset_folder` variable and its
# "create the folder" comment were removed because no directory was ever made.
for dirname, _, filenames in os.walk(os.path.join(root_dir, 'company-slogans')):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def split_csv(input_file, train_output_file, test_output_file, test_size=0.2, random_state=None):
    """Split a slogans CSV into a training CSV and a testing CSV.

    The input is read with pandas, every row containing a missing value is dropped,
    and a ``combined`` column is added that formats each row as
    ``### Product: <company> \\n### Slogan: <slogan>``. The rows are then divided
    with ``train_test_split`` and each part is written without the index column.

    Args:
        input_file: Path of the CSV to read; it must have ``company`` and ``slogan`` columns.
        train_output_file: Destination path for the training rows.
        test_output_file: Destination path for the testing rows.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split``; pass a value for a reproducible split.
    """
    frame = pd.read_csv(input_file).dropna()
    # Build the prompt text the model is trained on.
    frame = frame.assign(
        combined='### Product: ' + frame['company'] + ' \n### Slogan: ' + frame['slogan']
    )

    train_part, test_part = train_test_split(frame, test_size=test_size, random_state=random_state)

    # Persist both parts, training rows first.
    for part, destination in ((train_part, train_output_file), (test_part, test_output_file)):
        part.to_csv(destination, index=False)

# Split the Kaggle slogans CSV 80/20 with a fixed seed.
# The two output files are written relative to the current working directory.
input_file = '/kaggle/input/company-slogans/slogans.csv'
train_output_file = 'train.csv'
test_output_file = 'test.csv'
split_csv(
    input_file=input_file,
    train_output_file=train_output_file,
    test_output_file=test_output_file,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the train/test CSV files written by split_csv into a DatasetDict.
# The training text lives in the "combined" column of each file.
data_files = {'train': "/kaggle/working/train.csv", 'test': "/kaggle/working/test.csv"}
dataset = load_dataset('csv', data_files=data_files)

# Alternatives for other data sources:
#   - a JSON-lines file: load_dataset("json", data_files="path/to/dataset.jsonl", split="train")
#   - a dataset hosted on https://huggingface.co/datasets: load_dataset({DATASET_PATH})
# For chat-style data, verify the chat template first and apply it to the data:
#   tokenizer.apply_chat_template(chat, tokenize=False)

# Preview a handful of formatted test prompts. This runs only after `dataset` exists,
# so the notebook survives Restart & Run All, and it avoids dumping the whole column.
dataset['test'][:5]['combined']

In [None]:
# Supervised fine-tuning trainer wiring together the model, tokenizer, LoRA config and data.
# NOTE(review): trl is installed without a version pin; the keyword names used here
# (dataset_text_field, max_seq_length, tokenizer) may differ between trl releases — confirm
# against the installed version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="combined",  # column holding the formatted prompt text
    max_seq_length=1024,
    # packing is left at its default; the author kept `packing = True` commented out
)

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    The total comes from ``model.num_parameters()``; the trainable count is the sum of
    ``numel()`` over every entry of ``model.named_parameters()`` whose ``requires_grad``
    is true. One line is printed with both counts and the trainable percentage.
    """
    total = model.num_parameters()
    trainable = sum(
        weights.numel() for _, weights in model.named_parameters() if weights.requires_grad
    )
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )
# Report the trainable-parameter share of the current `model` object.
print_trainable_parameters(model)

In [None]:
# Run an evaluation pass before training; the returned metrics are shown as the cell output.
trainer.evaluate()

In [None]:
# Launch the fine-tuning run with the trainer configured above.
trainer.train()

In [None]:
# Sanity-check the fine-tuned model: let it complete a slogan for a prompt written in
# the same "### Product / ### Slogan" format as the training text.
eval_prompt = "### Product: Mercedes Car \n### Slogan: The"

# Put the inputs on whatever device the model lives on instead of hard-coding "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

model.eval()  # switch to inference mode for generation
with torch.no_grad():
    generated = model.generate(
        **model_input,
        max_new_tokens=64,
        # Pad with the tokenizer's end-of-sequence id rather than the magic number 2.
        pad_token_id=tokenizer.eos_token_id,
    )
    print(tokenizer.decode(generated[0], skip_special_tokens=True))

In [None]:
# Put the model back into training mode after the model.eval() call used for generation.
model.train()

In [None]:
# Directory name (relative to the working directory) under which the trained model is saved.
new_model = 'MistralAI_QLoRa_Ads'
# Save the trainer's model there.
# NOTE(review): presumably only the LoRA adapter weights are written, since the trainer was built with a peft_config — confirm.
trainer.model.save_pretrained(new_model)

In [None]:
# Reload the base model without quantization so the adapter can be merged into it.
# Loading in float16 with low_cpu_mem_usage avoids the default float32 load, which for a
# 7B-parameter model needs roughly 28 GB of host memory and is likely to exhaust the session's RAM.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

In [None]:
# Attach the adapter saved under `new_model` to the freshly loaded base model.
peft_model = PeftModel.from_pretrained(base_model, new_model)
# Merge the adapter into the base weights and keep the resulting model for saving and upload.
merged_model = peft_model.merge_and_unload()

In [None]:
# Directory that receives the merged model and the tokenizer files.
# NOTE(review): this path is outside /kaggle/working, which the notebook's header comment
# names as the directory preserved as output — confirm the location is intended.
output_merged_dir = "/content/MistralAI_finetuned_Ads"

# Create the directory (no error if it already exists), then write model and tokenizer into it.
os.makedirs(output_merged_dir, exist_ok=True)
merged_model.save_pretrained(output_merged_dir, safe_serialization = False)
tokenizer.save_pretrained(output_merged_dir)

In [None]:
import locale
# Replace locale.getpreferredencoding so that it always reports "UTF-8".
# NOTE(review): presumably a workaround so the shell escape in the next cell runs under a
# UTF-8 locale — confirm it is still required in this environment.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Authenticate against the Hugging Face Hub through the CLI (shell escape).
# NOTE(review): this login is interactive — confirm it works in a non-interactive "Save & Run All" run.
!huggingface-cli login

# Upload the merged model to the Hub repository.
# NOTE(review): `check_pr` does not look like a documented push_to_hub argument
# (`create_pr` is) — confirm what was intended.
merged_model.push_to_hub("0sparsh2/MistralAI_finetuned_Ads", check_pr=True)

# Upload the tokenizer files to the same repository.
tokenizer.push_to_hub("0sparsh2/MistralAI_finetuned_Ads",check_pr=True)
