# Fine-tuning Google's Gemma Model

In [None]:
# Install pinned versions of the quantization, PEFT, and training libraries
# used by the cells below (-q: quiet output, -U: upgrade an existing install).
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [None]:
import os
import transformers
import torch
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [None]:
# Read the Hugging Face access token from the Kaggle secret named "HF_TOKEN".
# It is passed as `token=` when the tokenizer and model are loaded below.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
# Checkpoint to fine-tune, and the bitsandbytes quantization settings that are
# passed as `quantization_config` when the model is loaded: 4-bit loading with
# the "nf4" quantization type and bfloat16 as the compute dtype.
model_id = "google/gemma-7b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the tokenizer and the quantized causal-LM for `model_id`, both
# authenticated with `hf_token`.
# device_map={"":0} maps the root module ("") to device index 0
# (presumably the first GPU — confirm on the target runtime).
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             token=hf_token)

In [None]:
# Turn off the Weights & Biases integration so that training does not try to
# log to wandb (or ask for wandb credentials). The integration is only
# disabled for truthy values such as "true"; the previous value "false" left
# it enabled, which made this line a no-op.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# LoRA adapter settings; handed to SFTTrainer through `peft_config` below.
lora_config = LoraConfig(
    # Rank of the low-rank update matrices.
    r = 8,
    # Names of the modules that receive adapters.
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM",
)

In [None]:
import pandas as pd
import re
import string 
# Load the MBTI CSV from the Kaggle input directory. Later cells rely on its
# 'type' column (the label) and its 'posts' column (the text).
df = pd.read_csv('/kaggle/input/mbti-type/mbti_1.csv')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test split; the fixed seed keeps the split
# reproducible. Features are every column except 'type', which is the label.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['type']),
    df['type'],
    test_size=0.2,
    random_state=42,
)

In [None]:
import re
import re
from tqdm import tqdm

def preprocess_text(text):
    """Normalize one raw text string.

    Steps, applied in this order:
      1. remove URLs ("http" followed by any run of non-whitespace);
      2. remove HTML-style tags ("<...>", shortest match);
      3. delete every character that is not an ASCII letter, a digit,
         a full stop, or whitespace;
      4. lowercase the result;
      5. collapse each whitespace run to one space and trim both ends.

    Step 3 deletes symbols instead of replacing them with a space, so
    words that were separated only by punctuation end up joined.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (may be empty).
    """
    # Raw strings keep \S and \s as regex escapes; in a plain string literal
    # they are invalid string escapes (a SyntaxWarning on Python 3.12+).
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9.\s]', '', text)
    text = text.lower()
    return re.sub(r'\s+', ' ', text).strip()

# Register tqdm's progress_* methods on pandas objects, then run
# preprocess_text over every cell of the train and test feature frames
# (train first, then test), showing a progress bar for each.
tqdm.pandas()
X_train_prep, X_test_prep = (
    frame.progress_applymap(preprocess_text) for frame in (X_train, X_test)
)

In [None]:
# Preview the first five cleaned training rows as a sanity check.
X_train_prep.head(5)

In [None]:
from datasets import Dataset

# Re-attach the labels to the *cleaned* features. The raw X_train / X_test
# were concatenated here before, which silently discarded the output of
# preprocess_text. The cleaned frames keep the original index, so pd.concat
# still lines each row up with its label.
train = pd.concat([X_train_prep, y_train], axis=1)
test = pd.concat([X_test_prep, y_test], axis=1)

# Convert both splits to Hugging Face datasets for tokenization and SFTTrainer.
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)


In [None]:
# Tokenize the 'posts' column of the training split, one batch at a time.
# NOTE(review): SFTTrainer below is also given a formatting_func; confirm
# whether the token columns produced here are actually consumed by it.
def _tokenize_posts(samples):
    return tokenizer(samples["posts"])


train = train.map(_tokenize_posts, batched=True)

In [None]:
# Hyperparameters for the fine-tuning run; passed to SFTTrainer as `args`.
training_args = transformers.TrainingArguments(
        # 1 sample per device per step, accumulated over 4 steps before each
        # optimizer update.
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        # Training stops after 100 optimizer steps.
        max_steps=100,
        learning_rate=2e-4,
        # NOTE(review): fp16 is enabled here while bnb_config sets
        # bnb_4bit_compute_dtype=torch.bfloat16 — confirm the mix is intended.
        fp16=True,
        # Log at every step.
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    )

In [None]:
def formatting_func(example):
    """Render dataset rows into training prompts.

    Each row becomes "Text: <posts>", a newline, then
    " Personality: <type>".

    The function accepts a batch (a dict whose 'posts' and 'type' values are
    equal-length lists) and formats every row in it. Previously only element
    0 of each batch was formatted, so all other rows were silently dropped.
    A single un-batched row (plain strings) is also accepted.

    Args:
        example: Mapping with 'posts' and 'type' entries, either lists
            (a batch) or plain strings (one row).

    Returns:
        A list with one formatted string per input row.
    """
    posts, types = example['posts'], example['type']
    # Indexing a plain string would yield single characters, so wrap a lone
    # row into a one-element batch first.
    if isinstance(posts, str):
        posts, types = [posts], [types]
    return [
        f"Text: {post}\n Personality: {label}"
        for post, label in zip(posts, types)
    ]

In [None]:
# Supervised fine-tuning trainer: applies the LoRA settings from `lora_config`
# to `model` and renders the dataset rows through `formatting_func`.
# NOTE(review): training_args sets no evaluation schedule — confirm that
# eval_dataset is actually evaluated during the run.
trainer = SFTTrainer(
    model=model,
    # Reuse the tokenizer that was already loaded with hf_token. When this is
    # omitted, the trainer instantiates its own tokenizer from the model name.
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset = test,
    args=training_args,
    peft_config=lora_config,
    formatting_func=formatting_func,
)

In [None]:
# Run the fine-tuning loop configured above (max_steps=100 in training_args).
trainer.train()

In [None]:
# Hand-written sample used as the prompt for the generation check below.
# It starts with the same "Text: " prefix that formatting_func puts in front
# of every training example. The line breaks and indentation inside the
# triple-quoted literal are part of the string.
text = '''Text: I strongly disagree with that I am the life of the party and strongly disagree with
          that I sympathize with others' feelings and strongly disagree with that I get chores done
          right away and agree with that I have frequent mood swings and strongly disagree with that
          I have a vivid imagination and agree with that I do not talk a lot and disagree with that
          I am not interested in other people's problems and disagree with that I often forget to 
          put things back in their proper place and agree with that I am relaxed most of the time 
          and strongly agree with that I am not interested in abstract ideas and strongly agree 
          with that I talk to a lot of different people at parties and disagree with that I feel
          others' emotions and strongly agree with that I like order and agree with that I get 
          upset easily and neutral with that  I have difficulty understanding abstract ideas and 
          strongly agree with that I keep in the background and strongly agree with that I am not 
          really interested in others and strongly agree with that I make a mess of things and 
          strongly agree with that I seldom feel blue and disagree with that I am not have a good 
          imagination'''
device = "cuda:0"

# Training examples are rendered by formatting_func as
# "Text: <posts>\n Personality: <type>". End the prompt with the same
# "Personality:" cue so that generation continues with a type rather than
# with more of the description.
prompt = f"{text}\n Personality:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate at most 20 new tokens and print the decoded sequence (prompt plus
# continuation) without special tokens.
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))