In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from IPython.display import clear_output
!pip install peft==0.8.2
!pip install bitsandbytes==0.42.0
!pip install accelerate==0.34.2
!pip install datasets==2.16.1
!pip install GPUtil
!pip install transformers==4.43.1
clear_output()

Collecting peft==0.8.2
  Downloading peft-0.8.2-py3-none-any.whl.metadata (25 kB)
^C
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting bitsandbytes==0.42.0
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h

In [None]:
import warnings
import os
from transformers import set_seed

# --- Reproducibility and runtime noise ---------------------------------
SEED = 123
set_seed(SEED)

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')

# --- Locations ------------------------------------------------------------
INPUT_DIR = '/kaggle/input/nlp-getting-started/'
DIR = '/kaggle/working/'
MODEL_ID = '/kaggle/input/llama-3.2/transformers/1b/1'

# --- Problem / hardware sizing --------------------------------------------
NUM_CLASSES = 2
NUM_WORKERS = os.cpu_count()

# --- Training and LoRA hyper-parameters -------------------------------------
EPOCHS = 5
BATCH_SIZE = 8
R = 64
LORA_ALPHA = 32
LORA_DROPOUT = 0.1

In [None]:
from datasets import load_dataset, load

# Fraction of the labelled rows held out for evaluation.
EVAL_FRACTION = 0.2

raw_dataset = load_dataset(
    'csv', data_files=f'{INPUT_DIR}train.csv',
)

# Hold out a real evaluation split. Re-using the training rows as the "test"
# split would make every later evaluation (baseline, per-epoch metrics,
# best-checkpoint selection) measure performance on data the model trained on.
dataset = raw_dataset['train'].train_test_split(
    test_size=EVAL_FRACTION, seed=SEED,
)

# Keep only the text and its label; the label column is renamed from
# "target" to "label".
dataset = dataset.remove_columns(['id', 'keyword', 'location'])
dataset = dataset.rename_column("target", "label")

print(dataset, dataset.keys())
dataset["train"][0], dataset['test'][0], dataset['train'][:5]

In [None]:
from collections import Counter

# Split sizes.
train_len = len(dataset['train'])
test_len = len(dataset['test'])

# Label frequencies per split.
train_dataset_label_counts = Counter(dataset['train']['label'])
test_dataset_label_counts = Counter(dataset['test']['label'])

print(f"Train dataset: {train_len} samples, {train_dataset_label_counts}")
print(f"Test dataset: {test_len} samples, {test_dataset_label_counts}")

# most_common(1) yields a one-element list of (label, count); the baseline is
# the share of the test split taken by that most frequent label.
test_majority_class = test_dataset_label_counts.most_common(1)[0]
majority_label, majority_count = test_majority_class

baseline_accuracy = majority_count / test_len

print(f"Baseline accuracy: {baseline_accuracy:.2%}")

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Show the padding side and pad token as loaded ...
print(f"{tokenizer.padding_side} {tokenizer.pad_token}")

# ... then use the end-of-sequence token as the pad token and show it again.
tokenizer.pad_token = tokenizer.eos_token
print(f"{tokenizer.padding_side} {tokenizer.pad_token}")

In [None]:
def _tokenize_batch(batch):
    """Tokenize the "text" field of a batch with truncation enabled."""
    return tokenizer(batch["text"], truncation=True)


# One tokenized dataset per split, keyed by the split name.
tokenized_dataset = {
    split: dataset[split].map(_tokenize_batch, batched=True)
    for split in dataset.keys()
}

tokenized_dataset["train"], tokenized_dataset["test"]

In [None]:
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

# Load the classifier in 8-bit. The quantization settings are passed through
# `quantization_config`; the bare `load_in_8bit=True` keyword is deprecated.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=NUM_CLASSES,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
print(model.config.pad_token_id)
# Take the pad id from the tokenizer (whose pad token was set to its EOS
# token) so model and tokenizer always agree, and so the value is a single
# integer even for checkpoints whose config lists several EOS ids.
model.config.pad_token_id = tokenizer.pad_token_id
print(model.config.pad_token_id)

In [None]:
# Print the loaded model's repr for inspection.
print(model)

In [None]:
from peft import prepare_model_for_kbit_training

# `prepare_model_for_int8_training` is a deprecated alias that later peft
# releases remove; `prepare_model_for_kbit_training` is its replacement.
model = prepare_model_for_kbit_training(model)

# Trailing expression: display the prepared model.
model

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA settings for a sequence-classification task; rank, alpha and dropout
# come from the notebook-level constants R, LORA_ALPHA and LORA_DROPOUT.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules='all-linear',
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
)

# Trailing expression: display the resulting configuration.
lora_config

In [None]:
# Wrap the prepared model with the LoRA configuration; the trailing
# expression displays the wrapped model.
lora_model = get_peft_model(model, lora_config)
lora_model

In [None]:
# Report the trainable-parameter summary of the LoRA-wrapped model.
lora_model.print_trainable_parameters()

In [None]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A two-item sequence ``(predictions, labels)``. The
            predictions are reduced with ``argmax`` over axis 1, so they need
            at least two axes with the per-class scores on axis 1.

    Returns:
        A dict with the single key ``"accuracy"``: the mean of the
        element-wise comparison between predicted classes and labels.
    """
    scores, references = eval_pred
    hits = np.argmax(scores, axis=1) == references
    return {"accuracy": hits.mean()}

# Trainer for the LoRA-wrapped classifier: evaluates and checkpoints once per
# epoch and reloads the best checkpoint when training finishes.
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        # `eval_strategy` replaces the deprecated `evaluation_strategy` name.
        eval_strategy="epoch",
        # Must match the evaluation schedule for load_best_model_at_end.
        save_strategy="epoch",
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_steps=10,
        report_to="none",
        # Pass the notebook seed explicitly; otherwise the Trainer re-seeds
        # with its own default and the SEED constant has no effect on training.
        seed=SEED,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    # Pads each batch with the tokenizer passed here.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate before any training step has run; the call's return value is the
# cell output.
print("Evaluating the Model Before Training!")
trainer.evaluate()

In [None]:
# Run training with the Trainer configured earlier in the notebook.
print("Training the Model")
trainer.train()

In [None]:
# Evaluate again after training; the call's return value is the cell output.
print("Evaluating the trained model")
trainer.evaluate()

In [None]:
# Persist the LoRA-wrapped model under the relative path 'fine-tuned-model'.
print("Saving the model!")
lora_model.save_pretrained('fine-tuned-model')

In [None]:
from transformers import pipeline

# Reuse the tokenizer object configured earlier (pad token already set)
# instead of having the pipeline load a second, unconfigured tokenizer from
# the MODEL_ID path.
clf = pipeline("text-classification", model=lora_model, tokenizer=tokenizer)

In [None]:
import pandas as pd

# Build both paths from INPUT_DIR so the data location is defined in one
# place (the notebook's configuration cell).
test_df = pd.read_csv(f"{INPUT_DIR}test.csv")
sample_submission = pd.read_csv(f"{INPUT_DIR}sample_submission.csv")

display(test_df.head())
display(sample_submission.head())

In [None]:
from tqdm import tqdm

print("Making prediction on test dataset...")

# Classify one text at a time. For each result, take the first entry's label,
# keep the part after the first underscore and cast it to int.
predictions = [
    int(clf(tweet)[0]['label'].split('_')[1])
    for tweet in tqdm(test_df['text'].values)
]

In [None]:
# Fill the submission template's target column with the predicted classes.
sample_submission['target'] = predictions

# Write the submission without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)