Make necessary installs

In [1]:
!pip install transformers peft datasets accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
import pandas as pd


In [6]:
# Load 20 Newsgroups dataset and filter specific categories
dataset = load_dataset("TopicNet/20-Newsgroups")

# Select distinct categories
selected_categories = ['sci.space', 'rec.autos', 'comp.graphics', 'talk.politics.mideast']
dataset = dataset.filter(lambda example: example['topic'] in selected_categories)

# Map categories to numerical labels
label_mapping = {label: i for i, label in enumerate(selected_categories)}
dataset = dataset.map(lambda example: {'label': label_mapping[example['topic']]})
dataset = dataset.rename_column('text', 'sentence')

# Split into train and test sets
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Show a sample
print(train_dataset[0])

{'Unnamed: 0': 1,
 'raw_text': "A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.",
 'filenames': '/home/egorov/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861',
 'target': 4,
 'id': 'comp_sys_mac_hardware_51861',
 'tokenized': "[('fair', 'JJ'), ('number', 'NN'), ('of', 'IN'), ('brave', 'JJ'), ('souls', 'NNS'), ('who', 'WP'), ('upgraded', 'VBD'), ('their', 'PRP$'), ('si', 'NN'), ('clock', 'NN'), ('oscillator', 'NN'), ('have', 'VBP'), ('shared', 'VBN'), ('their', 'PRP$'), ('exp

In [None]:
# Load tokenizer for LLaMA model
MODEL_NAME = "meta-llama/Meta-Llama-3-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token if not available

def tokenize_function(examples):
    return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=128)

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

In [None]:
# Load base model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(selected_categories),
    device_map="auto"
)

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # Apply LoRA to query and value layers
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True
)

In [None]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=None  # Add a custom metric function later if desired
)


In [None]:
# Train the model
trainer.train()

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# Save the model and tokenizer
model.save_pretrained("./fine-tuned-llama3-1B-topic-classification")
tokenizer.save_pretrained("./fine-tuned-llama3-1B-topic-classification")