In [6]:
import os
import csv
import json
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [3]:
from llama_distill_logits import TextClassificationDataset, create_collate_fn, load_datasets, evaluate

# MIT LLaMA

In [5]:
# 1. Load the *base* LLaMA model for sequence classification
base_model_name = "meta-llama/Llama-3.2-1B"
# NOTE(review): tokenizer_name is not referenced in this cell; kept in case
# another cell relies on it.
tokenizer_name = "tokenizer"
# Hugging Face access token from the environment; None when HF_TOKEN is unset.
access_token = os.environ.get("HF_TOKEN")

# Base checkpoint with a 3-way classification head; device_map="auto" lets
# the library decide where to place the weights.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    token=access_token,
    num_labels=3,
    device_map="auto"
)
# The model download above is authenticated with access_token, so the
# tokenizer from the same repository is fetched with it too instead of
# relying on a separately cached login.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=access_token)

# Set pad token: register "[PAD]", tell the model its id, and grow the
# embedding matrix so the new id has a row.
tokenizer.add_special_tokens({"pad_token":"[PAD]"})
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

# 2. Load the LoRA adapter on top of the base model
adapter_path = "best_model/best_model_r16_b4"
model = PeftModel.from_pretrained(model, adapter_path)

# 3. Create the pipeline with the specified model and tokenizer.
# transformers logs that the PEFT wrapper class is "not supported for
# text-classification" (see the cell output); the pipeline is still created.
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DiffLlamaForSequenceClassification', 'DistilBe

In [None]:
# Read the raw article CSV for each listed topic into `df`.
for topic in ["ai"]:
    csv_file = os.path.join("..", "data", f"{topic}_articles.csv")
    # Skip a topic whose CSV is absent instead of failing inside read_csv.
    if not os.path.exists(csv_file):
        print("Not found")
        continue

    # read_csv has to be *called*; the bare attribute would bind the function
    # object itself to df and never touch csv_file.
    df = pd.read_csv(csv_file, encoding="utf-8")

## Finetuned LLaMA

In [7]:
# 1. Load the *base* LLaMA model for sequence classification
base_model_name = "meta-llama/Llama-3.2-1B"
# NOTE(review): tokenizer_name is not referenced in this cell; kept in case
# another cell relies on it.
tokenizer_name = "tokenizer"
# Hugging Face access token from the environment; None when HF_TOKEN is unset.
access_token = os.environ.get("HF_TOKEN")

# Base checkpoint with a 5-way classification head; device_map="auto" lets
# the library decide where to place the weights.
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    token=access_token,
    num_labels=5,
    device_map="auto"
)
# The model download above is authenticated with access_token, so the
# tokenizer from the same repository is fetched with it too instead of
# relying on a separately cached login.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=access_token)

# Set pad token: register "[PAD]", tell the model its id, and grow the
# embedding matrix so the new id has a row.
tokenizer.add_special_tokens({"pad_token":"[PAD]"})
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

# 2. Load the LoRA adapter on top of the base model
adapter_path = "best_model/best_model_logits_e6"
model = PeftModel.from_pretrained(model, adapter_path)

# 3. Create the pipeline with the specified model and tokenizer.
# transformers logs that the PEFT wrapper class is "not supported for
# text-classification" (see the cell output); the pipeline is still created.
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DiffLlamaForSequenceClassification', 'DistilBe

In [4]:
# Build the two dataloaders from the DeepSeek-logits JSON file, handing
# load_datasets the tokenizer created in the model-loading cell.
# NOTE(review): the meaning of the `None` second argument is defined by
# load_datasets in llama_distill_logits — confirm there.
train_dataloader, val_dataloader = load_datasets(
    "../data/topics_10k_deepseek_logits.json",
    None,
    tokenizer,
)

In [7]:
# Score the adapter-wrapped model on the validation dataloader.
# The model is placed with device_map="auto", so it may end up on the CPU;
# fall back to "cpu" rather than failing on a hard-coded "cuda".
eval_device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare last expression so the notebook displays evaluate()'s return value.
# NOTE(review): the displayed pair is presumably (loss, accuracy) — confirm
# against evaluate() in llama_distill_logits.
evaluate(model, val_dataloader, eval_device)

100%|██████████| 200/200 [02:43<00:00,  1.22it/s]


(0.8220893393084406, 0.724937343358396)

In [9]:
# Classify every article of each topic with the text-classification pipeline
# and write the input rows plus the predictions to
# ../data/<topic>_articles_llama_distill.csv.
for topic in tqdm(["ai", "climate-change", "israeli-palestinian-conflict"]):
    csv_file = os.path.join("..", "data", f"{topic}_articles_llama.csv")
    if not os.path.exists(csv_file):
        print("Not found")
        continue

    df = pd.read_csv(csv_file, encoding="utf-8")

    # A missing title or body is read as NaN, and NaN + str stays NaN (a
    # float), which would be handed to the pipeline instead of text. Treat
    # missing parts as empty strings so every row yields a real string.
    texts = (
        df["title"].fillna("").astype(str)
        + " "
        + df["article_text"].fillna("").astype(str)
    ).to_list()

    # Nothing to classify for an empty file; skip the model call.
    labels = pipeline(texts) if texts else []
    for label in labels:
        # Keep only the class id after the last underscore, so ids with more
        # than one digit survive (the old [-1] kept a single character).
        # NOTE(review): presumably labels look like "LABEL_<id>" — confirm.
        label["label"] = label["label"].rsplit("_", 1)[-1]

    # Explicit columns keep the output schema stable even when labels is empty.
    df_labels = pd.DataFrame(labels, columns=["label", "score"])
    # reset_index aligns the input rows positionally with the predictions.
    df_combine = pd.concat([df.reset_index(drop=True), df_labels], axis=1)
    # NOTE(review): "score" is appended under its generic name; if the input
    # CSV already carries a "score" column the output will have two — verify.
    df_combine = df_combine.rename(columns={"label": "llama_distill_bias"})

    output_file = os.path.join("..", "data", f"{topic}_articles_llama_distill.csv")
    df_combine.to_csv(output_file, encoding="utf-8", quoting=csv.QUOTE_NONNUMERIC, index=False)

100%|██████████| 3/3 [2:33:08<00:00, 3062.78s/it]  
