In [None]:
!pip install -q --upgrade transformers

In [None]:
import os

from huggingface_hub import login

# Never hardcode an access token in the notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset or empty, `None` is passed
# and `login` asks for the token interactively instead.
login(token=os.environ.get('HF_TOKEN') or None)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, recall_score
from tqdm import tqdm

# Import the CSV datasets

In [None]:
# Load the three splits (train, test, validation) from CSV files in the
# current working directory, in that order.
train_df, test_df, val_df = (
  pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv', 'val.csv')
)

# Prompt

In [None]:
# Distinct labels found in the training split; they are listed verbatim in the prompt.
categories = train_df['label'].unique().tolist()
# Kept so the name stays defined; it is no longer interpolated into the prompt.
text = ''
instruction = 'You are Qwen an advanced model specializing classify text.'
# `{categories}` is filled in immediately by the f-string. `{{text}}` is escaped
# so that a literal `{text}` placeholder survives and can be filled per sample
# with `prompt.format(text=...)`. Without the escaping, the empty `text` above
# was baked into the prompt and the abstract never reached the model.
prompt = f"""
          You are an expert text classifier.
          Classify the following academic abstract into **exactly one** of the following categories.
          Your answer must be **only one of the following labels**, spelled **exactly as shown** — no explanations, no extra words, and no made-up categories.
          Categories: {categories}
          If the text fits into more than one, choose the most relevant one.
          If the text does not fit exactly, pick the **closest matching** category from the list.
          Do not invent new labels. Do not return anything outside the list.
          inputtext: {{text}}
          """

In [None]:
def prompt_template(prompt, text):
  """Build the chat messages for a single classification request.

  Args:
    prompt: A `str.format` template; it is rendered with `text` passed as the
      `text` keyword argument.
    text: The input text to classify.

  Returns:
    A two-element list of role/content dicts: the system turn (taken from the
    module-level `instruction`) followed by the user turn holding the
    rendered prompt.
  """
  user_content = prompt.format(text=text)
  system_turn = {"role": "system", "content": instruction}
  user_turn = {"role": "user", "content": user_content}
  return [system_turn, user_turn]

# Create dataset and dataloader

In [None]:
class ClassificationDataset(Dataset):
  """Dataset that turns each row of a frame into a chat-formatted prompt.

  Each item is a `(prompt_string, label)` pair, where the prompt string is
  produced by the tokenizer's chat template from the row's `text` column.
  """

  def __init__(self, dataset, tokenizer, prompt):
    """Store the source frame, the tokenizer and the prompt template."""
    self.dataset, self.tokenizer, self.prompt = dataset, tokenizer, prompt

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.dataset)

  def __getitem__(self, index):
    """Return the rendered chat prompt and the label for row `index`."""
    row = self.dataset.iloc[index]
    sample_text, sample_label = row['text'], row['label']
    chat_messages = prompt_template(self.prompt, sample_text)
    # tokenize=False keeps the result as a string; tokenization happens later, per batch.
    rendered_prompt = self.tokenizer.apply_chat_template(
      chat_messages,
      tokenize=False,
      add_generation_prompt=True,
    )
    return rendered_prompt, sample_label

# Load model and tokenizer

In [None]:
# Checkpoint id used for both the model and its tokenizer; the commented-out
# line below is an alternative checkpoint id.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  device_map="balanced",
  trust_remote_code=True,
)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
# Batched generation with a decoder-only model continues from the last
# position of every row, so prompts must be padded on the left. Set it
# explicitly instead of depending on the tokenizer's default.
tokenizer.padding_side = 'left'
model.config.pad_token_id = tokenizer.pad_token_id
# NOTE(review): the two settings below look like leftovers from a fine-tuning
# setup — confirm they are needed for inference.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Batch classifier function

In [None]:
def batch_classifier(input_text, tokenizer, model, max_new_tokens=256, **generate_kwargs):
  """Generate one completion per prompt and return only the newly generated text.

  Args:
    input_text: A single prompt string or a sequence of prompt strings
      (for example one DataLoader batch).
    tokenizer: Tokenizer used to encode the prompts and decode the outputs.
    model: Causal language model exposing `.device` and `.generate(...)`.
    max_new_tokens: Upper bound on the number of tokens generated per prompt.
    **generate_kwargs: Extra keyword arguments forwarded to `model.generate`
      (for example `do_sample=False` for deterministic predictions).

  Returns:
    A list with one decoded completion per prompt, in input order, with
    surrounding whitespace stripped so the text can be compared to labels.
  """
  # A bare string is one prompt; anything else is materialised as a list.
  prompts = [input_text] if isinstance(input_text, str) else list(input_text)
  if not prompts:
    return []

  encoded_data = tokenizer(prompts, return_tensors='pt', truncation=True, padding=True).to(model.device)
  generated_ids = model.generate(**encoded_data, max_new_tokens=max_new_tokens, **generate_kwargs)

  # All rows are padded to the same length, so the prompt occupies the same
  # leading columns in every output row; drop them to keep only new tokens.
  prompt_length = encoded_data.input_ids.shape[1]
  completion_ids = generated_ids[:, prompt_length:]

  decoded_data = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
  return [completion.strip() for completion in decoded_data]


In [None]:
result = []
data_set = ClassificationDataset(train_df, tokenizer, prompt)
# shuffle must stay off: the predictions are assigned back to `train_df` by
# position afterwards, so they have to come out in the frame's row order.
# With shuffle=True every prediction would be paired with a random row's label.
data_loader = DataLoader(dataset=data_set, batch_size=30, shuffle=False)
for input_text, label in tqdm(data_loader):
  result.extend(batch_classifier(input_text, tokenizer, model))

In [None]:
# Store the raw model outputs next to the labels. The list is assigned by
# position, so `result` must hold one entry per row of `train_df`, in row order.
train_df['result'] = result
train_df

# Save result as csv file

In [None]:
# Persist the frame (including the `result` column) without the index column.
output_path = 'result_zero_shot_classification_batch.csv'
train_df.to_csv(output_path, index=False)

# Analyze performance

In [None]:
def get_performance_metrics(dataset):
  """Print F1, recall and accuracy for the predictions stored in `dataset`.

  Args:
    dataset: Object exposing a `label` attribute (ground truth) and a
      `result` attribute (predictions), e.g. a DataFrame with those columns.

  Each score is rounded to four decimals and printed on its own line.
  """
  expected, predicted = dataset.label, dataset.result

  # (caption, scoring function, extra keyword arguments), in print order.
  report = (
    ("F1_Score(macro avg): ", f1_score, {'average': 'macro'}),
    ("F1_Score(weighted avg): ", f1_score, {'average': 'weighted'}),
    ("Recall Score(macro avg): ", recall_score, {'average': 'macro'}),
    ("Recall Score(weighted avg): ", recall_score, {'average': 'weighted'}),
    ("Accuracy Score: ", accuracy_score, {}),
  )
  for caption, scorer, options in report:
    print(caption, round(scorer(expected, predicted, **options), 4))

In [None]:
# Print the metrics for the predictions stored in `train_df`.
get_performance_metrics(train_df)

In [None]:
# Show the rows for which no prediction was stored.
train_df.loc[train_df['result'].isna()]

In [None]:
# Predicted values that are not among the ground-truth labels, i.e. outputs
# that did not follow the requested label set exactly.
true_labels = {*train_df['label'].unique()}
predicted_labels = {*train_df['result'].unique()}

predicted_labels.difference(true_labels)