In [None]:
!pip install -q --upgrade transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m7.1/10.5 MB[0m [31m215.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m10.5/10.5 MB[0m [31m222.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m128.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import login
login('')

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, recall_score
from tqdm import tqdm

# import csv dataset

In [9]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
val_df = pd.read_csv('val.csv')

# Prompt

In [10]:
categories = train_df['label'].unique().tolist()
text = ''
instruction = 'You are Qwen an advanced model specializing classify text.'
prompt = f"""
          You are an expert text classifier.
          Classify the following academic abstract into **exactly one** of the following categories.
          Your answer must be **only one of the following labels**, spelled **exactly as shown** — no explanations, no extra words, and no made-up categories.
          Categories: {categories}
          If the text fits into more than one, choose the most relevant one.
          If the text does not fit exactly, pick the **closest matching** category from the list.
          Do not invent new labels. Do not return anything outside the list.
          inputtext: {text}
          """

In [3]:
def prompt_template(prompt, text):
  prompt = prompt.format(text=text)
  message = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": prompt}
  ]
  return message

# create dataset and dataloader

In [4]:
class ClassificationDataset(Dataset):
  def __init__(self, dataset, tokenizer, prompt):
    self.dataset = dataset
    self.tokenizer = tokenizer
    self.prompt = prompt

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, index):
    text = self.dataset.iloc[index]['text']
    label = self.dataset.iloc[index]['label']
    message = prompt_template(self.prompt, text)
    text_chat_format = self.tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)
    return text_chat_format, label

# Load model and tokenizer

In [5]:
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# model_name = "Qwen/Qwen2.5-0.5B"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="balanced")
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [6]:
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

# Classifier  batch function

In [7]:
def batch_classifier(input_text, tokenizer, model):
  encoded_data = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(model.device)
  generated_ids = model.generate(**encoded_data, max_new_tokens=256)
  generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(encoded_data.input_ids, generated_ids)]
  decoded_data = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  return decoded_data


In [12]:
result = list()
data_set = ClassificationDataset(train_df, tokenizer, prompt)
data_loader = DataLoader(dataset=data_set, batch_size=30, shuffle=True)
for batch in tqdm(data_loader):
  input_text, label = batch
  result.extend(batch_classifier(input_text, tokenizer, model))

100%|██████████| 45/45 [02:01<00:00,  2.70s/it]


In [13]:
train_df['result'] = result
train_df

Unnamed: 0,text,label,result
0,The coupling of laser light to matter can ex...,physics,unspecified
1,5G millimeter wave (mmWave) signals can be u...,electrical engineering and systems science,Physics
2,We developed an experiment to study differen...,astrophysics,science
3,We consider the most general set of integrab...,high energy physics theory,mathematics
4,X-ray photometry and optical spectra are pre...,astrophysics,"science, mathematics"
...,...,...,...
1345,Family systems form the basis of society and...,physics,unspecified
1346,Current air pollution monitoring systems are...,electrical engineering and systems science,science
1347,We develop a formalism for photoionization (...,physics,Physics
1348,Decision making needs to take an uncertain e...,mathematics,science


# Save result as csv file

In [14]:
train_df.to_csv('result_zero_shot_classification_batch.csv', index=False)

# Analyze performance

In [2]:
def get_performance_metrics(dataset):
  y_label = dataset.label
  y_pred = dataset.result

  print("F1_Score(macro avg): ", round(f1_score(y_label, y_pred, average='macro'), 4))
  print("F1_Score(weighted avg): ", round(f1_score(y_label, y_pred, average='weighted'), 4))
  print("Recall Score(macro avg): ",round(recall_score(y_label, y_pred, average='macro'), 4))
  print("Recall Score(weighted avg): ",round(recall_score(y_label, y_pred, average='weighted'), 4))
  print("Accuracy Score: ", round(accuracy_score(y_label, y_pred), 4))

In [4]:
get_performance_metrics(train_df)

F1_Score(macro avg):  0.003
F1_Score(weighted avg):  0.0471
Recall Score(macro avg):  0.002
Recall Score(weighted avg):  0.0311
Accuracy Score:  0.0311


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
train_df[train_df['result'].isnull()]

Unnamed: 0,text,label,result


In [6]:
true_labels = set(train_df['label'].unique())
predicted_labels = set(train_df['result'].unique())

predicted_labels - true_labels

{'Physics',
 'The abstract appears to discuss quantum physics.',
 'The abstract does not clearly fit any of the given categories.',
 'The abstract does not contain any information about physics, electrical engineering, or systems science. It is unrelated to these fields. Therefore, it cannot be classified in any of the provided categories.',
 'The abstract does not contain information that can be classified into any of the given categories.',
 'The abstract is about a paper on quantum mechanics.',
 'The abstract is about a study on quantum mechanics, which falls under the category of **physics**.',
 'The abstract is about quantum physics.',
 'The abstract is about theoretical physics, which falls under the category of physics.',
 'The given inputtext is already a single label, which doesn\'t match any of the provided categories directly. Therefore, I will assign it to the "mathematics" category.\n\nInputtext: The given inputtext is already a single label, which doesn\'t match any of th