In [None]:
!pip install -q --upgrade transformers

In [None]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable instead of
# writing it into the notebook. If the variable is unset or empty, token=None
# lets login() fall back to its interactive prompt.
login(token=os.environ.get('HF_TOKEN') or None)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, f1_score, recall_score

# Import CSV dataset

In [None]:
# Load the dataset; later cells read its 'text' and 'label' columns.
train_df = pd.read_csv(filepath_or_buffer='train.csv')

# Prompt

In [None]:
# Candidate labels: the distinct values of the 'label' column.
categories = train_df['label'].unique().tolist()
# A literal brace inside a label would be parsed as a field by str.format,
# so braces are doubled before the list is embedded in the template.
categories_text = str(categories).replace('{', '{{').replace('}', '}}')
text = ''
instruction = 'You are Qwen an advanced model specializing classify text.'
# `prompt` is a template. The categories are filled in right here, while the
# doubled braces around `text` leave a literal {text} field behind so that
# prompt.format(text=...) can insert each abstract later. (With a plain
# {text} inside the f-string the field was consumed immediately with the
# empty string and the later .format() call had nothing left to fill.)
prompt = f"""
          You are an expert text classifier.
          Classify the following academic abstract into **exactly one** of the following categories.
          Your answer must be **only one of the following labels**, spelled **exactly as shown** — no explanations, no extra words, and no made-up categories.
          Categories: {categories_text}
          If the text fits into more than one, choose the most relevant one.
          If the text does not fit exactly, pick the **closest matching** category from the list.
          Do not invent new labels. Do not return anything outside the list.
          inputtext: {{text}}
          """
# Example conversation built from the (empty) placeholder text.
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": prompt.format(text=text)}
]

# Load model and tokenizer

In [None]:
# Checkpoint used for zero-shot classification.
# Alternative kept for reference: model_name = "Qwen/Qwen2.5-0.5B"
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="balanced",
    trust_remote_code=True,
)

In [None]:
# Use the end-of-sequence token for padding, both as token and as id.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# NOTE(review): use_cache / pretraining_tp look like settings carried over
# from a fine-tuning setup; confirm they are still wanted for inference.
model.config.pretraining_tp = 1
model.config.use_cache = False
# Keep the model config's padding id in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

# Classifier function

In [None]:
def classifier(text, max_new_tokens=256):
  """Generate the model's reply to a chat conversation and return it as text.

  Relies on the notebook-level `tokenizer` and `model` objects.

  Args:
    text: Despite the name, this is the conversation handed straight to
      `tokenizer.apply_chat_template`; the notebook passes a list of
      {"role": ..., "content": ...} dicts.
    max_new_tokens: Upper bound on the number of generated tokens, forwarded
      to `model.generate`. Defaults to 256, the value previously hard-coded.

  Returns:
    The decoded reply with special tokens removed and surrounding whitespace
    stripped, so it can be compared with label strings by exact equality.
  """
  chat_prompt = tokenizer.apply_chat_template(text, tokenize=False, add_generation_prompt=True)
  encode_data = tokenizer(chat_prompt, return_tensors='pt').to(model.device)
  generated_ids = model.generate(**encode_data, max_new_tokens=max_new_tokens)
  # The generated sequence starts with the prompt tokens; only one sequence is
  # encoded, so drop the prompt-length prefix of the first (only) output row.
  prompt_length = len(encode_data.input_ids[0])
  reply_ids = generated_ids[0][prompt_length:]
  decoded_data = tokenizer.batch_decode([reply_ids], skip_special_tokens=True)[0]
  return decoded_data.strip()

In [None]:
# creating batch
# test = train_df.head(10)
# sentence = test['text'].tolist()
# result = []
# batch_size = 5
# for i in tqdm(range(0, len(test), batch_size)):
#   batch = sentence[i:i+batch_size]
#   for text in batch:
#     input = prompt.format(text=text)
#     messages = [
#     {"role": "system", "content": instruction},
#     {"role": "user", "content": input}
#     ]
#     result.extend(classifier(messages))

# test['result'] = result

# Run classifier function

In [None]:
# Classify every row of train_df['text'] and collect the raw model replies.
result = []
# Loop variable and prompt get their own names so the builtin `input` and the
# notebook-level `text` placeholder are not overwritten; tqdm shows progress
# for this long-running loop.
for abstract in tqdm(train_df['text'], desc='Classifying'):
    user_prompt = prompt.format(text=abstract)
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_prompt}
    ]
    result.append(classifier(messages))

# One reply per row, in row order.
train_df['result'] = result

In [None]:
# Display the frame, which now carries the 'result' column added by the classification cell.
train_df

# Save result as csv file

In [None]:
# Write the frame (without the index column) in CSV format.
# NOTE(review): the output name has no '.csv' extension; confirm that is intended.
train_df.to_csv(path_or_buf='result_zero_shot_classification', index=False)

# Analyze performance

In [None]:
def get_performance_metrics(dataset):
  """Print F1, recall and accuracy of `dataset.result` against `dataset.label`.

  F1 and recall are each reported with macro and weighted averaging. Every
  value is rounded to four decimals and printed on its own line; nothing is
  returned.
  """
  truth, predicted = dataset.label, dataset.result

  # (caption, metric function, extra keyword arguments), in print order.
  report = (
      ("F1_Score(macro avg): ", f1_score, {'average': 'macro'}),
      ("F1_Score(weighted avg): ", f1_score, {'average': 'weighted'}),
      ("Recall Score(macro avg): ", recall_score, {'average': 'macro'}),
      ("Recall Score(weighted avg): ", recall_score, {'average': 'weighted'}),
      ("Accuracy Score: ", accuracy_score, {}),
  )
  for caption, metric, options in report:
    print(caption, round(metric(truth, predicted, **options), 4))

In [None]:
# Score the predictions in train_df['result'] against train_df['label'].
get_performance_metrics(train_df)

In [None]:
# Rows for which no prediction was stored.
train_df.loc[train_df['result'].isna()]

In [None]:
# Distinct predicted values versus distinct ground-truth labels.
predicted_labels = set(train_df['result'].unique())
true_labels = set(train_df['label'].unique())

# Predictions that are not among the dataset's labels.
predicted_labels.difference(true_labels)