In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
from tqdm import tqdm

# Single source of truth for the checkpoint so the model and tokenizer can
# never be loaded from two different repositories by accident.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="auto",
    dtype=torch.bfloat16,
    # NOTE: requires the optional `flash-attn` package to be installed.
    attn_implementation="flash_attention_2"
)

# A decoder-only model continues from the LAST position of each row, so in a
# padded batch the pad tokens must sit on the left; otherwise generation for
# the shorter prompts starts after a run of padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]Cancellation requested; stopping current tasks.
Fetching 4 files:   0%|          | 0/4 [00:40<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Labels in the order they are searched for in the model's reply.
_STANCE_LABELS = ("prorussian", "proukrainian", "neutral")

# Prompt sent as the user turn. "\u2013" is an en dash, written as an escape so
# it cannot be mis-decoded into mojibake when the notebook is re-saved.
_STANCE_PROMPT = """You are an expert classifier of political stance in English comments about the Ukraine\u2013Russia war.

Your task:
Classify the stance of THIS comment and answer with EXACTLY one word:
- prorussian
- proukrainian
- neutral

Output rules:
- Output MUST be exactly one of: prorussian, proukrainian, neutral.
- Do NOT add any other words, punctuation, quotes, or explanations.
- Answer with ONLY ONE WORD.

Examples:
Comment: "Ruzzia is pure evil! Ukraine has to win!!!"
Classification: proukrainian

Comment: "Slava Ukraini!"
Classification: proukrainian

Comment: "Russia isn't the bad guy, the West lies about everything"
Classification: prorussian

Comment: "This war is horrible for both Russians and Ukrainians"
Classification: neutral

Comment: "Ukraine is Nazi, Russia is liberating"
Classification: prorussian

Comment: "Putler must be stopped"
Classification: proukrainian

NOW CLASSIFY THIS COMMENT:

Comment: "{comment}"

Classification:
"""


def _clip_comment(comment, max_tokens):
    """Return `comment` as text, cut to at most `max_tokens` tokenizer tokens."""
    if isinstance(comment, str):
        text = comment
    else:
        # Missing values (None / NaN / pd.NA) become an empty comment instead
        # of the literal text "nan".
        text = "" if pd.isna(comment) else str(comment)

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if len(token_ids) <= max_tokens:
        return text
    return tokenizer.decode(token_ids[:max_tokens])


def _parse_stance(generated_text):
    """Map raw model output to one of the stance labels ('neutral' if none matches)."""
    # Keep letters only so "Pro-Russian." or "pro russian" still match.
    letters_only = "".join(ch for ch in generated_text.lower() if ch.isalpha())
    for label in _STANCE_LABELS:
        if label in letters_only:
            return label
    return "neutral"


def classify_batch(comments_batch, batch_size=4, max_comment_tokens=200):
    """Classify the stance of each comment as prorussian / proukrainian / neutral.

    Uses the module-level `tokenizer` and `model`. Comments are processed in
    mini-batches of `batch_size` prompts with greedy decoding.

    Args:
        comments_batch: Sequence of comments (strings; missing values are
            treated as empty text).
        batch_size: Number of prompts sent to `model.generate` at once.
        max_comment_tokens: Each comment is cut to this many tokens BEFORE it
            is placed in the prompt. Truncating the finished prompt instead
            would drop its tail (the comment's closing quote, "Classification:"
            and the assistant generation marker) for long comments.

    Returns:
        A list of labels, one per input comment, in input order. Replies that
        contain none of the labels fall back to 'neutral'.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(comments_batch) == 0:
        return []

    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    predictions = []
    for start in range(0, len(comments_batch), batch_size):
        batch = comments_batch[start:start + batch_size]

        prompts = []
        for comment in batch:
            prompt_text = _STANCE_PROMPT.format(
                comment=_clip_comment(comment, max_comment_tokens)
            )
            messages = [{"role": "user", "content": prompt_text}]
            prompts.append(tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            ))

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            # Decoder-only generation continues from the last position, so the
            # padding has to be on the left for every row of the batch.
            padding_side="left",
            # The chat template already inserted the special tokens.
            add_special_tokens=False,
        ).to(model.device)

        with torch.no_grad():
            # Greedy decoding; no `temperature` because it is ignored (and
            # warned about) when do_sample=False.
            outputs = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=pad_token_id
            )

        # All rows share the same padded prompt length, so the newly generated
        # tokens start at the same column for the whole batch.
        prompt_len = inputs["input_ids"].shape[1]
        completions = tokenizer.batch_decode(
            outputs[:, prompt_len:], skip_special_tokens=True
        )
        predictions.extend(_parse_stance(text) for text in completions)

    return predictions


In [None]:
df = pd.read_csv('data/youtube_comments.csv')

chunk_size = 1000
max_len = 100_000

# Label at most `max_len` rows. Clamping to len(df) avoids looping over empty
# chunks for small files; slicing the frame avoids the length-mismatch error
# that assigning the labels would raise for files longer than `max_len`.
n_to_label = min(len(df), max_len)
df = df.iloc[:n_to_label].copy()
print(f"Processing {n_to_label} comments...")

# Missing comments become empty strings so the classifier always gets text.
comments = df['comment'].fillna("").astype(str).tolist()

all_predictions = []
for i in tqdm(range(0, n_to_label, chunk_size)):
    chunk = comments[i:i+chunk_size]
    predictions = classify_batch(chunk, batch_size=4)
    all_predictions.extend(predictions)

# One label per row, otherwise the column assignment would be misaligned.
assert len(all_predictions) == len(df), "prediction count does not match row count"

df['label'] = all_predictions
df.to_csv('data/labeled_comments_for_distilbert.csv', index=False)

print("\nLabel distribution:")
print(df['label'].value_counts())