In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.utils import resample
import torch
import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
torch.cuda.empty_cache()

In [4]:
pd_power = pd.read_csv('drive/MyDrive/nlp-the2/power-tr-train.tsv', sep='\t')
train_data, test_data = train_test_split(
    pd_power,
    test_size=0.1,  # 10% for test
    random_state=42,  # Set seed for reproducibility
    stratify=pd_power['label']  # Ensure stratified split
)

In [5]:
train_data, test_data = train_test_split(
    pd_power,
    test_size=0.15,  # 15% for test
    random_state=42,  # Set seed for reproducibility
    stratify=pd_power['label']  # Ensure stratified split
)

# Separate features and labels for training and testing
X_test = test_data['text_en'].tolist()
y_test = test_data['label'].tolist()

In [6]:
test_data.head()

Unnamed: 0,id,speaker,sex,text,text_en,label
1924,tr20070,fb401143250252b5f33e65b4362996ca,M,"Sayın Başkan, değerli milletvekilleri; Türkiye...","Mr. President, I have made a pledge on behalf ...",0
14654,tr32800,2efb1e2a1d31942ce96ed6dfbae8b618,M,"Değerli milletvekilleri, Genel Kurulu, ekranla...","Thank you, Mr. President. <p> Dear MPs, the Ge...",1
5891,tr24037,6e003ae7de4071e29316da2202b969fd,M,Teşekkür ediyorum Sayın Başkan. <p> Değerli mi...,"Thank you, Mr. President. <p> Dear MPs, I have...",1
9966,tr28112,3111816494ef12983fb44a2b78d79fae,M,"Ayrıca, bakın, tutuklu gazetecilerden 5 tanesi...","Also, look, five of the arrested journalists h...",0
15607,tr33753,c741f3456e5f4bd5bda6381fbc0730e2,M,Öncelikle şunu belirtmek istiyorum: Geride kal...,"Mr. President, I have made a promise to addres...",1


In [7]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [8]:
def generate_predictions_batched(speeches, tokenizer, model, batch_size=16):
    predictions = []

    for i in tqdm(range(0, len(speeches), batch_size)):
        batch = speeches[i:i + batch_size]
        prompts = [f"""Classify the following parliamentary speech is belong to the governing or opposition party as either:
        0: governing
        1: opposition

        Speech: "{speech}"
        Answer with just the number 0 or 1.""" for speech in batch]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )

        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1,  # Limit response length
                pad_token_id=tokenizer.eos_token_id,
                temperature=0.1,   # Reduce randomness
                do_sample=False
            )

        # Process batch predictions
        for output in outputs:
            pred_text = tokenizer.decode(output, skip_special_tokens=True)
            try:
                pred_label = int(''.join(filter(str.isdigit, pred_text.split()[-1])))
                if pred_label not in [0, 1]:
                    pred_label = None
            except (ValueError, IndexError):
                pred_label = None
            predictions.append(pred_label)

        torch.cuda.empty_cache()

    return predictions

In [9]:
y_pred = generate_predictions_batched(X_test, tokenizer, model)

100%|██████████| 163/163 [01:47<00:00,  1.51it/s]


In [11]:
# Remove any None predictions for evaluation
valid_indices = [i for i, pred in enumerate(y_pred) if pred is not None]
valid_predictions = [y_pred[i] for i in valid_indices]
valid_y_test = [y_test.iloc[i] if hasattr(y_test, 'iloc') else y_test[i] for i in valid_indices]

# Evaluate
if len(valid_predictions) > 0:
    print("\nModel Performance:")
    print(f"Accuracy: {accuracy_score(valid_y_test, valid_predictions):.3f}")
    print("\nClassification Report:")
    print(classification_report(valid_y_test, valid_predictions,
                              target_names=['left-leaning', 'right-leaning']))

    print(f"\nPrediction Summary:")
    print(f"Total test samples: {len(y_test)}")
    print(f"Valid predictions: {len(valid_predictions)}")
    print(f"Invalid/None predictions: {y_pred.count(None)}")
else:
    print("No valid predictions were generated.")


Model Performance:
Accuracy: 0.492

Classification Report:
               precision    recall  f1-score   support

 left-leaning       0.33      0.00      0.00       583
right-leaning       0.49      1.00      0.66       567

     accuracy                           0.49      1150
    macro avg       0.41      0.50      0.33      1150
 weighted avg       0.41      0.49      0.33      1150


Prediction Summary:
Total test samples: 2608
Valid predictions: 1150
Invalid/None predictions: 1458


In [12]:
X_test_turkish = test_data['text'].tolist()
y_test = test_data['label'].tolist()

In [13]:
def generate_predictions_batched_turkish(speeches, tokenizer, model, batch_size=16):
    predictions = []

    for i in tqdm(range(0, len(speeches), batch_size)):
        batch = speeches[i:i + batch_size]
        prompts = [f"""
        Aşağıdaki parlamento konuşmasını şu şekilde sınıflandırın:
        0: hükümet
        1: muhalefet

        Konuşma: "{speech}"
        Sadece 0 veya 1 olarak cevap verin.
        """ for speech in batch]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )

        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1,  # Limit response length
                pad_token_id=tokenizer.eos_token_id,
                temperature=0.1,   # Reduce randomness
                do_sample=False
            )

        # Process batch predictions
        for output in outputs:
            pred_text = tokenizer.decode(output, skip_special_tokens=True)
            try:
                pred_label = int(''.join(filter(str.isdigit, pred_text.split()[-1])))
                if pred_label not in [0, 1]:
                    pred_label = None
            except (ValueError, IndexError):
                pred_label = None
            predictions.append(pred_label)

        torch.cuda.empty_cache()

    return predictions

In [14]:
y_pred = generate_predictions_batched_turkish(X_test_turkish, tokenizer, model)

100%|██████████| 163/163 [01:46<00:00,  1.53it/s]


In [16]:
# Remove any None predictions for evaluation
valid_indices = [i for i, pred in enumerate(y_pred) if pred is not None]
valid_predictions = [y_pred[i] for i in valid_indices]
valid_y_test = [y_test.iloc[i] if hasattr(y_test, 'iloc') else y_test[i] for i in valid_indices]

# Evaluate
if len(valid_predictions) > 0:
    print("\nModel Performance:")
    print(f"Accuracy: {accuracy_score(valid_y_test, valid_predictions):.3f}")
    print("\nClassification Report:")
    print(classification_report(valid_y_test, valid_predictions,
                              target_names=['left-leaning', 'right-leaning']))

    print(f"\nPrediction Summary:")
    print(f"Total test samples: {len(y_test)}")
    print(f"Valid predictions: {len(valid_predictions)}")


Model Performance:
Accuracy: 0.476

Classification Report:
               precision    recall  f1-score   support

 left-leaning       0.48      0.98      0.64       272
right-leaning       0.50      0.02      0.03       299

     accuracy                           0.48       571
    macro avg       0.49      0.50      0.34       571
 weighted avg       0.49      0.48      0.32       571


Prediction Summary:
Total test samples: 2608
Valid predictions: 571
