In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from transformers import Trainer
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, get_scheduler
from transformers import AdamW
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils import resample
from tqdm import tqdm
import wandb

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
pd_power = pd.read_csv('drive/MyDrive/nlp-the2/power-tr-train.tsv', sep='\t')

In [4]:
pd_power.head()

Unnamed: 0,id,speaker,sex,text,text_en,label
0,tr18146,ca2031caa4032c51980160359953d507,M,"Yeni yasama döneminin ülkemiz için, milletimiz...","Mr. President, dear lawmakers, I salute you, a...",0
1,tr18147,4cee0addb3c69f6866869b180f90d45f,M,"Sayın Başkan, değerli milletvekilleri; bugün, ...","Mr. President, members of lawmakers, as I spea...",0
2,tr18148,b3d7f76d74ec268492f8190ca123a6b2,M,"Sayın Başkanım, öncelikle yüce Meclisin Başkan...","Mr. President, I'm here to share with you the ...",0
3,tr18149,722efac7138c8197a9d1e97eed3a8b18,M,24’üncü Dönem Meclis Başkanlığına seçilmenizde...,"Mr. President, under the principles determined...",0
4,tr18150,fcc61122f3553c57ae207adeb1a1af84,M,Usul tartışmasında 2 kişi lehte 2 kişi aleyhte...,"Two in favour of two in the legal debate, Mr. ...",1


In [7]:
majority = pd_power[pd_power['label'] == 1]
minority = pd_power[pd_power['label'] == 0]
minority_oversampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=42
)
oversampled_data = pd.concat([majority, minority_oversampled])
pd_power = oversampled_data


In [8]:
X = pd_power['text_en']  # Using English translations
y = pd_power['label']    # Labels (0 for left, 1 for right)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    stratify=y    # This ensures proportional split of labels
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 16077
Test set size: 1787


In [13]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [14]:
max_length = 512

def tokenize_data(texts, tokenizer, max_length):
    return tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )


train_encodings = tokenize_data(X_train, tokenizer, max_length)
test_encodings = tokenize_data(X_test, tokenizer, max_length)

In [15]:
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)


class TurkishParliamentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: tensor[idx] for key, tensor in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Create datasets
train_dataset = TurkishParliamentDataset(train_encodings, train_labels)
test_dataset = TurkishParliamentDataset(test_encodings, test_labels)

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2  # Binary classification: 0 (left), 1 (right)
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision = precision_score(labels, preds, average="binary")
    recall = recall_score(labels, preds, average="binary")
    f1 = f1_score(labels, preds, average="binary")
    accuracy = accuracy_score(labels, preds)

    # Log metrics to WandB
    wandb.log({
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy
    })

    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at regular intervals
    eval_steps=400,               # Frequency of evaluation
    learning_rate=2e-5,           # Initial learning rate
    lr_scheduler_type="linear", # Gradually reduce learning rate
    per_device_train_batch_size=32,  # Increase batch size if GPU allows
    per_device_eval_batch_size=16,
    num_train_epochs=3,           # Reduce epochs to prevent overfitting
    weight_decay=0.025,            # Slightly increased weight decay
    logging_dir="./logs",
    logging_steps=50,             # More frequent logging
    save_strategy="epoch",       # Save model at each epoch
    save_total_limit=2,           # Limit saved models to save space
    load_best_model_at_end=True,  # Automatically load the best model
    metric_for_best_model="loss", # Validation loss as the key metric
    report_to="wandb",           # Log to WandB
)

# Initialize WandB
wandb.init(project="nlp_power_task", mode="online")

# Optimizer and Learning Rate Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
steps_per_epoch = len(train_dataset) // training_args.per_device_train_batch_size
num_training_steps = steps_per_epoch * training_args.num_train_epochs

lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3149,0.303873,0.925373,0.833147,0.876841,0.883044


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3149,0.303873,0.925373,0.833147,0.876841,0.883044
2,0.1863,0.241016,0.921387,0.892497,0.906712,0.908226
3,0.1001,0.275388,0.946429,0.890258,0.917484,0.919978


TrainOutput(global_step=1509, training_loss=0.25729318590176825, metrics={'train_runtime': 925.1818, 'train_samples_per_second': 52.131, 'train_steps_per_second': 1.631, 'total_flos': 1.269010931106816e+16, 'train_loss': 0.25729318590176825, 'epoch': 3.0})

In [19]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.24101638793945312, 'eval_precision': 0.9213872832369943, 'eval_recall': 0.8924972004479284, 'eval_f1': 0.906712172923777, 'eval_accuracy': 0.9082260772243984, 'eval_runtime': 11.0341, 'eval_samples_per_second': 161.952, 'eval_steps_per_second': 10.15, 'epoch': 3.0}
