In [1]:

!pip install transformers datasets evaluate -q
!pip install torch torchvision torchaudio -q
!pip install scikit-learn pandas numpy -q


import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
import evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

from google.colab import files
uploaded = files.upload()

file_name = list(uploaded.keys())[0]

Saving heldout_pair_data.jsonlist to heldout_pair_data.jsonlist


In [3]:

import json
data = [json.loads(line) for line in open(file_name, 'r')]
df = pd.DataFrame(data)
print("Columns:", df.columns.tolist())


pos_df = pd.DataFrame({
    "text": df["positive"],
    "label": 1
})
neg_df = pd.DataFrame({
    "text": df["negative"],
    "label": 0
})

combined_df = pd.concat([pos_df, neg_df], ignore_index=True)
combined_df = combined_df.dropna(subset=["text"]).reset_index(drop=True)


combined_df["text"] = combined_df["text"].apply(
    lambda x: " ".join(x) if isinstance(x, list) else str(x)
)

print("Total samples:", len(combined_df))
combined_df.head()

Columns: ['op_author', 'op_text', 'op_title', 'positive', 'negative', 'op_name']
Total samples: 1614


Unnamed: 0,text,label
0,"{'ancestor': 't1_cundk5r', 'author': 'ghoooooo...",1
1,"{'ancestor': 't1_cunbkbz', 'author': 'archagon...",1
2,"{'ancestor': 't1_cun0c3t', 'author': 'huadpe',...",1
3,"{'ancestor': 't1_cumn3j4', 'author': 'ReOsIr10...",1
4,"{'ancestor': 't1_cumhf65', 'author': 'BadKeyMa...",1


In [4]:
train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    random_state=42,
    stratify=combined_df["label"]
)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [6]:
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=512)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


for name, param in model.deberta.embeddings.named_parameters():
    param.requires_grad = False
for name, param in model.deberta.encoder.layer[:6].named_parameters():
    param.requires_grad = False


batch_size = 8
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = metric.compute(predictions=predictions, references=labels)
    return acc

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=50,
)



Map:   0%|          | 0/1291 [00:00<?, ? examples/s]

Map:   0%|          | 0/323 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6883,0.623425,0.678019
2,0.5796,0.572722,0.684211
3,0.4999,0.552069,0.702786
4,0.5352,0.58922,0.674923
5,0.4725,0.594616,0.684211
6,0.4577,0.659474,0.684211


TrainOutput(global_step=972, training_loss=0.5376774734920926, metrics={'train_runtime': 1029.8373, 'train_samples_per_second': 12.536, 'train_steps_per_second': 1.573, 'total_flos': 2038094785032192.0, 'train_loss': 0.5376774734920926, 'epoch': 6.0})

In [None]:
results = trainer.evaluate()
print(" Final Evaluation Results:", results)


trainer.save_model("./deberta_persuasion_model")
print("Model saved to ./deberta_persuasion_model")

 Final Evaluation Results: {'eval_loss': 0.6117265820503235, 'eval_accuracy': 0.6965944272445821, 'eval_runtime': 6.8737, 'eval_samples_per_second': 46.991, 'eval_steps_per_second': 5.965, 'epoch': 6.0}
Model saved to ./deberta_persuasion_model
