In [1]:
import numpy as np
import pandas as pd
import os
from datasets import load_dataset, load_metric
import pickle
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 

os.environ["WANDB_DISABLED"] = "true"

In [2]:
train_dataset = load_dataset('csv', data_files='../dataset_raw.csv',split="train")

In [3]:
metric = load_metric('glue', 'sst2')

  metric = load_metric('glue', 'sst2')


In [4]:
train_dataset = train_dataset.rename_columns({"category_num" : "label","example" : "sentence"})
train_dataset = train_dataset.train_test_split(test_size=0.1)

In [5]:
train_dataset['train'][0]

{'sentence': 'in Muslim Pakistan, clearly an ethnic nationalist rather than jihadi struggle: Islamabad in fact accuses India of supporting it. # The United States has virtually nothing to do with the Baluch. Most Americans have never heard of them. What motive might these people have for attacking us, and – above all – why should this group be at the core of the " new terrorism "? Just to ask these questions, however, is to dissent sharply from the current intelligence orthodoxy; and that is undoubtedly a major reason why this Baluch connection has been essentially buried from public view. # Ramzi Yousef was arrested in Islamabad in February 1995, following his aborted attempt to bomb 12 U.S. airliners. The New York Times\'s John Burns reported then, long before this issue became so thoroughly politicized: # The Pakistan newspaper, The News, which is said to have good sources in the Pakistani military\'s Inter-Services Intelligence agency, said that " if features could betray geography

In [6]:
model_checkpoint = "bert-base-uncased"
batch_size = 32

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [13]:
def preprocess_function(examples):
    return tokenizer(examples['sentence'], truncation=True)

In [14]:
encoded_dataset = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/6516 [00:00<?, ? examples/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
num_labels = 10
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
model_name = model_checkpoint.split("/")[-1]
metric_name = 'accuracy'
args = TrainingArguments(
    f"{model_name}-finetuned-dst_clf",
    per_device_train_batch_size=batch_size,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    metric_for_best_model =metric_name,
    learning_rate=2e-5
)

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()