In [2]:
import re
import os
import math
import json
import string
import random
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback

# Hugging face dataset import for data loading
from datasets import Dataset

# Sklearn for metric calculations and other preporcessing tasks
from sklearn.metrics import f1_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")

2024-09-16 15:36:13.676505: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-16 15:36:13.680648: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 15:36:13.690208: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-16 15:36:13.705675: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-16 15:36:13.710244: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attemptin

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [4]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Preprocessing

In [5]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords.words('turkish')])
    # return " ".join([word for word in stopword_remover.drop_stop_words(str(text).split())])

def remove_punctuation(text):
    puncs = '’!"#$%&\'*+:;<=>?...@[\\]^_`{|}~“”'

    # Remove punctuation 
    return text.translate(str.maketrans('', '', puncs)) 

def preprocess(text):
    
    text = text.replace("\t", " ")
    text = text.replace("\n", " ")
    text = remove_punctuation(text)
    
    # text = remove_stopwords(text)
    
    # Remove digits
    text = re.sub(r'[0-9]{2}', '', text)
    remove_digits = str.maketrans('', '', string.digits)
    text = text.translate(remove_digits)
    
    text = re.sub(' +', ' ', text) # remove extra whitespaces
    text = re.sub(r'([^\w\s])\1+', r'\1', text)
    text = re.sub(r'\s?([^\w\s])\s?', ' ', text)
    text = re.sub(r'\b\w\b', '', text)
    text = re.sub(' +', ' ', text)
    return text.strip()

In [6]:
df_train = pd.read_csv("data/stance_train.csv")
df_val = pd.read_csv("data/stance_val.csv")
df_test = pd.read_csv("data/stance_test.csv")

MAX_LEN = 512

# Check how many labels are there in the dataset
unique_labels = df_train.labels.unique().tolist()

# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(ids_to_labels)

df_train['labels'] = df_train.labels.apply(lambda x: labels_to_ids[x]).tolist()
df_val['labels'] = df_val.labels.apply(lambda x: labels_to_ids[x]).tolist()
df_test['labels'] = df_test.labels.apply(lambda x: labels_to_ids[x]).tolist()

df_train.shape, df_val.shape, df_test.shape

{0: 'Negative', 1: 'Neutral', 2: 'Positive'}


((6059, 2), (674, 2), (1189, 2))

In [7]:
df_train.labels.value_counts()

labels
2    2236
0    2181
1    1642
Name: count, dtype: int64

In [8]:
df_train.head()

Unnamed: 0,text,labels
0,ben bunu tam gün önce paylaştım daha yeni duym...,0
1,harita teknikerleri olarak aldığımız yüksek pu...,1
2,bakan özhaseki imar barışının detaylarını açık...,1
3,allah isinizirasgetirsin allah senden raraziolsun,2
4,sayın bakanım biz ek gösterge mağdurlarıyız ta...,1


# Check Token Coverage

* The decision to weather we shall go for a pretrained tokenizer or shall we retrain the tokenizer can be based on tokens coverage. 

In [9]:
# MODEL_NAME = 'dbmdz/bert-base-turkish-cased'
# MODEL_NAME = 'dbmdz/distilbert-base-turkish-cased'
# MODEL_NAME = 'dbmdz/convbert-base-turkish-cased'
# MODEL_NAME = 'dbmdz/electra-base-turkish-cased-discriminator'
# MODEL_NAME = 'loodos/albert-base-turkish-uncased'
# MODEL_NAME = 'burakaytan/roberta-base-turkish-uncased'
MODEL_NAME = "FacebookAI/xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=MAX_LEN)

total_token_count = 0
unk_token_count = 0

for index, dp in df_train.iterrows():
    
    tokenized_text = tokenizer.tokenize(str(dp['text']))
    unk_token_count += len([i for i in tokenized_text if i[0:2] == "##"])
    total_token_count += len(tokenized_text)

print (f"Percentage of tokens unknown: {(100.0 * unk_token_count/total_token_count)}")

Percentage of tokens unknown: 0.0


## Preprocessing Data

* Load data into Dataset class, and tokenize the text

In [9]:
from vnlp import Normalizer
def lower_case_func(text):
    return Normalizer.lower_case(text)

def preprocess_function(examples, tokenizer):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length = MAX_LEN, return_tensors='pt')

if "uncased" in MODEL_NAME:
    df_train['text'] = df_train.text.apply(lower_case_func).tolist()
    df_val['text'] = df_val.text.apply(lower_case_func).tolist()
    df_test['text'] = df_test.text.apply(lower_case_func).tolist()

dataset_train = Dataset.from_pandas(df_train[["text", "labels"]], split="train")
dataset_val = Dataset.from_pandas(df_val[["text", "labels"]], split="val")
dataset_test = Dataset.from_pandas(df_test[["text", "labels"]], split="test")

train_dataset = dataset_train.map(preprocess_function, batched=True, num_proc=2, remove_columns=["text"], fn_kwargs={"tokenizer":tokenizer})
val_dataset = dataset_val.map(preprocess_function, batched=True, remove_columns=["text"], fn_kwargs={"tokenizer":tokenizer})
test_dataset = dataset_test.map(preprocess_function, batched=True, num_proc=2, remove_columns=["text"], fn_kwargs={"tokenizer":tokenizer})

2024-09-12 17:01:29.749387: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
Map (num_proc=2): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6059/6059 [00:01<00:00, 3668.82 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 674/674 [00:00<00:00, 3855.61 examples/s]
Map (num_proc=2): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1189/1189 [00:00<00:00, 1286.23 examples/s]


In [10]:
df_train.head(5)

Unnamed: 0,text,labels
0,ben bunu tam gün önce paylaştım daha yeni duym...,0
1,harita teknikerleri olarak aldığımız yüksek pu...,1
2,bakan özhaseki imar barışının detaylarını açık...,1
3,allah isinizirasgetirsin allah senden raraziolsun,2
4,sayın bakanım biz ek gösterge mağdurlarıyız ta...,1


## Train model

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           num_labels=len(unique_labels))

EPOCH = 10
warmup_steps = math.ceil(len(train_dataset) * EPOCH * 0.1)
BATCH_SIZE = 32
LR = 5e-5
WD = 0.003

training_args = TrainingArguments(
    num_train_epochs=EPOCH,
    
    # Optimizer Hyperparameters
    optim = "adamw_torch",
    learning_rate=LR,
    weight_decay=WD,
    warmup_steps=warmup_steps,
    
    # Logging Hyperparameters
    run_name="stance-detection",
    output_dir="stance-chkpt",
    overwrite_output_dir=True,
    logging_steps=250,
    evaluation_strategy="steps",
    save_strategy="steps",
    
    # Wieght and Biases
    report_to="none",
    
    # General Hyperparameters
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    
    load_best_model_at_end=True,
    push_to_hub=False,
    save_total_limit=1,
    # gradient_checkpointing=True,
    do_train=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
trainer.train(resume_from_checkpoint=False)

Step,Training Loss,Validation Loss,Accuracy,F1
250,1.0749,0.94095,0.586053,0.461221
500,0.7793,0.623116,0.744807,0.731135
750,0.6213,0.585877,0.777448,0.766948
1000,0.5197,0.587895,0.783383,0.775567
1250,0.4534,0.605587,0.790801,0.786615
1500,0.3863,0.574718,0.793769,0.790433
1750,0.3174,0.703417,0.778932,0.76991


TrainOutput(global_step=1900, training_loss=0.5682379833020662, metrics={'train_runtime': 1903.6362, 'train_samples_per_second': 31.829, 'train_steps_per_second': 0.998, 'total_flos': 1.594204198013952e+16, 'train_loss': 0.5682379833020662, 'epoch': 10.0})

In [14]:
print(f"Results for {MODEL_NAME}")
results = trainer.evaluate(eval_dataset=test_dataset)
for key, value in results.items():
    print(f"{key} = {value}")

Results for FacebookAI/xlm-roberta-base


eval_loss = 0.5669955015182495
eval_accuracy = 0.8031959629941127
eval_f1 = 0.7986781397182857
eval_runtime = 11.6316
eval_samples_per_second = 102.222
eval_steps_per_second = 3.267
epoch = 10.0


In [15]:
# # SAVE MODEL
prefix = MODEL_NAME.replace("dbmdz/","").replace("loodos/","").replace("burakaytan/","").replace("FacebookAI/","")
save_path = f"trained-models/{prefix}"
save_path

'trained-models/xlm-roberta-base'

In [16]:
trainer.save_model(f'{save_path}')
# Save Parameters
with open(f"{save_path}/parameters.txt", "w+", encoding="utf-8") as f:
    f.write(f"MODEL NAME: {MODEL_NAME}\n")
    f.write(f"MAX LEN: {MAX_LEN}\n")
    f.write(f"EPOCH: {EPOCH}\n")
    f.write(f"BATCH SIZE: {BATCH_SIZE}\n")
    f.write(f"LR: {LR}\n")
    f.write(f"WD: {WD}\n")

In [17]:
with open(f"{save_path}/id2label.json", "w+", encoding="utf-8") as fp:
    json.dump(ids_to_labels, fp, indent=4)

with open(f"{save_path}/label2id.json", "w+", encoding="utf-8") as fp:
    json.dump(labels_to_ids, fp, indent=4)

# Evaluate

In [10]:
# device = "cpu"
def load_model_and_tokenizer(model_path, num_labels, max_len, device="cuda"):
    # LOAD MODEL AND TOKENIZER
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=max_len)
    return tokenizer, model

In [None]:
model_names = ['dbmdz/bert-base-turkish-cased', 'dbmdz/distilbert-base-turkish-cased', 'dbmdz/convbert-base-turkish-cased', 
               'dbmdz/electra-base-turkish-cased-discriminator', 'loodos/albert-base-turkish-uncased', 
               'burakaytan/roberta-base-turkish-uncased', 'FacebookAI/xlm-roberta-base']

for mn in model_names:
    # # SAVE MODEL
    prefix = mn.replace("dbmdz/","").replace("loodos/","").replace("burakaytan/","").replace("FacebookAI/","")
    load_path = f"trained-models/{prefix}"
    print(load_path)

    df_train = pd.read_csv("data/stance_train.csv")
    df_val = pd.read_csv("data/stance_val.csv")
    df_test = pd.read_csv("data/stance_test.csv")
    
    MAX_LEN = 512
    
    # Check how many labels are there in the dataset
    unique_labels = df_train.labels.unique().tolist()
    
    # Map each label into its id representation and vice versa
    labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
    
    df_train['labels'] = df_train.labels.apply(lambda x: labels_to_ids[x]).tolist()
    df_val['labels'] = df_val.labels.apply(lambda x: labels_to_ids[x]).tolist()
    df_test['labels'] = df_test.labels.apply(lambda x: labels_to_ids[x]).tolist()

    if "uncased" in mn:
        df_train['text'] = df_train.text.apply(lower_case_func).tolist()
        df_val['text'] = df_val.text.apply(lower_case_func).tolist()
        df_test['text'] = df_test.text.apply(lower_case_func).tolist()

    dataset_train = Dataset.from_pandas(df_train[["text", "labels"]], split="train")
    dataset_val = Dataset.from_pandas(df_val[["text", "labels"]], split="val")
    dataset_test = Dataset.from_pandas(df_test[["text", "labels"]], split="test")

    tokenizer, loaded_model = load_model_and_tokenizer(load_path, len(labels_to_ids), MAX_LEN)

    train_dataset = dataset_train.map(preprocess_function, batched=True, num_proc=2, remove_columns=["text"], fn_kwargs={"tokenizer":tokenizer})
    val_dataset = dataset_val.map(preprocess_function, batched=True, remove_columns=["text"], fn_kwargs={"tokenizer":tokenizer})
    test_dataset = dataset_test.map(preprocess_function, batched=True, num_proc=2, remove_columns=["text"], fn_kwargs={"tokenizer":tokenizer})

    trainer = Trainer(
    model=loaded_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )

    with open("./stance_detection_results", "a+") as sdr:
        sdr.write(f"Results for: {mn}\n")
        results = trainer.evaluate(eval_dataset=test_dataset)
        sdr.write(f"Accuracy = {results['eval_accuracy']:.2f}\n")
        sdr.write(f"Macro F1 = {results['eval_f1']:.2f}\n")
        sdr.write("*"*60)
        sdr.write("\n")

In [56]:
mn = 'dbmdz/electra-base-turkish-cased-discriminator'

df_train = pd.read_csv("data/stance_train.csv")
MAX_LEN = 512

# Check how many labels are there in the dataset
unique_labels = df_train.labels.unique().tolist()

# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
# ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}

# # SAVE MODEL
prefix = mn.replace("dbmdz/","").replace("loodos/","").replace("burakaytan/","").replace("FacebookAI/","")
load_path = f"trained-models/{prefix}"
load_path

'trained-models/electra-base-turkish-cased-discriminator'

In [57]:
tokenizer, loaded_model = load_model_and_tokenizer(load_path, len(labels_to_ids), MAX_LEN)

In [58]:
tokenizer.push_to_hub("byunal/electra-base-turkish-cased-stance")

CommitInfo(commit_url='https://huggingface.co/byunal/electra-base-turkish-cased-stance/commit/28d8c50e179747fa4a68f94a46dc9b9ccf9fe37a', commit_message='Upload tokenizer', commit_description='', oid='28d8c50e179747fa4a68f94a46dc9b9ccf9fe37a', pr_url=None, pr_revision=None, pr_num=None)

In [59]:
loaded_model.push_to_hub("byunal/electra-base-turkish-cased-stance")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/byunal/electra-base-turkish-cased-stance/commit/24e2823093c884c8484f2739fc3721a5269f4ba7', commit_message='Upload ElectraForSequenceClassification', commit_description='', oid='24e2823093c884c8484f2739fc3721a5269f4ba7', pr_url=None, pr_revision=None, pr_num=None)