In [None]:
import pandas as pd
import numpy as np
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader
from transformers import AdamW,get_cosine_with_hard_restarts_schedule_with_warmup
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
def get_dataset(path=r"/content/data.csv"):
    """Load the labelled corpus from a CSV file.

    Args:
        path: Location of the CSV file. It must contain a ``text`` column and
            a ``label`` column. Defaults to the path the notebook originally
            hard-coded, so existing ``get_dataset()`` calls keep working.

    Returns:
        A ``(text, label)`` tuple of two plain Python lists, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by pandas).
        KeyError: If either expected column is missing.
    """
    df = pd.read_csv(path)
    return df['text'].tolist(), df['label'].tolist()

def CleanTxt(text):
    """Normalise one tweet before tokenization.

    Steps, in order:
      1. Remove ``http://`` / ``https://`` URLs.
      2. Remove scheme-less ``bit.ly/...`` short links.
      3. Remove ``@mentions`` (letters, digits and underscores).
      4. Remove stand-alone single ASCII letters.
      5. Squeeze any character repeated 3+ times down to two ("oooo" -> "oo").
      6. Lowercase using Turkish casing rules for the dotted/dotless I.
      7. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lowercased string.
    """
    # Full URLs must go first: stripping "bit.ly/..." out of "https://bit.ly/x"
    # beforehand would leave a bare "https://" that the URL pattern cannot match.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'bit\.ly/\S+', '', text)
    # Underscore is included so "@user_name" does not leave "_name" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    text = re.sub(r'(.)\1+', r'\1\1', text)
    # str.lower() is not Turkish-aware: 'I' would become 'i', and 'İ' would
    # become 'i' followed by a combining dot (U+0307). Map both explicitly.
    text = text.replace('I', 'ı').replace('İ', 'i').lower()
    # Whitespace is normalised last so the gaps left by the removals above
    # do not survive as double, leading or trailing spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the raw tweets with their labels, then normalise every tweet.
text, label = get_dataset()

cleaned_text = list(map(CleanTxt, text))

In [None]:
def _encode_texts(tokenizer, texts, max_length):
    """Tokenize `texts` in one batch into fixed-length id and mask tensors."""
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,
        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length=True`, so every row is exactly `max_length` long.
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']


def get_input_parameters(tokenizer_name, x_train, x_test, y_train, y_test, batch_size, max_length=128):
    """Tokenize the train/test texts and wrap them in DataLoaders.

    Args:
        tokenizer_name: Name or path passed to ``BertTokenizer.from_pretrained``.
        x_train: Training texts.
        x_test: Held-out texts.
        y_train: Labels aligned with ``x_train``.
        y_test: Labels aligned with ``x_test``.
        batch_size: Batch size used for both loaders.
        max_length: Sequence length every example is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        ``(train_dataloader, validation_dataloader)``. Each batch is a tuple
        ``(input_ids, attention_mask, labels)``. The training loader shuffles;
        the validation loader keeps dataset order.
    """
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    input_ids_train, attention_masks_train = _encode_texts(tokenizer, x_train, max_length)
    input_ids_test, attention_masks_test = _encode_texts(tokenizer, x_test, max_length)

    labels_train = torch.tensor(y_train)
    labels_test = torch.tensor(y_test)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

    train_dataloader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    # No shuffling here: positions in this loader are later used to look up
    # examples in the original test split.
    validation_dataloader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

    return train_dataloader, validation_dataloader

In [None]:
def GetModel(model_name, learning_rate, epochs, train_dataloader):
    """Build the classifier, its optimizer and its learning-rate scheduler.

    Args:
        model_name: Name or path passed to
            ``BertForSequenceClassification.from_pretrained``.
        learning_rate: Peak learning rate for AdamW.
        epochs: Number of training epochs; sets both the total number of
            scheduler steps and the number of cosine restart cycles.
        train_dataloader: Training loader; only its length (batches per
            epoch) is used here.

    Returns:
        ``(model, optimizer, scheduler)``, with the model already moved to the
        module-level ``device``.
    """
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    # `transformers.AdamW` is deprecated in favour of the PyTorch optimizer.
    # `eps` and `weight_decay` are pinned to the deprecated class's defaults
    # (torch's own defaults are 1e-8 and 0.01) to keep the hyper-parameters
    # that were in effect before.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        eps=1e-6,
        weight_decay=0.0,
    )

    # The scheduler is stepped once per batch, so it needs the total number
    # of optimizer steps over the whole run.
    total_steps = len(train_dataloader) * epochs

    scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_cycles=epochs,
        num_training_steps=total_steps,
    )
    return model, optimizer, scheduler

In [None]:
def validate(model, validation_dataloader, device):
    """Evaluate `model` on `validation_dataloader` and collect metrics.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose output exposes ``.loss``
            and ``.logits``.
        validation_dataloader: Loader yielding ``(input_ids, attention_mask,
            labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_val_loss, avg_val_accuracy, precision, recall, f_score,
        misclassified_indices)`` where

        * ``avg_val_loss`` is the mean of the per-batch losses,
        * ``avg_val_accuracy`` is correct predictions / dataset size,
        * ``precision``, ``recall`` and ``f_score`` use ``average='binary'``,
        * ``misclassified_indices`` are positions in iteration order of the
          wrongly predicted examples. They map back to the underlying dataset
          only when the loader does not shuffle.
    """
    model.eval()
    total_eval_loss = 0.0
    total_correct = 0
    all_preds = []
    all_labels = []
    misclassified_indices = []
    # Number of examples seen so far. Using a running offset avoids relying
    # on `dataloader.batch_size` to recover each example's position.
    offset = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        total_eval_loss += outputs.loss.item()

        # Move predictions and labels to the CPU once per batch, then work on
        # whole tensors instead of comparing elements one by one in Python.
        preds = torch.argmax(outputs.logits, dim=1).cpu()
        labels = b_labels.cpu()
        wrong = preds != labels

        total_correct += int((~wrong).sum().item())
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())
        misclassified_indices.extend((torch.nonzero(wrong).flatten() + offset).tolist())
        offset += labels.size(0)

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    avg_val_accuracy = total_correct / len(validation_dataloader.dataset)

    precision, recall, f_score, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary')

    return avg_val_loss, avg_val_accuracy, precision, recall, f_score, misclassified_indices

In [None]:
def Train_Bert(model, optimizer, scheduler, train_dataloader, epochs, validation_dataloader):
    """Fine-tune `model` and validate it after every epoch.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose output exposes ``.loss``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler, also stepped once per batch.
        train_dataloader: Loader yielding ``(input_ids, attention_mask,
            labels)`` batches.
        epochs: Number of passes over ``train_dataloader``.
        validation_dataloader: Loader handed to ``validate`` after each epoch.

    Returns:
        ``(model, f_score, misclassified_indices)`` taken from the validation
        run of the last epoch. When ``epochs`` is 0 no validation happens and
        the result is ``(model, None, [])``.
    """
    # Defined up front so the return statement is valid even when the loop
    # body never runs (epochs == 0).
    f_score = None
    misclassified_indices = []

    for epoch_i in range(epochs):
        model.train()
        total_train_loss = 0

        for batch in train_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        print(f"Epoch {epoch_i + 1} / {epochs} - Average training loss: {avg_train_loss}")
        val_loss, val_accuracy, precision, recall, f_score, misclassified_indices = validate(model, validation_dataloader, device)
        print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy},Validation f1-score: {f_score}")
    return model, f_score, misclassified_indices

In [None]:
# Hold out 20% of the cleaned tweets for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_text,
    label,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the training and validation DataLoaders with a batch size of 32.
train_dataloader, validation_dataloader = get_input_parameters(
    tokenizer_name="ytu-ce-cosmos/turkish-base-bert-uncased",
    x_train=X_train,
    x_test=X_test,
    y_train=y_train,
    y_test=y_test,
    batch_size=32,
)

In [None]:
# Instantiate the classifier, optimizer and scheduler for a 2-epoch run at lr = 2e-5.
model, optimizer, scheduler = GetModel(
    model_name="ytu-ce-cosmos/turkish-base-bert-uncased",
    learning_rate=2e-5,
    epochs=2,
    train_dataloader=train_dataloader,
)

In [None]:
# Fine-tune for 2 epochs and keep the last epoch's F1 score and misclassified positions.
model, f_score, misclassified_indices = Train_Bert(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    train_dataloader=train_dataloader,
    epochs=2,
    validation_dataloader=validation_dataloader,
)


Epoch 1 / 2 - Average training loss: 0.24276778167465105
Validation Loss: 0.20646962201539076, Validation Accuracy: 0.9234669811320755,Validation f1-score: 0.9157033380958567
Epoch 2 / 2 - Average training loss: 0.1871942377693178
Validation Loss: 0.2014674870051303, Validation Accuracy: 0.9264150943396227,Validation f1-score: 0.9200614911606457


In [None]:
# Size of the held-out split next to the number of examples the model got wrong.
n_test, n_wrong = len(X_test), len(misclassified_indices)
print(n_test, n_wrong)


8480 624


In [None]:
# Show each misclassified tweet followed by its gold label on the next line.
for wrong_idx in misclassified_indices:
    print(X_test[wrong_idx], y_test[wrong_idx], sep="\n")

kızılay sana bundan sonra 1 damla kan da vermem bağış da vermem.yazıklar olsun..
1
koskoca milli takımın teknik dörektörünün ağzından çıkanlara bak! sonra da milli maçlara herkesi bekliyoruz, kenetlenelim birleşelim. siz önce üstünüzdeki formaları çıkarın sonra taraftarları eleştirin.
1
bunun için miydi tüm uğraş yazık verilen oylara yazık
1
yazık biz senden bizi kollamanı beklemiyoruz adil olmanı hakkaniyetli olmanı istiyoruz .
1
anadolu ajansına soda gönderelim hazmedemediği sonucu açıklamıyor galiba #seçim2019
1
fırsatı olan herkes izlemeli şu an belediye meclisi toplantısını. chpliler çok güzel adamlarsınız be! sıfır nefret diliyle hala açıklama yapıyorlar mis gibi. özlediğimiz hareketler bunlar
0
ayarları bozan adam her şey çok güzel olacak
0
farkettiniz mi hakan çalhanoğlunun yokluğu çok belli oldu çünkü iyi oynadık
1
belki arkadaş benim gibi şizofren 😂
1
bana sevgili degilher şeyi beraber yapabilecegim manyak bi insan lazım,
1
bende bunu anlamıyorun adamı bu kadar gözünde büyüte