<a href="https://colab.research.google.com/github/EfeIlhan22/My-Projects/blob/NLP/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Bölüm 1: Verilerin Yüklenmesi ve Hazırlanması
import os
!pip install datasets
!pip install transformers scikit-learn torch pandas tqdm
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tqdm import tqdm



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Verilerin Yüklenmesi
data = []
labels = []

# Ana klasör yolu
main_dir = "/content/drive/MyDrive/makaleler-yazarlar"

# Her label için dosyaları gez
for label, folder in enumerate(os.listdir(main_dir)):
    folder_path = os.path.join(main_dir, folder)
    if os.path.isdir(folder_path):
        for txt_file in os.listdir(folder_path):
            if txt_file.endswith(".txt"):
                file_path = os.path.join(folder_path, txt_file)
                with open(file_path, 'r', encoding='ISO-8859-9') as file:
                    text = file.read().strip()
                    data.append(text)
                    labels.append(label)

# Verileri DataFrame'e dönüştür
df = pd.DataFrame({"text": data, "label": labels})


In [3]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

In [8]:
# Bölüm 2: Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# BERTürk Tokenizer ve Model
model_name = "dbmdz/bert-base-turkish-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Bölüm 3: Eğitim ve Değerlendirme Fonksiyonları
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# GPU kullanımı kontrolü
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

results = []

In [28]:
from sklearn.metrics import classification_report
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate(model, dataloader, device, class_names):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Detaylı metrik raporu
    report = classification_report(
        all_labels,
        all_preds,
        target_names=class_names,
        output_dict=True,
        zero_division=0  # Sıfır bölme hatasını önle
    )

    # DataFrame'e dönüştürme
    df_report = pd.DataFrame(report).transpose()

    # İstenen formatta tablo oluşturma
    performance_df = pd.DataFrame(columns=["Class 1", "Class 2", "...", "Class 29", "Class 30", "Average"])

    # Her sınıf için değerleri doldurma
    for i in range(len(class_names)):
        class_name = f"Class {i+1}"
        performance_df.loc["Precision", class_name] = df_report.loc[class_name, "precision"]
        performance_df.loc["Recall", class_name] = df_report.loc[class_name, "recall"]
        performance_df.loc["F-Score", class_name] = df_report.loc[class_name, "f1-score"]

    # Ortalama değerleri ekleme
    performance_df["Average"] = [
        df_report.loc["macro avg", "precision"],
        df_report.loc["macro avg", "recall"],
        df_report.loc["macro avg", "f1-score"]
    ]

    accuracy = accuracy_score(all_labels, all_preds)
    return performance_df.round(4), accuracy  # İki değer döndür

In [30]:
def main():
    # Parametreler
    BATCH_SIZE = 16
    EPOCHS = 10
    MAX_LENGTH = 128
    LR = 5e-5
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    fold_accuracies = []

    texts = df["text"].tolist()
    labels = df["label"].tolist()

    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
        print(f"\nFold {fold + 1}")

        # Verileri ayırma
        train_texts = [texts[i] for i in train_idx]
        train_labels = [labels[i] for i in train_idx]
        val_texts = [texts[i] for i in val_idx]
        val_labels = [labels[i] for i in val_idx]

        train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
        val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)

        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

        # Model ve optimizer tanımlama
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))
        model.to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=LR)

        # Eğitim döngüsü
        for epoch in range(EPOCHS):
            print(f"\nEpoch {epoch + 1}/{EPOCHS}")
            train_loss = train(model, train_loader, optimizer, DEVICE)
            print(f"Training Loss: {train_loss}")

        class_names = [f"Class {i+1}" for i in range(len(set(labels)))]

        # Değerlendirme
        performance_df, accuracy = evaluate(model, val_loader, DEVICE, class_names)
        print(f"Fold {fold + 1} Accuracy: {accuracy}")
        print(f"\\nFold {fold + 1} Performance Report:")
        print(performance_df.to_markdown())
        fold_accuracies.append(accuracy)

    # Sonuçlar
    print(f"\nAverage Accuracy: {sum(fold_accuracies) / len(fold_accuracies)}")


if __name__ == "__main__":
    main()


Fold 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 3.1624766540527345

Epoch 2/10


Training: 100%|██████████| 75/75 [00:37<00:00,  2.01it/s]


Training Loss: 1.8925074593226114

Epoch 3/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 1.0096226676305136

Epoch 4/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.44735352536042533

Epoch 5/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.17448540012041727

Epoch 6/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.99it/s]


Training Loss: 0.08575300777951876

Epoch 7/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.07459539656837781

Epoch 8/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.03265005134046078

Epoch 9/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.023335045178731282

Epoch 10/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.98it/s]


Training Loss: 0.0208489145586888


Evaluating: 100%|██████████| 19/19 [00:05<00:00,  3.71it/s]


Fold 1 Accuracy: 0.8433333333333334
\nFold 1 Performance Report:
|           |   Class 1 |   Class 2 |   ... |   Class 29 |   Class 30 |   Average |   Class 3 |   Class 4 |   Class 5 |   Class 6 |   Class 7 |   Class 8 |   Class 9 |   Class 10 |   Class 11 |   Class 12 |   Class 13 |   Class 14 |   Class 15 |   Class 16 |   Class 17 |   Class 18 |   Class 19 |   Class 20 |   Class 21 |   Class 22 |   Class 23 |   Class 24 |   Class 25 |   Class 26 |   Class 27 |   Class 28 |
|:----------|----------:|----------:|------:|-----------:|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|
| Precision |  0.818182 |       0.6 |   nan |          1 |   0.75     |    0.855  |    0.6667 |    0.7273

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 2.918170134226481

Epoch 2/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.99it/s]


Training Loss: 1.5027977959314982

Epoch 3/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.98it/s]


Training Loss: 0.677611571153005

Epoch 4/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.2662425084908803

Epoch 5/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.12044364030162494

Epoch 6/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.05789564847946167

Epoch 7/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.98it/s]


Training Loss: 0.03308805048465729

Epoch 8/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.024071048175295193

Epoch 9/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.01924826775987943

Epoch 10/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.015859468368192513


Evaluating: 100%|██████████| 19/19 [00:04<00:00,  4.23it/s]


Fold 2 Accuracy: 0.8466666666666667
\nFold 2 Performance Report:
|           |   Class 1 |   Class 2 |   ... |   Class 29 |   Class 30 |   Average |   Class 3 |   Class 4 |   Class 5 |   Class 6 |   Class 7 |   Class 8 |   Class 9 |   Class 10 |   Class 11 |   Class 12 |   Class 13 |   Class 14 |   Class 15 |   Class 16 |   Class 17 |   Class 18 |   Class 19 |   Class 20 |   Class 21 |   Class 22 |   Class 23 |   Class 24 |   Class 25 |   Class 26 |   Class 27 |   Class 28 |
|:----------|----------:|----------:|------:|-----------:|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|
| Precision |  0.833333 |  0.857143 |   nan |   0.769231 |        0.7 |    0.8594 |    0.75   |    1     

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 3.2052367369333905

Epoch 2/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 1.7871306864420573

Epoch 3/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.99it/s]


Training Loss: 0.8029444003105164

Epoch 4/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.32992253283659617

Epoch 5/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.16548511882623038

Epoch 6/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.08718067198991776

Epoch 7/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.03858494537572066

Epoch 8/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.98it/s]


Training Loss: 0.025712057625253994

Epoch 9/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.95it/s]


Training Loss: 0.02055179428309202

Epoch 10/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.016844436054428417


Evaluating: 100%|██████████| 19/19 [00:04<00:00,  4.24it/s]


Fold 3 Accuracy: 0.85
\nFold 3 Performance Report:
|           |   Class 1 |   Class 2 |   ... |   Class 29 |   Class 30 |   Average |   Class 3 |   Class 4 |   Class 5 |   Class 6 |   Class 7 |   Class 8 |   Class 9 |   Class 10 |   Class 11 |   Class 12 |   Class 13 |   Class 14 |   Class 15 |   Class 16 |   Class 17 |   Class 18 |   Class 19 |   Class 20 |   Class 21 |   Class 22 |   Class 23 |   Class 24 |   Class 25 |   Class 26 |   Class 27 |   Class 28 |
|:----------|----------:|----------:|------:|-----------:|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|
| Precision |  0.692308 |  0.571429 |   nan |   0.75     |        0.7 |    0.8629 |       0.7 |    0.8182 |    0.8333 |

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 3.0893622493743895

Epoch 2/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.95it/s]


Training Loss: 1.6629571032524109

Epoch 3/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.8591389807065328

Epoch 4/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.98it/s]


Training Loss: 0.37683771232763924

Epoch 5/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.95it/s]


Training Loss: 0.16665976077318193

Epoch 6/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.95it/s]


Training Loss: 0.10316832462946574

Epoch 7/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.95it/s]


Training Loss: 0.050289692531029384

Epoch 8/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.98it/s]


Training Loss: 0.030771365885933242

Epoch 9/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.02225400355954965

Epoch 10/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.95it/s]


Training Loss: 0.018091195411980152


Evaluating: 100%|██████████| 19/19 [00:04<00:00,  4.05it/s]


Fold 4 Accuracy: 0.86
\nFold 4 Performance Report:
|           |   Class 1 |   Class 2 |   ... |   Class 29 |   Class 30 |   Average |   Class 3 |   Class 4 |   Class 5 |   Class 6 |   Class 7 |   Class 8 |   Class 9 |   Class 10 |   Class 11 |   Class 12 |   Class 13 |   Class 14 |   Class 15 |   Class 16 |   Class 17 |   Class 18 |   Class 19 |   Class 20 |   Class 21 |   Class 22 |   Class 23 |   Class 24 |   Class 25 |   Class 26 |   Class 27 |   Class 28 |
|:----------|----------:|----------:|------:|-----------:|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|
| Precision |         1 |  0.666667 |   nan |   0.615385 |       1    |    0.8698 |    0.7143 |    0.7778 |         1 |

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 3.39869966506958

Epoch 2/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.95it/s]


Training Loss: 2.202005575497945

Epoch 3/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 1.0678974016507468

Epoch 4/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.47772255261739094

Epoch 5/10


Training: 100%|██████████| 75/75 [00:37<00:00,  1.98it/s]


Training Loss: 0.23019403596719107

Epoch 6/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.09537855063875517

Epoch 7/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.051612428824106854

Epoch 8/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.96it/s]


Training Loss: 0.07348038929204147

Epoch 9/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.02662101442615191

Epoch 10/10


Training: 100%|██████████| 75/75 [00:38<00:00,  1.97it/s]


Training Loss: 0.019606179719169933


Evaluating: 100%|██████████| 19/19 [00:05<00:00,  3.40it/s]


Fold 5 Accuracy: 0.8733333333333333
\nFold 5 Performance Report:
|           |   Class 1 |   Class 2 |   ... |   Class 29 |   Class 30 |   Average |   Class 3 |   Class 4 |   Class 5 |   Class 6 |   Class 7 |   Class 8 |   Class 9 |   Class 10 |   Class 11 |   Class 12 |   Class 13 |   Class 14 |   Class 15 |   Class 16 |   Class 17 |   Class 18 |   Class 19 |   Class 20 |   Class 21 |   Class 22 |   Class 23 |   Class 24 |   Class 25 |   Class 26 |   Class 27 |   Class 28 |
|:----------|----------:|----------:|------:|-----------:|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|-----------:|
| Precision |  0.888889 |  0.727273 |   nan |          1 |   0.888889 |    0.8806 |    0.6429 |         1