In [2]:
from google.colab import drive
import os
drive.mount('/content/drive')
base_path = "/content/drive/MyDrive/4475"
articles_path = os.path.join(base_path, 'makaleler-yazarlar')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import re
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.nn import Dropout, Linear, CrossEntropyLoss
from torch.optim import AdamW

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(f"Device name: {torch.cuda.get_device_name(0)}")

Using device: cuda
Device name: Tesla T4


# Read Data

In [5]:
data = []
for author in os.listdir(articles_path):
    author_folder = os.path.join(articles_path, author)
    if os.path.isdir(author_folder):
        for file_name in os.listdir(author_folder):
            file_path = os.path.join(author_folder, file_name)
            with open(file_path, 'r', encoding='ISO-8859-9') as f:
                text = f.read()
                data.append({"author": author, "text": text})

df = pd.DataFrame(data)
df['author_label'] = df['author'].astype('category').cat.codes
print(f"Loaded {len(data)} articles.")

Loaded 1500 articles.


## Clean Text(just spaces)

In [6]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()  # Gereksiz boşlukları temizle
    return text

df['cleaned_text'] = df['text'].apply(clean_text)


## Tokenizer

In [7]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

# Dataset

In [8]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        return {key: val.squeeze(0) for key, val in encoding.items()}, torch.tensor(self.labels[idx])


## Model

In [9]:
def create_model(num_labels):
    model = BertForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-cased", num_labels=num_labels)
    model.classifier = torch.nn.Sequential(
        Dropout(0.4),
        Linear(model.config.hidden_size, num_labels)
    )
    return model.to(device)

In [10]:
# BERT modeli
model = BertForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-cased", num_labels=len(df['author_label'].unique()))

# Dropout ekle
from torch.nn import Dropout, Linear
num_labels = len(df['author_label'].unique())
model.classifier = torch.nn.Sequential(
    Dropout(0.3),  # Dropout
    Linear(model.config.hidden_size, num_labels)
)

# Modeli GPU'ya taşı
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model Dropout ile güncellendi.")



model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Dropout ile güncellendi.


## Train Model

In [11]:
def train_model(model, dataloader, optimizer, scheduler, loss_fn, num_epochs=10, patience=3):
    best_loss = float('inf')
    no_improvement = 0

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch in dataloader:
            inputs, labels = batch
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)

            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            epoch_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch {epoch + 1}: Loss {avg_loss:.4f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            no_improvement = 0
            print("Model improved. Saving model...")
            model.save_pretrained(f"{base_path}/bert_model_fold_{fold + 1}")
        else:
            no_improvement += 1
            if no_improvement >= patience:
                print("Early stopping triggered.")
                break

In [12]:
def evaluate_model(model, dataloader):
    model.eval()
    all_labels, all_preds = [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)

            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=-1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    return all_labels, all_preds

In [13]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_reports = []

for fold, (train_idx, test_idx) in enumerate(kf.split(df)):
    print(f"Fold {fold + 1}")
    train_texts, test_texts = df.iloc[train_idx]['cleaned_text'], df.iloc[test_idx]['cleaned_text']
    train_labels, test_labels = df.iloc[train_idx]['author_label'], df.iloc[test_idx]['author_label']

    train_dataset = TextDataset(train_texts.tolist(), train_labels.tolist())
    test_dataset = TextDataset(test_texts.tolist(), test_labels.tolist())
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=8)

    model = create_model(num_labels=len(df['author_label'].unique()))
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_fn = CrossEntropyLoss()
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=10 * len(train_dataloader))

    train_model(model, train_dataloader, optimizer, scheduler, loss_fn)
    all_labels, all_preds = evaluate_model(model, test_dataloader)

    report = classification_report(all_labels, all_preds, output_dict=True)
    all_reports.append(report)

Fold 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Loss 3.1147
Model improved. Saving model...
Epoch 2: Loss 1.5356
Model improved. Saving model...
Epoch 3: Loss 0.5225
Model improved. Saving model...
Epoch 4: Loss 0.2050
Model improved. Saving model...
Epoch 5: Loss 0.0839
Model improved. Saving model...
Epoch 6: Loss 0.0527
Model improved. Saving model...
Epoch 7: Loss 0.0391
Model improved. Saving model...
Epoch 8: Loss 0.0327
Model improved. Saving model...
Epoch 9: Loss 0.0298
Model improved. Saving model...
Epoch 10: Loss 0.0274
Model improved. Saving model...
Fold 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Loss 3.0331
Model improved. Saving model...
Epoch 2: Loss 1.3798
Model improved. Saving model...
Epoch 3: Loss 0.5212
Model improved. Saving model...
Epoch 4: Loss 0.1924
Model improved. Saving model...
Epoch 5: Loss 0.0848
Model improved. Saving model...
Epoch 6: Loss 0.0505
Model improved. Saving model...
Epoch 7: Loss 0.0360
Model improved. Saving model...
Epoch 8: Loss 0.0300
Model improved. Saving model...
Epoch 9: Loss 0.0267
Model improved. Saving model...
Epoch 10: Loss 0.0254
Model improved. Saving model...
Fold 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Loss 3.0620
Model improved. Saving model...
Epoch 2: Loss 1.5594
Model improved. Saving model...
Epoch 3: Loss 0.5809
Model improved. Saving model...
Epoch 4: Loss 0.2331
Model improved. Saving model...
Epoch 5: Loss 0.1011
Model improved. Saving model...
Epoch 6: Loss 0.0561
Model improved. Saving model...
Epoch 7: Loss 0.0407
Model improved. Saving model...
Epoch 8: Loss 0.0345
Model improved. Saving model...
Epoch 9: Loss 0.0300
Model improved. Saving model...
Epoch 10: Loss 0.0276
Model improved. Saving model...
Fold 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Loss 3.1513
Model improved. Saving model...
Epoch 2: Loss 1.5710
Model improved. Saving model...
Epoch 3: Loss 0.5871
Model improved. Saving model...
Epoch 4: Loss 0.2347
Model improved. Saving model...
Epoch 5: Loss 0.1151
Model improved. Saving model...
Epoch 6: Loss 0.0641
Model improved. Saving model...
Epoch 7: Loss 0.0409
Model improved. Saving model...
Epoch 8: Loss 0.0334
Model improved. Saving model...
Epoch 9: Loss 0.0306
Model improved. Saving model...
Epoch 10: Loss 0.0288
Model improved. Saving model...
Fold 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Loss 2.8975
Model improved. Saving model...
Epoch 2: Loss 1.4885
Model improved. Saving model...
Epoch 3: Loss 0.5812
Model improved. Saving model...
Epoch 4: Loss 0.1908
Model improved. Saving model...
Epoch 5: Loss 0.0850
Model improved. Saving model...
Epoch 6: Loss 0.0527
Model improved. Saving model...
Epoch 7: Loss 0.0392
Model improved. Saving model...
Epoch 8: Loss 0.0321
Model improved. Saving model...
Epoch 9: Loss 0.0290
Model improved. Saving model...
Epoch 10: Loss 0.0271
Model improved. Saving model...


In [14]:
# Ortalamaları hesapla
fold_reports = []

for report in all_reports:
    df_report = pd.DataFrame(report).transpose()
    relevant_scores = df_report.loc[[str(i) for i in range(len(df['author_label'].unique()))] + ['macro avg', 'weighted avg'], ['precision', 'recall', 'f1-score']]
    fold_reports.append(relevant_scores)

average_report = pd.concat(fold_reports).groupby(level=0).mean()

# Ortalamaları yazdır ve kaydet
os.makedirs(f"{base_path}/performance_results", exist_ok=True)
print("Cross-validation average results:")
print(average_report)
average_report.to_csv(f"{base_path}/performance_results/cross_validation_mean_results.csv")
print(f"Results saved to {base_path}/performance_results/cross_validation_mean_results.csv")

# Metrikleri transpoze et ve göster
print("\nTransposed metrics (metrics as rows, classes as columns):")
df_report_t = average_report[['precision', 'recall', 'f1-score']].transpose()
print(df_report_t)

# Transpoze edilmiş metrikleri kaydet
df_report_t.to_csv(f"{base_path}/performance_results/cross_validation_transposed_results.csv")
print(f"Transposed results saved to {base_path}/performance_results/cross_validation_transposed_results.csv")

Cross-validation average results:
              precision    recall  f1-score
0              0.967532  0.973333  0.969787
1              0.852381  0.888039  0.864253
10             0.973214  0.981818  0.976617
11             0.879638  1.000000  0.932184
12             1.000000  1.000000  1.000000
13             1.000000  1.000000  1.000000
14             0.772143  0.901984  0.830187
15             0.946263  0.910000  0.921021
16             0.985714  0.957778  0.970302
17             0.934615  0.894017  0.909901
18             1.000000  0.984615  0.992000
19             0.966667  0.926667  0.943720
2              0.944444  0.798088  0.843815
20             0.933822  0.863095  0.894498
21             1.000000  0.971429  0.984615
22             0.886984  0.846429  0.857749
23             0.971429  1.000000  0.984615
24             0.950000  0.971429  0.960000
25             1.000000  1.000000  1.000000
26             0.950000  1.000000  0.973333
27             1.000000  0.953247  0.97509

          weighted avg  
precision      0.947855  
recall         0.941333  
f1-score       0.940442  