<a href="https://colab.research.google.com/github/09eesx/MindWatch_PsyRiskAnalyzer/blob/main/amodel_train_berts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets scikit-learn imbalanced-learn --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from tqdm import tqdm
import torch.nn as nn
from datetime import datetime

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load the data and encode the labels as integer ids
df = pd.read_csv("/content/drive/MyDrive/data/train_cleaned_augamented_cleaned.csv")
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])

# Apply oversampling.
# Seeded so that repeated runs duplicate the same rows, in line with the
# seeded K-fold split used further down in this script.
# NOTE(review): this frame is oversampled as a whole; splitting it into
# train/validation afterwards places duplicates of the same row on both
# sides and inflates validation scores.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df[["text"]], df["label_id"])
resampled_df = pd.DataFrame({"text": X_resampled["text"], "label_id": y_resampled})

# Prepare the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Number of distinct classes; sizes the classification head of the model
num_labels = df["label_id"].nunique()

# Tokenization function
def tokenize(example):
    """Tokenize a single example and attach its integer class label.

    Args:
        example: Mapping with a "text" entry (the raw text) and a
            "label_id" entry (the encoded class id).

    Returns:
        The tokenizer output for the text, padded/truncated to 128 tokens,
        with an extra "labels" entry holding the class id as a plain int.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    encoded["labels"] = int(example["label_id"])
    return encoded

# Stratified K-Fold configuration: shuffled, fixed seed for repeatable folds
k_folds = 2
skf = StratifiedKFold(
    n_splits=k_folds,
    shuffle=True,
    random_state=42,
)

# Accumulators for the per-fold scores (one entry appended per fold)
fold_accuracies, fold_losses = [], []
fold_precisions, fold_recalls, fold_f1s = [], [], []

# Stratified K-Fold training
# Single source of truth for the epoch count: it drives both the training
# loop and the length of the learning-rate schedule, so the cosine decay
# actually spans the steps that are run.
num_epochs = 2

# Split the original (not yet oversampled) rows. Oversampling duplicates
# minority rows, so splitting after oversampling would put copies of the same
# text into both the training and the validation part of a fold and inflate
# the validation metrics.
for fold_idx, (train_idx, val_idx) in enumerate(skf.split(df["text"], df["label_id"])):
    print(f"\n🔵 Fold {fold_idx+1} Başlıyor...")

    # Fold-specific train/validation rows
    fold_train = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    # Oversample the training part only; validation keeps the real class mix
    fold_sampler = RandomOverSampler(random_state=42)
    X_fold, y_fold = fold_sampler.fit_resample(fold_train[["text"]], fold_train["label_id"])
    train_df = pd.DataFrame({"text": X_fold["text"], "label_id": y_fold})

    # Fold-specific class weights (computed on the oversampled training part,
    # so they are expected to be close to 1.0 for every class)
    class_weights = compute_class_weight(class_weight="balanced",
                                         classes=np.unique(train_df["label_id"]),
                                         y=train_df["label_id"])
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    # Convert to Hugging Face Dataset objects
    train_dataset = Dataset.from_pandas(train_df[["text", "label_id"]])
    val_dataset = Dataset.from_pandas(val_df[["text", "label_id"]])

    tokenized_train = train_dataset.map(tokenize)
    tokenized_val = val_dataset.map(tokenize)

    # Expose only the tensors the model consumes
    tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    tokenized_val.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    # DataLoaders
    train_dataloader = DataLoader(tokenized_train, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(tokenized_val, batch_size=16)

    # Start every fold from a fresh pretrained model
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_labels
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    # One scheduler step is taken per batch, so the schedule length is
    # batches-per-epoch times the number of epochs really trained.
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
    )

    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # Early-stopping / best-epoch bookkeeping.
    # NOTE: early stopping only becomes effective once num_epochs exceeds
    # patience.
    best_acc = 0
    best_precision = 0
    best_recall = 0
    best_loss = float('inf')
    best_f1 = 0
    best_state = None  # CPU copy of the weights from the best-F1 epoch
    patience = 2
    patience_counter = 0

    # Training
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        loop = tqdm(train_dataloader, desc=f"Fold {fold_idx+1} Epoch {epoch+1}")

        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Labels are removed from the batch so the weighted loss below is
            # used instead of the model's built-in unweighted loss.
            labels = batch.pop("labels")

            outputs = model(**batch)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        # Validation
        model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop("labels")
                outputs = model(**batch)
                preds = torch.argmax(outputs.logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        acc = accuracy_score(all_labels, all_preds)

        print(f"✅ Fold {fold_idx+1} Epoch {epoch+1} - Accuracy: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

        # Keep the best epoch: record its metrics and snapshot its weights
        if f1 > best_f1:
            best_f1 = f1
            best_acc = acc
            best_precision = precision
            best_recall = recall
            best_loss = total_loss / len(train_dataloader)
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"⏹️ Fold {fold_idx+1} için early stopping tetiklendi! Epoch: {epoch+1}")
            break

    # Leave the fold's best-scoring weights in `model`, so the weights match
    # the metrics recorded for this fold.
    if best_state is not None:
        model.load_state_dict(best_state)

    # Store the BEST results of this fold
    fold_accuracies.append(best_acc)
    fold_losses.append(best_loss)
    fold_precisions.append(best_precision)
    fold_recalls.append(best_recall)
    fold_f1s.append(best_f1)

# Print the results once training has finished
print("\n🎯 Stratified K-Fold Sonuçları:")

# Walk the per-fold score lists in lockstep; fold numbers are shown 1-based
per_fold_scores = zip(fold_accuracies, fold_precisions, fold_recalls, fold_f1s, fold_losses)
for fold_no, (fold_acc, fold_prec, fold_rec, fold_f1, fold_loss) in enumerate(per_fold_scores, start=1):
    print(f"Fold {fold_no} - Accuracy: {fold_acc:.4f} | Precision: {fold_prec:.4f} | Recall: {fold_rec:.4f} | F1: {fold_f1:.4f} | Training Loss: {fold_loss:.4f}")

# Averages across folds
mean_accuracy = np.mean(fold_accuracies)
mean_precision = np.mean(fold_precisions)
mean_recall = np.mean(fold_recalls)
mean_f1 = np.mean(fold_f1s)
mean_train_loss = np.mean(fold_losses)

print("\n📊 Ortalama Sonuçlar:")
print(f"Average Accuracy: {mean_accuracy:.4f}")
print(f"Average Precision: {mean_precision:.4f}")
print(f"Average Recall: {mean_recall:.4f}")
print(f"Average F1: {mean_f1:.4f}")
print(f"Average Training Loss: {mean_train_loss:.4f}")

# 11. Save the model
# NOTE: `model` is whatever the last fold left behind; no selection across
# folds happens here.
# Store the class names in the config so predictions from the reloaded
# checkpoint can be mapped back to labels without the fitted LabelEncoder.
model.config.id2label = {int(idx): str(name) for idx, name in enumerate(le.classes_)}
model.config.label2id = {str(name): int(idx) for idx, name in enumerate(le.classes_)}

# Timestamped directory so earlier checkpoints are not overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
save_path = f"/content/drive/MyDrive/data/bert_model_{timestamp}"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ Model başarıyla kaydedildi: {save_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


🔵 Fold 1 Başlıyor...


Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 4375/4375 [12:32<00:00,  5.81it/s, loss=0.444]


✅ Fold 1 Epoch 1 - Accuracy: 0.7852 | Precision: 0.7857 | Recall: 0.7852 | F1: 0.7815


Fold 1 Epoch 2: 100%|██████████| 4375/4375 [12:31<00:00,  5.82it/s, loss=0.39]


✅ Fold 1 Epoch 2 - Accuracy: 0.8168 | Precision: 0.8202 | Recall: 0.8168 | F1: 0.8142

🔵 Fold 2 Başlıyor...


Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 2 Epoch 1: 100%|██████████| 4375/4375 [12:31<00:00,  5.82it/s, loss=0.47]


✅ Fold 2 Epoch 1 - Accuracy: 0.7747 | Precision: 0.7810 | Recall: 0.7747 | F1: 0.7661


Fold 2 Epoch 2: 100%|██████████| 4375/4375 [12:31<00:00,  5.82it/s, loss=0.434]


✅ Fold 2 Epoch 2 - Accuracy: 0.8165 | Precision: 0.8207 | Recall: 0.8165 | F1: 0.8160

🎯 Stratified K-Fold Sonuçları:
Fold 1 - Accuracy: 0.8168 | Precision: 0.8202 | Recall: 0.8168 | F1: 0.8142 | Training Loss: 0.4617
Fold 2 - Accuracy: 0.8165 | Precision: 0.8207 | Recall: 0.8165 | F1: 0.8160 | Training Loss: 0.4619

📊 Ortalama Sonuçlar:
Average Accuracy: 0.8166
Average Precision: 0.8205
Average Recall: 0.8166
Average F1: 0.8151
Average Training Loss: 0.4618
✅ Model başarıyla kaydedildi: /content/drive/MyDrive/data/bert_model_20250428_0728
