In [1]:
import os
# 1) Uyarıları kapatmak için
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import os
from lib.database.database_connector import DatabaseConnector
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertModel, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1) Ayarlar
MODEL_NAME    = "google/bert_uncased_L-2_H-128_A-2"  # Çok küçük BERT
MAX_LEN       = 512
BATCH_SIZE    = 1
NUM_EPOCHS    = 3
DEVICE        = torch.device("cpu")#torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# 2) Tokenizer ve Tiny BERT
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
tinybert  = BertModel.from_pretrained(MODEL_NAME).to(DEVICE)
tinybert.gradient_checkpointing_enable()  # Bellek tasarrufu

In [4]:
for name, param in tinybert.named_parameters():
    print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [5]:
 # 3) Veri tabanından çek
db          = DatabaseConnector("./data/database.db")
cursor      = db.cursor
train_data  = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date < '2020-01-01';"
).fetchall()
test_data   = cursor.execute(
    "SELECT sentences, label FROM embeddings WHERE filing_date >= '2020-01-01';"
).fetchall()

In [6]:
# 4) Dataset: rapor başına tüm chunk’lar
class LongTextDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512):
        self.samples = []
        for sents, label in tqdm(data, desc="Veri hazırlanıyor", unit="örnek"):
            text = " ".join(sents)
            enc  = tokenizer(
                text,
                truncation=True,
                max_length=max_len,
                return_overflowing_tokens=True,
                padding=False
            )
            self.samples.append({
                "input_ids": enc.input_ids,
                "attention_mask": enc.attention_mask,
                "label": label
            })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [7]:
# 5) collate_fn
def collate_fn(batch):
    B = len(batch)
    C = max(len(x["input_ids"]) for x in batch)
    S = MAX_LEN

    input_ids      = torch.full((B, C, S), tokenizer.pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((B, C, S), dtype=torch.long)
    labels         = torch.tensor([x["label"] for x in batch], dtype=torch.long)

    for i, sample in enumerate(batch):
        for j, (ids, mask) in enumerate(zip(sample["input_ids"], sample["attention_mask"])):
            L = len(ids)
            input_ids[i, j, :L]      = torch.tensor(ids)
            attention_mask[i, j, :L] = torch.tensor(mask)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [8]:

# 6) DataLoader
train_ds      = LongTextDataset(train_data, tokenizer, MAX_LEN)
test_ds       = LongTextDataset(test_data,  tokenizer, MAX_LEN)
train_loader  = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                           collate_fn=collate_fn, num_workers=0, pin_memory=True)
test_loader   = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False,
                           collate_fn=collate_fn, num_workers=0, pin_memory=True)

Veri hazırlanıyor: 100%|██████████| 7878/7878 [10:49<00:00, 12.13örnek/s]
Veri hazırlanıyor: 100%|██████████| 1027/1027 [01:36<00:00, 10.65örnek/s]


In [9]:
# 7) Model: Hierarchical DistilBERT + LSTM + FC
class HierarchicalDistilBERT(nn.Module):
    def __init__(self, distilbert, lstm_hidden=128, num_labels=2):
        super().__init__()
        self.distilbert = distilbert
        self.lstm       = nn.LSTM(
            input_size=distilbert.config.hidden_size,
            hidden_size=lstm_hidden,
            batch_first=True,
            bidirectional=True
        )
        self.classifier = nn.Linear(2 * lstm_hidden, num_labels)

    def forward(self, input_ids, attention_mask):
        B, C, S = input_ids.size()
        flat_ids  = input_ids.view(B*C, S)
        flat_mask = attention_mask.view(B*C, S)

        outputs = self.distilbert(input_ids=flat_ids, attention_mask=flat_mask)
        pooled  = outputs.last_hidden_state[:, 0, :]    # [CLS] token embedding

        chunk_embs = pooled.view(B, C, -1)              # [B, C, H]
        lstm_out, _ = self.lstm(chunk_embs)             # [B, C, 2*H_lstm]
        doc_emb     = lstm_out[:, -1, :]                # son zaman adımı

        return self.classifier(doc_emb)                 # [B, num_labels]

In [10]:
model     = HierarchicalDistilBERT(tinybert).to(DEVICE)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()



In [11]:
# 8) Eğitim Döngüsü
for epoch in range(NUM_EPOCHS):
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}", unit="batch")
    for batch in loop:
        ids    = batch["input_ids"].to(DEVICE)
        mask   = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        logits = model(ids, mask)
        loss   = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=f"{loss.item():.4f}")


Epoch 1/3: 100%|██████████| 7878/7878 [9:08:00<00:00,  4.17s/batch, loss=1.6031]   
Epoch 2/3: 100%|██████████| 7878/7878 [8:21:09<00:00,  3.82s/batch, loss=0.1846]   
Epoch 3/3: 100%|██████████| 7878/7878 [7:34:37<00:00,  3.46s/batch, loss=0.1629]   


In [12]:
# 9) Test ve Metrikler
model.eval()
all_labels, all_preds = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test", unit="batch"):
        ids    = batch["input_ids"].to(DEVICE)
        mask   = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        logits = model(ids, mask)
        preds  = torch.argmax(logits, dim=1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

Test: 100%|██████████| 1027/1027 [05:36<00:00,  3.05batch/s]


In [13]:
acc, prec, rec, f1 = (
    accuracy_score(all_labels, all_preds),
    precision_score(all_labels, all_preds),
    recall_score(all_labels, all_preds),
    f1_score(all_labels, all_preds),
)
tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print(f"F1:        {f1:.3f}")
print(f"TP:        {tp}")
print(f"TN:        {tn}")
print(f"FP:        {fp}")
print(f"FN:        {fn}")

Accuracy:  0.853
Precision: 0.000
Recall:    0.000
F1:        0.000
TP:        0
TN:        876
FP:        0
FN:        151
