In [79]:
import pandas as pd
import numpy as np
from pathlib import Path
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import re
from tqdm.auto import tqdm

# Impor dari Hugging Face Transformers
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_scheduler, DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
from seqeval.metrics import f1_score, classification_report

In [80]:
# === Fase 1: Pengaturan dan Fungsi Bantuan ===

# Tentukan path ke direktori data Anda
# Ganti dengan path yang sesuai di komputer Anda
try:
    # Path untuk lingkungan lokal
    DATA_DIR = Path("D:/Jupyter_File/DML_Kelompok-B/Dataset")
    TRAIN_XML_DIR = DATA_DIR / "train/XML"
    TEST_XML_DIR = DATA_DIR / "test/XML"
    TRAIN_PDF_DIR = DATA_DIR / "train/PDF"
    TEST_PDF_DIR = DATA_DIR / "test/PDF"
    TRAIN_LABELS_CSV = DATA_DIR / "train_labels.csv"
    SAMPLE_SUBMISSION_CSV = DATA_DIR / "sample_submission.csv"
except Exception:
    # Path fallback jika di lingkungan lain (misalnya, Kaggle)
    DATA_DIR = Path("./")
    TRAIN_XML_DIR = DATA_DIR / "train/XML"
    TEST_XML_DIR = DATA_DIR / "test/XML"
    TRAIN_PDF_DIR = DATA_DIR / "train/PDF"
    TEST_PDF_DIR = DATA_DIR / "test/PDF"
    TRAIN_LABELS_CSV = DATA_DIR / "train_labels.csv"
    SAMPLE_SUBMISSION_CSV = DATA_DIR / "sample_submission.csv"

In [81]:
#  Definisikan label dan mappingnya
# O: Outside (bukan entitas)
# B-PRIMARY: Beginning of a primary reference
# I-PRIMARY: Inside of a primary reference
# B-SECONDARY: Beginning of a secondary reference
# I-SECONDARY: Inside of a secondary reference
id2label = {0: "O", 1: "B-PRIMARY", 2: "I-PRIMARY", 3: "B-SECONDARY", 4: "I-SECONDARY"}
label2id = {"O": 0, "B-PRIMARY": 1, "I-PRIMARY": 2, "B-SECONDARY": 3, "I-SECONDARY": 4}
LABELS = list(label2id.keys())

# Model yang direkomendasikan untuk teks ilmiah
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"

def clean_text(text):
    """Membersihkan teks dengan menghapus spasi berlebih."""
    text = re.sub('\s+', ' ', text).strip()
    return text

def extract_text(article_id, is_train=True):
    """
    Mengekstrak teks lengkap dari file XML atau PDF.
    Memprioritaskan XML karena lebih terstruktur.
    """
    text = ""
    if is_train:
        xml_path = TRAIN_XML_DIR / f"{article_id}.xml"
        pdf_path = TRAIN_PDF_DIR / f"{article_id}.pdf"
    else:
        xml_path = TEST_XML_DIR / f"{article_id}.xml"
        pdf_path = TEST_PDF_DIR / f"{article_id}.pdf"

    # Prioritas 1: Ekstrak dari XML
    if xml_path.exists():
        try:
            with open(xml_path, 'r', encoding='utf-8') as f:
                # Menggunakan parser 'lxml' secara eksplisit
                soup = BeautifulSoup(f, 'lxml')
            
            # Cari tag body, jika tidak ada, ambil semua teks
            body = soup.find('body')
            if body:
                text = body.get_text(separator=' ', strip=True)
            else:
                text = soup.get_text(separator=' ', strip=True)
            
            if len(text) > 50: # Cek apakah ekstraksi berhasil
                return clean_text(text)
        except Exception as e:
            print(f"Error reading XML {article_id}: {e}")
            text = "" # Reset teks jika gagal

    # Prioritas 2: Ekstrak dari PDF jika XML gagal atau tidak ada
    if pdf_path.exists():
        try:
            with fitz.open(pdf_path) as doc:
                text = "".join(page.get_text() for page in doc)
            return clean_text(text)
        except Exception as e:
            print(f"Error reading PDF {article_id}: {e}")
            return "" # Kembalikan string kosong jika gagal
            
    return ""

In [82]:
# === Fase 2: Prapemrosesan Data untuk NER ===

def find_sublist(main_list, sub_list):
    """Menemukan indeks awal dari sublist di dalam list utama."""
    for i in range(len(main_list) - len(sub_list) + 1):
        if main_list[i:i+len(sub_list)] == sub_list:
            return i
    return -1

def create_ner_dataset(labels_df):
    """
    Mengubah dataframe label menjadi format yang siap untuk pelatihan NER.
    Ini adalah langkah paling penting.
    """
    processed_data = []
    total_labels_in_csv = 0
    found_labels_in_text = 0
    
    # Kelompokkan berdasarkan artikel untuk memproses semua label per artikel
    for article_id, group in tqdm(labels_df.groupby('article_id')):
        full_text = extract_text(article_id, is_train=True)
        if not full_text:
            continue

        text_tokens = full_text.split()
        labels = ['O'] * len(text_tokens)
        
        for _, row in group.iterrows():
            dataset_id = str(row['dataset_id'])
            label_type = str(row['type'])

            if label_type.lower() not in ['primary', 'secondary']:
                continue
            
            total_labels_in_csv += 1
            
            # --- PERBAIKAN: Pencocokan berbasis token ---
            label_tokens = dataset_id.split()
            
            start_idx = find_sublist(text_tokens, label_tokens)
            
            if start_idx != -1:
                found_labels_in_text += 1
                end_idx = start_idx + len(label_tokens)
                
                # Terapkan label IOB
                labels[start_idx] = "B-" + label_type.upper()
                for i in range(start_idx + 1, end_idx):
                    labels[i] = "I-" + label_type.upper()

        processed_data.append({
            "id": article_id,
            "tokens": text_tokens,
            "ner_tags": [label2id[l] for l in labels]
        })
    
    print(f"\nRingkasan Pencocokan Label: Berhasil menemukan {found_labels_in_text} dari {total_labels_in_csv} label di dalam teks artikel.")
    return pd.DataFrame(processed_data)

In [83]:
# === Fase 3: Pelatihan Model ===

def tokenize_and_align_labels(examples, tokenizer):
    """
    Menyelaraskan label IOB dengan sub-token yang dihasilkan oleh tokenizer.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, max_length=512)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def train_model(train_dataset, val_dataset):
    """
    Fungsi untuk melatih model NER.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    
    train_tokenized = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
    val_tokenized = val_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    train_dataloader = DataLoader(
        train_tokenized.remove_columns(['tokens', 'ner_tags', 'id']), 
        batch_size=8, 
        shuffle=True,
        collate_fn=data_collator
    )
    val_dataloader = DataLoader(
        val_tokenized.remove_columns(['tokens', 'ner_tags', 'id']), 
        batch_size=8,
        collate_fn=data_collator
    )

    model = AutoModelForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT,
        id2label=id2label,
        label2id=label2id
    )
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=3e-5) # Sedikit menaikkan learning rate
    num_epochs = 3 # Menaikkan jumlah epoch
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    for epoch in range(num_epochs):
        model.train()
        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        all_predictions = []
        all_true_labels = []
        for batch in tqdm(val_dataloader, desc=f"Validation Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            
            predictions = outputs.logits.argmax(dim=-1)
            true_labels = batch["labels"]
            
            for pred, true in zip(predictions, true_labels):
                pred_labels = [id2label[p.item()] for p, l in zip(pred, true) if l != -100]
                true_labels_list = [id2label[l.item()] for l in true if l != -100]
                all_predictions.append(pred_labels)
                all_true_labels.append(true_labels_list)
        
        has_entities = any(label != 'O' for seq in all_true_labels for label in seq)

        if has_entities:
            f1 = f1_score(all_true_labels, all_predictions)
            print(f"Epoch {epoch+1}, Validation F1-Score: {f1:.4f}")
            print(classification_report(all_true_labels, all_predictions, zero_division=0))
        else:
            print(f"Epoch {epoch+1}: Tidak ada entitas berlabel yang ditemukan di set validasi. Melewatkan laporan klasifikasi.")
            f1 = 0.0 
            print(f"Epoch {epoch+1}, Validation F1-Score: {f1:.4f}")

    model.save_pretrained("./mdc_ner_model")
    tokenizer.save_pretrained("./mdc_ner_model")
    print("Model berhasil disimpan di folder './mdc_ner_model'")
    return model, tokenizer

In [84]:
# === Fase 4: Prediksi dan Pembuatan Submission ===

def predict_on_test_data(model, tokenizer):
    """
    Membuat prediksi pada data uji dan menghasilkan file submission.
    """
    test_ids = [p.stem for p in TEST_XML_DIR.glob("*.xml")]
    if not test_ids:
        test_ids = [p.stem for p in TEST_PDF_DIR.glob("*.pdf")]

    all_rows = []

    for article_id in tqdm(test_ids, desc="Predicting on test set"):
        text = extract_text(article_id, is_train=False)
        if not text:
            continue

        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, return_offsets_mapping=True)
        offset_mapping = inputs.pop("offset_mapping").squeeze().tolist()
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            logits = model(**inputs).logits
        
        predicted_token_class_ids = logits.argmax(-1).squeeze().tolist()
        
        current_entity_tokens = []
        current_label = "O"
        last_end_offset = 0

        for token_id, offset in zip(predicted_token_class_ids, offset_mapping):
            label = id2label[token_id]
            start, end = offset
            
            if start == end: continue

            if label.startswith("B-"):
                if current_entity_tokens:
                    entity_type = current_label.split('-')[1].capitalize()
                    all_rows.append({
                        "article_id": article_id,
                        "dataset_id": text[current_entity_tokens[0][1]:last_end_offset],
                        "type": entity_type
                    })
                current_entity_tokens = [(token_id, start, end)]
                current_label = label
                last_end_offset = end
            elif label.startswith("I-") and current_label != "O" and label.split('-')[1] == current_label.split('-')[1]:
                current_entity_tokens.append((token_id, start, end))
                last_end_offset = end
            else:
                if current_entity_tokens:
                    entity_type = current_label.split('-')[1].capitalize()
                    all_rows.append({
                        "article_id": article_id,
                        "dataset_id": text[current_entity_tokens[0][1]:last_end_offset],
                        "type": entity_type
                    })
                current_entity_tokens = []
                current_label = "O"

        if current_entity_tokens:
            entity_type = current_label.split('-')[1].capitalize()
            all_rows.append({
                "article_id": article_id,
                "dataset_id": text[current_entity_tokens[0][1]:last_end_offset],
                "type": entity_type
            })

    if not all_rows:
        submission_df = pd.DataFrame(columns=['row_id', 'article_id', 'dataset_id', 'type'])
    else:
        submission_df = pd.DataFrame(all_rows)
        submission_df.insert(0, 'row_id', range(len(submission_df)))

    submission_df = submission_df[['row_id', 'article_id', 'dataset_id', 'type']]
    submission_df.to_csv("submission.csv", index=False)
    print("File submission.csv berhasil dibuat.")

In [85]:
# === Main Execution ===
if __name__ == '__main__':
    # 1. Muat label training
    print("Memuat label training...")
    train_labels_df = pd.read_csv(TRAIN_LABELS_CSV)
    
    # Filter hanya baris yang memiliki label, bukan 'Missing'
    labeled_df = train_labels_df[train_labels_df['type'] != 'Missing'].copy()

    # 2. Buat dataset NER
    print("Membuat dataset NER dari file teks...")
    ner_data = create_ner_dataset(labeled_df)
    
    if ner_data.empty:
        print("Tidak ada data yang bisa diproses. Pastikan path ke dataset sudah benar.")
    else:
        # 3. Bagi data menjadi train dan validation
        print("Membagi data menjadi set training dan validasi...")
        dataset = Dataset.from_pandas(ner_data)
        train_val_split = dataset.train_test_split(test_size=0.2, seed=42)
        
        # Buat DatasetDict
        mdc_dataset = DatasetDict({
            'train': train_val_split['train'],
            'validation': train_val_split['test']
        })
        print("Struktur dataset:")
        print(mdc_dataset)

        # 4. Latih model
        print("\nMemulai pelatihan model...")
        # Catatan: Langkah ini bisa memakan waktu lama dan membutuhkan GPU untuk performa terbaik.
        trained_model, tokenizer = train_model(mdc_dataset['train'], mdc_dataset['validation'])

        # 5. Buat prediksi
        print("\nMembuat prediksi pada data uji...")
        predict_on_test_data(trained_model, tokenizer)

Memuat label training...
Membuat dataset NER dari file teks...


  0%|          | 0/214 [00:00<?, ?it/s]

Error reading XML 10.1002_2017jc013030: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_ece3.4466: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_ece3.5260: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_ece3.6144: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_ece3.6303: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_ece3.9627: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_ecs2.4619: Couldn't find a tree builder with the features you requested: lxml. Do you need to i



Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Epoch 1:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Epoch 1:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 1: Tidak ada entitas berlabel yang ditemukan di set validasi. Melewatkan laporan klasifikasi.
Epoch 1, Validation F1-Score: 0.0000


Training Epoch 2:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Epoch 2:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 2: Tidak ada entitas berlabel yang ditemukan di set validasi. Melewatkan laporan klasifikasi.
Epoch 2, Validation F1-Score: 0.0000


Training Epoch 3:   0%|          | 0/22 [00:00<?, ?it/s]

Validation Epoch 3:   0%|          | 0/6 [00:00<?, ?it/s]

Epoch 3: Tidak ada entitas berlabel yang ditemukan di set validasi. Melewatkan laporan klasifikasi.
Epoch 3, Validation F1-Score: 0.0000
Model berhasil disimpan di folder './mdc_ner_model'

Membuat prediksi pada data uji...


Predicting on test set:   0%|          | 0/25 [00:00<?, ?it/s]

Error reading XML 10.1002_2017jc013030: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_anie.201916483: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_anie.202005531: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_anie.202007717: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_chem.201902131: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_chem.201903120: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Error reading XML 10.1002_chem.202000235: Couldn't find a tree builder with the features you req