## Import Libraries

In [2]:
from PyPDF2 import PdfReader

import os
import json
import spacy

## Single CV Simulation

To fine-tune a BERT model for NER, the training data must be provided in CoNLL format.

This section walks through the BIO tagging flow for a single CV, from extracting the PDF text to saving the BIO tags in **CoNLL** format.

1. Extract the text of every CV PDF into a .txt file.
2. Tag the entities manually with [Doccano](https://github.com/doccano/doccano).
3. Export the annotations as JSONL and convert them into BIO-tagged CoNLL format.

In [None]:
# Dump the text of every page of the example CV into one UTF-8 text file,
# with a newline appended after each page.
reader = PdfReader("cv-example.pdf")
page_texts = (page.extract_text() + "\n" for page in reader.pages)
with open("output-example.txt", "w", encoding="utf-8") as f:
    f.writelines(page_texts)

In [3]:
# Parse the JSONL file one line at a time. `data` is reassigned on every
# line, so after the loop it holds only the last record of the file.
with open("tag-example.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)

In [4]:
# Display the record parsed from the last line of tag-example.jsonl.
data

{'id': 1,
 'text': 'Phone\nEmail\n082367903648\nderrickvericho@gmail.com Project LinkLinkedIn\nhttps://github.com/DerrickVericho\nhttps://www.linkedin.com/in/derrick-\nvericho-268563225/Derrick Vericho\nUNDERGRADUATE DATA SCIENCE\nSTUDENT\nI’m a 5th-semester undergraduate student with a strong passion for data science and artificial intelligence, particularly\nin building solutions that connect technology with real-world needs. My interest lies in combining data-driven insights\nwith the financial industry, exploring how AI can create smarter and more impactful services. Through various projects, I\nhave developed skills in data analysis, machine learning, and system development, while continuously seeking\nchallenges that push me to grow. ABOUT ME\nSecond author & Corresponding author on published paper “Vision Transformer and CNNs in Kidney Stone\nClassification: A Comparative Study” acceptance on ICCSCIACTIVITIES\nActivist Data Science Club: Research DivisionSKILL\nRelevant Course: 

In [6]:
# Load the spaCy pipeline that jsonl_to_conll uses to split the text into tokens.
nlp = spacy.load("en_core_web_sm")

def jsonl_to_conll(jsonl_file, output_file):
    """Convert a JSONL annotation export into a single BIO-tagged CoNLL file.

    Every record is tokenized with the module-level spaCy pipeline ``nlp`` and
    written as one ``token tag`` pair per line, followed by a blank line that
    separates it from the next record.

    Args:
        jsonl_file: Path to the JSONL file. Each non-blank line is a JSON
            object with a ``"text"`` string and an optional ``"label"`` list
            of ``[start_offset, end_offset, "LABEL"]`` triples (character
            offsets into ``"text"``).
        output_file: Path of the CoNLL file to write (overwritten if present).

    Raises:
        KeyError: If a record has no ``"text"`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    with open(output_file, "w", encoding="utf-8") as out:
        for entry in data:
            text = entry["text"]
            labels = entry.get("label") or []  # tolerate a missing or null label list

            doc = nlp(text)
            tags = ["O"] * len(doc)

            for start, end, label in labels:
                # The first tagged token of a span always gets "B-", even when
                # the annotated start offset does not coincide with a token
                # start; otherwise the entity would begin with an "I-" tag.
                inside = False
                for token_id, token in enumerate(doc):
                    if token.idx >= end:
                        break  # tokens come in document order; nothing further can match
                    if token.idx < start or token.is_space:
                        continue
                    tags[token_id] = ("I-" if inside else "B-") + label
                    inside = True

            for token, tag in zip(doc, tags):
                # Whitespace tokens (such as the "\n" runs in extracted CV
                # text) would be written as a blank line plus a row with an
                # empty token column, which breaks the CoNLL layout.
                if token.is_space:
                    continue
                out.write(f"{token.text} {tag}\n")
            out.write("\n")

# Convert the single tagged example into one CoNLL file.
jsonl_to_conll("tag-example.jsonl", "tag-example.conll")

## CV PDF Tagging Processing

### Extract PDF into txt format

In [11]:
input_folder = "CV_pdf"
output_folder = "CV_txt"

# Create the output folder up front so open() below cannot fail on a missing directory.
os.makedirs(output_folder, exist_ok=True)

# Loop over every file in the input folder (sorted for a reproducible order).
for file_name in sorted(os.listdir(input_folder)):
    base_name, extension = os.path.splitext(file_name)
    # Compare case-insensitively so files such as "CV.PDF" are not skipped.
    if extension.lower() == ".pdf":
        pdf_path = os.path.join(input_folder, file_name)
        # Swap only the extension; str.replace would also rewrite any ".pdf"
        # that appears earlier in the file name.
        txt_path = os.path.join(output_folder, base_name + ".txt")

        # Read the PDF and write the text of each page, one page per block.
        reader = PdfReader(pdf_path)
        with open(txt_path, "w", encoding="utf-8") as f:
            for page in reader.pages:
                f.write(page.extract_text() + "\n")

### Extract tagging jsonl BIO format into CoNLL

Doccano can export the labels in two different shapes, so there is one converter cell for each:
- Array of arrays -> `"label": [[start_offset, end_offset, "LABEL"]]`
- Self-descriptive format (list of dictionaries) -> `"entities": [{"id": ..., "label": "LABEL", "start_offset": ..., "end_offset": ...}]`

In [25]:
# Number used for the first output file (CV_<id>.conll).
# The jsonl_to_conll definitions below capture this value as the default of
# their `current_id` parameter at definition time, so re-run the defining cell
# after changing it.
current_id = 1

In [None]:
# Use this cell when the labels are exported as an array of arrays:
# [start_offset, end_offset, "LABEL"].
# Set `current_id` to your preferred start number before running it.
nlp = spacy.load("en_core_web_sm")

def jsonl_to_conll(jsonl_file, output_dir, current_id=current_id):
    """Convert a JSONL export with array-of-arrays labels into CoNLL files.

    Every record is tokenized with the module-level spaCy pipeline ``nlp`` and
    written to its own ``CV_<id>.conll`` file inside ``output_dir`` as one
    ``token tag`` pair per line, followed by a blank line.

    Args:
        jsonl_file: Path to the JSONL file. Each non-blank line is a JSON
            object with a ``"text"`` string and an optional ``"label"`` list
            of ``[start_offset, end_offset, "LABEL"]`` triples (character
            offsets into ``"text"``).
        output_dir: Directory that receives the output files; created if it
            does not exist.
        current_id: Number used for the first output file; it is incremented
            by one for each record. The default is the value the global
            ``current_id`` had when this function was defined.

    Raises:
        KeyError: If a record has no ``"text"`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for entry in data:
        text = entry["text"]
        labels = entry.get("label") or []  # tolerate a missing or null label list

        doc = nlp(text)
        tags = ["O"] * len(doc)

        for start, end, label in labels:
            # The first tagged token of a span always gets "B-", even when the
            # annotated start offset does not coincide with a token start;
            # otherwise the entity would begin with an "I-" tag.
            inside = False
            for token_id, token in enumerate(doc):
                if token.idx >= end:
                    break  # tokens come in document order; nothing further can match
                if token.idx < start or token.is_space:
                    continue
                tags[token_id] = ("I-" if inside else "B-") + label
                inside = True

        output_file = os.path.join(output_dir, f"CV_{current_id}.conll")
        current_id += 1

        # Write mode (not append): re-running the conversion replaces the file
        # instead of stacking a duplicate copy of the record onto it.
        with open(output_file, "w", encoding="utf-8") as out:
            for token, tag in zip(doc, tags):
                # Whitespace tokens (such as the "\n" runs in extracted CV
                # text) would be written as a blank line plus a row with an
                # empty token column, which breaks the CoNLL layout.
                if token.is_space:
                    continue
                out.write(f"{token.text} {tag}\n")
            out.write("\n")

# Convert the export; one CV_<id>.conll file per JSONL record is written to tagged_conll/.
jsonl_to_conll("tagged_jsonl/all.jsonl", "tagged_conll")

In [59]:
# Use this cell when the labels are exported in the self-descriptive format
# (an "entities" list of dictionaries).
# Set `current_id` to your preferred start number before running it.
nlp = spacy.load("en_core_web_sm")

def jsonl_to_conll(jsonl_file, output_dir, current_id=current_id):
    """Convert a JSONL export with dictionary-style entities into CoNLL files.

    Every record is tokenized with the module-level spaCy pipeline ``nlp`` and
    written to its own ``CV_<id>.conll`` file inside ``output_dir`` as one
    ``token tag`` pair per line, followed by a blank line.

    Args:
        jsonl_file: Path to the JSONL file. Each non-blank line is a JSON
            object with a ``"text"`` string and an optional ``"entities"``
            list of dictionaries carrying ``"start_offset"``,
            ``"end_offset"`` and ``"label"`` keys (character offsets into
            ``"text"``).
        output_dir: Directory that receives the output files; created if it
            does not exist.
        current_id: Number used for the first output file; it is incremented
            by one for each record. The default is the value the global
            ``current_id`` had when this function was defined.

    Raises:
        KeyError: If a record has no ``"text"`` key or an entity lacks one of
            the three keys listed above.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for entry in data:
        text = entry["text"]
        entities = entry.get("entities") or []  # tolerate a missing or null entity list

        doc = nlp(text)
        tags = ["O"] * len(doc)

        for ent in entities:
            start, end, label = ent["start_offset"], ent["end_offset"], ent["label"]
            # The first tagged token of a span always gets "B-", even when the
            # annotated start offset does not coincide with a token start;
            # otherwise the entity would begin with an "I-" tag.
            inside = False
            for token_id, token in enumerate(doc):
                if token.idx >= end:
                    break  # tokens come in document order; nothing further can match
                if token.idx < start or token.is_space:
                    continue
                tags[token_id] = ("I-" if inside else "B-") + label
                inside = True

        output_file = os.path.join(output_dir, f"CV_{current_id}.conll")
        current_id += 1

        # Write mode (not append): re-running the conversion replaces the file
        # instead of stacking a duplicate copy of the record onto it.
        with open(output_file, "w", encoding="utf-8") as out:
            for token, tag in zip(doc, tags):
                # Whitespace tokens (such as the "\n" runs in extracted CV
                # text) would be written as a blank line plus a row with an
                # empty token column, which breaks the CoNLL layout.
                if token.is_space:
                    continue
                out.write(f"{token.text} {tag}\n")
            out.write("\n")

# Convert the export; one CV_<id>.conll file per JSONL record is written to tagged_conll/.
jsonl_to_conll("tagged_jsonl/all.jsonl", "tagged_conll")