# Import Libraries

In [24]:
import pdfplumber
import re
import os
import pandas as pd
import logging
import pytesseract
from pdf2image import convert_from_path
import datetime

# Raise the "pdfminer" logger threshold to ERROR so that its lower-severity
# messages (warnings, info, debug) are not emitted while PDFs are parsed.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# Input Folder/Files

In [25]:
# Folder that is scanned for applicant CV files; only names ending in ".pdf"
# are picked up by the processing loop.
FOLDER_PATH = r"C:/Users/LENOVO/Downloads/Kumpulan_CV"
# File name of the Excel summary; it is written inside FOLDER_PATH.
OUTPUT_FILE = "Rekap_Data_Pelamar.xlsx"

# Keyword Database

In [26]:
# Skill names looked up (case-insensitively) in the text of every CV; the
# ones that are found end up in the "Skills" column.
SKILL_DB = [
    "Python", "Java", "C++", "Go", "JavaScript", "TypeScript", "PHP", "SQL",
    "Machine Learning", "Deep Learning", "Data Analysis",
    "TensorFlow", "PyTorch", "Pandas", "NumPy", "Scikit-learn",
    "NLP", "OpenCV", "Android", "Flutter",
    "React", "Vue", "Angular", "Node.js",
    "Django", "Flask", "FastAPI",
    "Git", "Docker", "Kubernetes",
    "AWS", "GCP", "Azure",
    "Excel", "Tableau"
]

# Upper-case markers (English and Indonesian) that flag a text line as naming
# an education institution. They are compared against the upper-cased line.
UNI_KEYWORDS = [
    "UNIVERSITY", "UNIVERSITAS", "INSTITUTE", "INSTITUT",
    "POLITEKNIK", "POLYTECHNIC", "ACADEMY", "AKADEMI",
    "SCHOOL", "COLLEGE", "SEKOLAH"
]

# Upper-case words that disqualify a line from being treated as a campus,
# even when it contains one of UNI_KEYWORDS (mostly job-description verbs
# and section titles).
BLACKLIST_WORDS = [
    "DELIVERED", "MANAGED", "DEVELOPED", "CREATED",
    "EXPERIENCE", "WORK", "PROJECT", "ACTIVITY",
    "SUPPORTING", "BUILT", "USING", "RESPONSIBLE",
    "SKILLS", "SUMMARY", "COLLABORATED", "LED",
    "DESIGNED", "ACHIEVED", "AWARDED", "PRODUCTION"
]


# Logic for Extracting the Name and Education

In [None]:
def extract_name_from_header(page):
    """Guess the applicant's name from the largest text in the page header.

    The top 25% of the page is cropped, and every glyph whose font size is
    within 0.5 of the largest visible size found there is taken as part of
    the name.

    Args:
        page: A page-like object exposing ``width``, ``height`` and
            ``within_bbox(bbox)``. The cropped region must expose ``chars``,
            a list of dicts with at least ``"text"`` and ``"size"`` keys;
            ``"top"`` is used for line-break detection when present.

    Returns:
        The detected name with runs of whitespace collapsed to one space, or
        ``"Unknown"`` when the header has no visible text or anything goes
        wrong while reading the page (deliberately best-effort).
    """
    try:
        header = page.within_bbox((0, 0, page.width, page.height * 0.25))
        glyphs = header.chars

        # Only visible glyphs decide what "the largest font" is, so a stray
        # oversized blank cannot win.
        visible_sizes = [g["size"] for g in glyphs if g["text"].strip()]
        if not visible_sizes:
            return "Unknown"
        largest = max(visible_sizes)

        pieces = []
        last_top = None
        for g in glyphs:
            if abs(g["size"] - largest) >= 0.5:
                continue

            # A name wrapped over two lines has no space glyph at the break;
            # a clear jump in vertical position is treated as a separator.
            top = g.get("top")
            if (
                last_top is not None
                and top is not None
                and abs(top - last_top) > largest * 0.5
            ):
                pieces.append(" ")

            # Whitespace glyphs are kept on purpose: dropping them glued
            # multi-word names together ("John Doe" -> "JohnDoe").
            pieces.append(g["text"])
            if top is not None:
                last_top = top

        name = re.sub(r"\s+", " ", "".join(pieces)).strip()
        return name or "Unknown"

    except Exception:
        # Best-effort: a page that cannot be analysed must not abort the run.
        return "Unknown"


In [28]:
def extract_education(text, uni_keywords=None, blacklist_words=None):
    """Find campus names and the GPA (IPK) in CV text.

    Campus detection is strict on purpose so that job-description sentences
    which merely mention a university are not reported as education entries.

    Args:
        text: Full text of the CV; lines are separated by ``"\\n"``.
        uni_keywords: Words that mark a line as naming an institution.
            Defaults to the module-level ``UNI_KEYWORDS``.
        blacklist_words: Words that disqualify a line when one of them
            starts a word in it. Defaults to the module-level
            ``BLACKLIST_WORDS``.

    Returns:
        A dict with two keys:
        ``"Kampus"`` - the distinct campus entries joined by ``"; "`` in order
        of appearance, or ``"-"`` when none was found;
        ``"IPK"`` - the GPA as a string (e.g. ``"3.75"``), or ``"-"`` when no
        value between 2.0 and 4.0 was found.
    """
    if uni_keywords is None:
        uni_keywords = UNI_KEYWORDS
    if blacklist_words is None:
        blacklist_words = BLACKLIST_WORDS
    uni_keywords = [k.upper() for k in uni_keywords]
    text = text or ""

    def names_institution(fragment):
        upper_fragment = fragment.upper()
        return any(k in upper_fragment for k in uni_keywords)

    # Blacklist words are matched at the START of a word ("LED", "LEDGER",
    # "WORKED") instead of anywhere in the line; plain substring matching
    # rejected valid campuses such as "College of Knowledge Engineering"
    # ("LED") or "School of Network Engineering" ("WORK").
    blacklist_re = None
    if blacklist_words:
        alternatives = "|".join(re.escape(w.upper()) for w in blacklist_words)
        blacklist_re = re.compile(r"\b(?:" + alternatives + r")")

    # --- GPA -------------------------------------------------------------
    # Both "3.75" and the Indonesian decimal comma "3,75" are accepted.
    labelled = re.search(r"(?:GPA|IPK)\s*:?\s*(\d[.,]\d{1,2})", text, re.I)
    # "x.xx / 4", "/ 4.0" or "/ 4.00"; the lookarounds keep the pattern from
    # biting into longer numbers such as "13.75/4.00" or "3.5/4.5".
    ratio = re.search(
        r"(?<!\d)(\d[.,]\d{1,2})\s*/\s*4(?:[.,]0{1,2})?(?![.,]?\d)",
        text,
    )

    ipk = "-"
    # The labelled form wins; the ratio form is the fallback, also when the
    # labelled number is outside the plausible 2.0-4.0 range.
    for match in (labelled, ratio):
        if match is None:
            continue
        value = float(match.group(1).replace(",", "."))
        if 2.0 <= value <= 4.0:
            ipk = str(value)
            break

    # --- Campus ----------------------------------------------------------
    kampus_list = []
    for line in text.split("\n"):
        clean = line.strip()

        # Very short or very long lines are headings/noise or prose.
        if not (4 <= len(clean) <= 60):
            continue
        if not names_institution(clean):
            continue
        if blacklist_re is not None and blacklist_re.search(clean.upper()):
            continue

        kampus, jurusan = clean, ""

        if "|" in clean:
            # "Campus | Major" or "Major | Campus": the side containing an
            # institution keyword is the campus.
            left, right = map(str.strip, clean.split("|", 1))
            if names_institution(left):
                kampus, jurusan = left, right
            else:
                kampus, jurusan = right, left
        elif " - " in clean:
            # "Campus - <anything>": keep only the campus part. When the
            # keyword is on the right-hand side the whole line is kept.
            left = clean.split(" - ", 1)[0].strip()
            if names_institution(left):
                kampus = left

        entry = f"{kampus} ({jurusan})" if jurusan and len(jurusan) < 40 else kampus
        if entry not in kampus_list:
            kampus_list.append(entry)

    return {
        "Kampus": "; ".join(kampus_list) if kampus_list else "-",
        "IPK": ipk
    }


# Process CVs and Export to Excel File

In [29]:
# One dict per successfully processed CV; becomes one spreadsheet row each.
data_pelamar = []

# Sorted so the rows come out in a stable order between runs; os.listdir()
# returns directory entries in arbitrary order.
pdf_files = sorted(
    f for f in os.listdir(FOLDER_PATH)
    if f.lower().endswith(".pdf")
)

# One compiled pattern per skill, matched against the lower-cased CV text.
# The skill name is escaped because several names contain regex
# metacharacters: unescaped, "C++" is an invalid pattern before Python 3.11
# (which made every CV fail) and a possessive quantifier afterwards, and the
# "." in "Node.js" matches any character. Lookarounds replace "\b" because
# "\b" cannot match after a trailing "+".
skill_patterns = {
    s: re.compile(rf"(?<!\w){re.escape(s.lower())}(?!\w)")
    for s in SKILL_DB
}

email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,10}")
# Indonesian numbers starting with 62 or 0, optionally prefixed by "+" or
# "(". The lookbehind stops a match from starting in the middle of another
# number (e.g. "2019 - 2020" used to yield the phone "019 - 2020").
phone_pattern = re.compile(r"(?<!\d)[\+\(]?(?:62|0)[0-9\-\(\)\s]{8,20}")

for filename in pdf_files:
    path = os.path.join(FOLDER_PATH, filename)

    try:
        with pdfplumber.open(path) as pdf:
            name = extract_name_from_header(pdf.pages[0])

            # Pages without a text layer yield None/"" and are dropped.
            full_text = "\n".join(
                filter(None, (p.extract_text() for p in pdf.pages))
            )

            # No text layer at all (e.g. a scanned CV): fall back to OCR on
            # the rendered page images.
            if not full_text.strip():
                images = convert_from_path(path)
                full_text = "\n".join(pytesseract.image_to_string(img) for img in images)

            email = email_pattern.search(full_text)
            phone = phone_pattern.search(full_text)

            edu = extract_education(full_text)

            text_lower = full_text.lower()
            skills = sorted(
                s for s, pattern in skill_patterns.items()
                if pattern.search(text_lower)
            )

            data_pelamar.append({
                "Nama": name,
                "Email": email.group(0) if email else "-",
                "HP": phone.group(0).strip() if phone else "-",
                "Kampus": edu["Kampus"],
                "IPK": edu["IPK"],
                "Skills": ", ".join(skills) if skills else "-",
                "Nama File": filename
            })

    except Exception as exc:
        # One unreadable CV must not stop the batch, but it must not vanish
        # silently either: report which file was skipped and why.
        logging.warning("Skipping %s: %s: %s", filename, type(exc).__name__, exc)

In [30]:
# Write the summary workbook only when at least one CV was processed.
if data_pelamar:
    # Fixed column order for the spreadsheet.
    column_order = ["Nama", "Email", "HP", "Kampus", "IPK", "Skills", "Nama File"]
    df = pd.DataFrame(data_pelamar)
    df = df[column_order]

    # The workbook is stored next to the CVs, without the index column.
    output_path = os.path.join(FOLDER_PATH, OUTPUT_FILE)
    df.to_excel(output_path, index=False)