In [None]:
import json
import os

import pandas as pd
import pdfplumber
from tqdm import tqdm

In [None]:
# Selection lists to process, one record per published PDF.
#   id        -> stored as `list_id` on every candidate extracted from the file
#   name      -> human-readable label of the list
#   file_path -> location of the PDF opened by get_candidates_df
lists = [
    {"id": 0, "name": "ENEM 2017, 2018 e 2019", "file_path": "../resources/enem.pdf"},
    {"id": 1, "name": "Vestibular UDESC 2019/2 ou 2020/1", "file_path": "../resources/vestibular.pdf"},
    {"id": 2, "name": "Média final geral do Ensino Médio", "file_path": "../resources/ensino_medio.pdf"},
]

In [None]:
def get_candidates_df(list):
    """Extract the candidate table of one selection-list PDF into a DataFrame.

    Parameters
    ----------
    list : dict
        Entry describing the selection list. Must provide ``"file_path"``
        (path of the PDF to read) and ``"id"`` (copied into the ``list_id``
        column of the result). The parameter name shadows the built-in
        ``list`` inside this function; it is kept so existing callers are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        One row per extracted table row (the first extracted row is dropped
        as the header) with the columns ``id``, ``name``, ``birth_date``,
        ``grade``, ``situation``, ``course``, ``affirmative_action_policy``
        and ``list_id``. Line breaks are removed from every text cell and
        ``grade`` is numeric, with missing grades replaced by 0.

    Raises
    ------
    ValueError
        If a non-empty grade cannot be parsed as a number.
    """
    headers = [
        "id",
        "name",
        "birth_date",
        "grade",
        "situation",
        "course",
        "affirmative_action_policy",
    ]

    rows = []
    # The context manager closes the PDF even if extraction fails part-way.
    with pdfplumber.open(list["file_path"]) as pdf:
        for page in tqdm(pdf.pages, desc='tables extraction'):
            # extract_table() returns None when a page has no table.
            rows.extend(page.extract_table() or [])

    # NOTE(review): only the very first extracted row is discarded as the
    # header. If the PDF repeats the header on every page those rows would
    # remain in the data - confirm against the source files.
    df = pd.DataFrame(rows[1:], columns=headers)

    # Cells that wrap inside the PDF come back with embedded line breaks.
    for column in headers:
        df[column] = df[column].str.replace("\n", "", regex=False)

    # Grades use a decimal comma; normalise it before the numeric conversion.
    df["grade"] = pd.to_numeric(df["grade"].str.replace(",", ".", regex=False))
    df["grade"] = df["grade"].fillna(0)

    df["list_id"] = list["id"]

    return df

In [None]:
# Accumulates one record (dict) per candidate across every selection list.
candidates = []

print("Starting to extract data...")

# The loop variable is deliberately not called `list`: at notebook scope that
# name would rebind the built-in `list` for every cell executed afterwards.
for selection_list in tqdm(lists, desc='lists'):
    list_candidates = get_candidates_df(selection_list).to_dict(orient="records")
    candidates.extend(list_candidates)

In [None]:
# Give every distinct course name a numeric id.
# dict.fromkeys de-duplicates while keeping first-seen order, so the ids are
# the same on every run. Iterating a set of strings is not order-stable across
# kernel sessions because string hashing is randomized per process.
course_names = dict.fromkeys(candidate["course"] for candidate in candidates)
courses = [{"id": course_id, "name": name} for course_id, name in enumerate(course_names)]


In [None]:
# Replace each candidate's course name with the id assigned to it in `courses`.
# A single name -> id lookup table avoids rescanning `courses` per candidate.
# NOTE: `candidates` is rewritten in place, so this cell must run exactly once
# after the courses cell; on a second run the values are already ids and the
# lookup raises KeyError.
course_id_by_name = {course["name"]: course["id"] for course in courses}

for candidate in candidates:
    candidate["course"] = course_id_by_name[candidate["course"]]


In [None]:
CANDIDATES_OUTPUT = "../resources/output/candidates_output.json"
COURSES_OUTPUT = "../resources/output/courses_output.json"

# Create the output folder up front so the run does not fail at the very last
# step when the directory is missing (e.g. on a fresh checkout).
for output_path in (CANDIDATES_OUTPUT, COURSES_OUTPUT):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# ensure_ascii=False keeps accented characters readable in the UTF-8 files.
with open(CANDIDATES_OUTPUT, "w", encoding="utf-8") as f:
    json.dump(candidates, f, ensure_ascii=False, indent=4)

with open(COURSES_OUTPUT, "w", encoding="utf-8") as f:
    json.dump(courses, f, ensure_ascii=False, indent=4)