# Turn the pdf into a tsv file

Install the dependencies

In [None]:
pip install tabula-py 

Use tabula to extract the PDF tables to TSV, a much simpler file format

In [2]:
import tabula

# Keyword options forwarded to tabula for the PDF -> TSV conversion.
extraction_options = {
    "output_format": "tsv",
    "pages": "all",
    # Region to extract, ordered [top, left, bottom, right]. With
    # relative_area=True tabula reads these as percentages of the page size
    # (see the tabula-py documentation).
    "area": [5, 0, 95, 100],
    "relative_area": True,
    "stream": True,
    "guess": False,
}

tabula.convert_into(
    "data/publication-2022.pdf",  # PDF to read
    "data/publication.tsv",  # TSV to create
    **extraction_options,
)

Manually fix up the generated file, since it is not yet perfect (or even usable)

In [3]:
import re

# Read the raw tabula output. The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding.
# NOTE(review): assumes tabula wrote the file as UTF-8 -- confirm.
with open("data/publication.tsv", "r", encoding="utf-8") as file_reader:
    file_contents = file_reader.read()

# Fix the whitespace. The substitutions are applied one after another, in
# the order listed.
for pattern, replacement in [
    (r'""', ''),         # remove every pair of adjacent double-quote characters
    (r'[ \t]+', r'\t'),  # collapse each run of spaces/tabs into a single tab
    (r'\t+\n', r'\n'),   # drop tabs at the end of a line
    (r'\n\t+', '\n'),    # drop tabs at the start of a line
]:
    file_contents = re.sub(pattern, replacement, file_contents)

# Write the cleaned text back over the same file.
with open("data/publication.tsv", "w", encoding="utf-8") as file_writer:
    file_writer.write(file_contents)

In [9]:
def read_competition_scores(file_name: str, encoding: str = "utf-8"):
    """Parse a tab-separated score listing into ``(score, row_text)`` pairs.

    Each line is stripped and split on tabs, then handled as one of:

    * **Score row** - every column is numeric once ``.`` characters are
      removed. The score is the last column if it contains a ``.``,
      otherwise the second-to-last column; the second column is taken as the
      competition id.
    * **Heading row** - some column is non-numeric but the first column is
      numeric. Its columns are appended (space-joined) to the heading text
      being accumulated.
    * Anything else is ignored.

    When the first score row after a heading is reached, the accumulated
    heading is normalised once: everything up to and including the first
    ``" - "`` is dropped, the last run of five or more digits is taken as the
    id, and the heading becomes ``"<id>\\t<text before id>\\t<text after id>"``.

    Args:
        file_name: Path of the TSV file to read.
        encoding: Text encoding used to open the file. Passed explicitly so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list of ``(competition_score, text)`` tuples in file order, where
        ``competition_score`` is a float and ``text`` is
        ``"<competition_id>\\t<normalised heading>"``.

    Raises:
        IndexError: If a heading contains no run of five or more digits, or
            a score row has fewer than two columns.
    """
    data = []

    with open(file_name, "r", encoding=encoding) as file_reader:
        current_school = ''
        # True while the heading text is finalised; the next heading row
        # then starts a fresh accumulation instead of extending the old one.
        reset_school = True

        for raw_line in file_reader:
            line = raw_line.strip().split("\t")

            is_score_row = all(col.replace(".", "").isnumeric() for col in line)

            if not is_score_row:
                # Only rows that start with a number contribute to the heading.
                if line[0].isnumeric():
                    if reset_school:
                        current_school = ""
                        reset_school = False

                    current_school += " " + " ".join(line)
                continue

            if not reset_school:
                # First score row after a heading: normalise the heading once.
                # Strip the leading "<prefix> - " part.
                current_school = re.sub(r'^.*?\s-\s(.*)', r'\1', current_school)
                # The last run of 5+ digits is used as the id and replaced by
                # a tab, splitting the heading into two fields.
                school_id = re.findall(r'\d{5,}', current_school)[-1]
                current_school = current_school.replace(f' {school_id} ', '\t')
                current_school = f'{school_id}\t{current_school}'

                reset_school = True

            competition_score = float(line[-1] if '.' in line[-1] else line[-2])
            competition_id = line[1]

            data.append((competition_score, f'{competition_id}\t{current_school}'))

    return data

Now, we use the `read_competition_scores()` function to read the competition scores.

In [10]:
# Parse the cleaned-up TSV into (score, row text) pairs.
data = read_competition_scores("data/publication.tsv")

# Order a copy of the pairs from the highest score to the lowest.
sorted_data = list(data)
sorted_data.sort(key=lambda pair: pair[0], reverse=True)

Generate a string and write it to a file

In [13]:
# Header row; the column titles are: place, competition score, exam number,
# faculty number, university, faculty.
output_string = "ადგილი\tსაკონკურსო ქულა\tსაგამოცდო ნომერი\tფაკულტეტის ნომერი\tუნივერსიტეტი\tფაკულტეტი\n"

# One row per entry, ranked from 1. The rank is right-aligned to a width of 5.
# Joining once avoids repeated string concatenation in a loop.
output_string += "".join(
    f"{rank:5d}\t{entry[0]}\t{entry[1]}\n"
    for rank, entry in enumerate(sorted_data, start=1)
)

# The text is non-ASCII, so the encoding is fixed to UTF-8 instead of the
# platform's locale default (which cannot always encode these characters).
with open("data/sorted-scores.tsv", "w", encoding="utf-8") as file_writer:
    file_writer.write(output_string)