# Imports

In [3]:
import numpy
import pandas as pd
import re
import os
import requests
import scipdf

# Initializing directories for Data Storage

In [7]:
# -p keeps this cell re-runnable: plain mkdir errors once the folder exists.
!mkdir -p pdfs

In [7]:
# Folder that receives the extracted .txt files (written by process_pdf_data).
!mkdir -p ./data/bevani_data/pdf_text

# Download PDFs

In [21]:
def download_pdf(url: str, title: str, storage_path="./pdfs", timeout=60):
    """Download the PDF at ``url`` and save it as ``<storage_path>/<title>.pdf``.

    Args:
        url: Address of the PDF to fetch.
        title: Used as the file name; path separators in it are replaced
            with ``_`` so the title cannot be read as a sub-directory.
        storage_path: Directory to write into; created if it is missing.
        timeout: Seconds to wait on the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems and timeouts.
    """
    safe_title = str(title).replace("/", "_").replace(os.sep, "_")
    os.makedirs(storage_path, exist_ok=True)
    file_path = os.path.join(storage_path, f"{safe_title}.pdf")

    # The whole body is read through `.content`, so no streaming is requested.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open(file_path, 'wb') as fd:
        fd.write(response.content)

In [2]:
# Load the task CSV and keep only the two columns the download step needs.
csv_path = "./ner_task_data.csv"
df = pd.read_csv(csv_path)[["title", "pdf_url"]]
df.head()

Unnamed: 0,title,pdf_url
0,A Survey on Model Compression for Natural Lang...,http://arxiv.org/pdf/2202.07105v1
1,Noisy Text Data: Achilles' Heel of popular tra...,http://arxiv.org/pdf/2110.03353v1
2,Improving the robustness and accuracy of biome...,http://arxiv.org/pdf/2111.08529v1
3,Automated essay scoring using efficient transf...,http://arxiv.org/pdf/2102.13136v1
4,Annotating the Tweebank Corpus on Named Entity...,http://arxiv.org/pdf/2201.07281v2


In [22]:
# Fetch every paper listed in the CSV. A failed download is recorded and the
# loop moves on, so one bad URL does not abort the rest of the batch.
failed_downloads = []
for title, pdf_url in zip(df["title"], df["pdf_url"]):
    try:
        download_pdf(url=pdf_url, title=title)
    except (requests.RequestException, OSError) as err:
        failed_downloads.append((title, pdf_url, repr(err)))

# Displayed as the cell output; an empty list means everything was saved.
failed_downloads

# Domain Data Preparation - Parsing PDFs

In [19]:
def remove_url_regex():
    """Return the regex pattern (as a string) matching an http/https URL up to the next whitespace."""
    return r"https?:\/\/[^\s]+"

def remove_reference_num_regex():
    """Return the regex pattern (as a string) matching a bracketed number such as ``[12]``."""
    return r"\[\d+\]"

def _clean_pdf_text(text):
    """Strip URLs, line breaks and bracketed citation numbers from one text fragment."""
    if not text:
        # A missing abstract/section becomes an empty fragment instead of
        # breaking the final join.
        return ""
    text = re.sub(remove_url_regex(), "", text)
    text = text.replace("\n", " ").replace("\r", "")
    return re.sub(remove_reference_num_regex(), "", text)


def parse_and_clean_pdf(file):
    """Parse a PDF with scipdf and return its abstract and section text as one string.

    The abstract comes first, followed by each section's text in the order
    scipdf returns them, joined by single spaces. Every fragment, including
    the abstract, has URLs, newlines/carriage returns and ``[n]`` citation
    markers removed.

    Args:
        file: Path of the PDF, passed straight to ``scipdf.parse_pdf_to_dict``.

    Returns:
        The cleaned text as a single ``str``.
    """
    file_content = scipdf.parse_pdf_to_dict(file)
    fragments = [file_content["abstract"]]
    fragments.extend(section["text"] for section in file_content["sections"])
    return " ".join(_clean_pdf_text(fragment) for fragment in fragments)

In [20]:
def process_pdf_data(directory):
    """Parse every PDF directly inside ``directory`` and save its cleaned text.

    For each ``<name>.pdf`` the string returned by ``parse_and_clean_pdf`` is
    written to ``<directory>/pdf_text/<name>.txt``. Files are handled in
    sorted file-name order; sub-directories are not searched.

    Args:
        directory: Folder that holds the PDF files.
    """
    pdf_text_path = os.path.join(directory, "pdf_text")
    # Create the output folder here so the function does not depend on a
    # separately run `mkdir` cell.
    os.makedirs(pdf_text_path, exist_ok=True)

    pdf_names = sorted(
        name for name in os.listdir(directory) if name.lower().endswith(".pdf")
    )
    for pdf_name in pdf_names:
        content = parse_and_clean_pdf(os.path.join(directory, pdf_name))
        stem, _ = os.path.splitext(pdf_name)
        txt_file_path = os.path.join(pdf_text_path, f"{stem}.txt")
        # Explicit UTF-8 so non-ASCII characters in the extracted text do not
        # depend on the platform's default encoding.
        with open(txt_file_path, 'w', encoding="utf-8") as fd:
            fd.write(content)
        
# Convert every PDF under ./data/bevani_data into a cleaned .txt file.
# NOTE(review): download_pdf saves to ./pdfs by default — confirm the PDFs
# are placed in this folder before running the cell.
process_pdf_data(directory="./data/bevani_data")

