## Imports

Import the packages used to download, parse, and preprocess the research-paper PDFs.

In [1]:
import pandas as pd
import re
import os
import requests
import scipdf

# Download pdfs

In [2]:
def download_pdf(url: str, title: str, storage_path=os.path.join(os.getcwd(), "data", "pdfs"), *, timeout: float = 30.0):
    """
    This function is used to download a research paper's pdf and store it in the
    storage path passed in as an argument

    The response is streamed to disk in chunks instead of being loaded fully in
    memory, and the storage directory is created if it does not exist yet.

    Args:
        url (str): Research paper PDF url

        title (str): This is the file name which will be used while storing.
        Path separators inside the title are replaced by "_" so that the title
        always maps to a single file inside storage_path.

        storage_path (str, optional): The path where the pdf file will be stored.
        Defaults to os.path.join(os.getcwd(), "data", "pdfs").

        timeout (float, optional): Seconds to wait for the server before giving up.
        Defaults to 30.0.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status code
        requests.RequestException: on connection problems or timeouts
    """
    # a "/" in a paper title would otherwise be interpreted as a sub-directory
    safe_title = str(title).replace("/", "_").replace(os.sep, "_")
    os.makedirs(storage_path, exist_ok=True)
    file_path = os.path.join(storage_path, f"{safe_title}.pdf")

    with requests.get(url, stream=True, timeout=timeout) as response:
        # fail before creating the file so an error page is never saved as a pdf
        response.raise_for_status()
        with open(file_path, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=8192):
                fd.write(chunk)

In [3]:
# load the link table that was generated by pdf_data_collect.py
csv_path = "./ner_task_pdf_links.csv"
# only the paper title and the link to its PDF are needed from here on
df = pd.read_csv(csv_path)[["title", "pdf_url"]]
df.head()

Unnamed: 0,title,pdf_url
0,A Survey on Model Compression for Natural Lang...,http://arxiv.org/pdf/2202.07105v1
1,Noisy Text Data: Achilles' Heel of popular tra...,http://arxiv.org/pdf/2110.03353v1
2,Improving the robustness and accuracy of biome...,http://arxiv.org/pdf/2111.08529v1
3,Automated essay scoring using efficient transf...,http://arxiv.org/pdf/2102.13136v1
4,Annotating the Tweebank Corpus on Named Entity...,http://arxiv.org/pdf/2201.07281v2


In [4]:
# walk through the title / link pairs and download every research paper PDF
for paper_title, paper_url in zip(df["title"], df["pdf_url"]):
    download_pdf(url=paper_url, title=paper_title)

# Domain Data Preparation - Parsing PDFs

In [5]:
def remove_url_regex():
    """
    Provide the pattern used to strip web links from text

    Returns:
        str: regex matching "http://" or "https://" followed by a run of
        non-whitespace characters
    """
    return r"https?:\/\/[^\s]+"

def remove_reference_num_regex():
    """
    Regex to remove numeric paper references in a research paper

    The pattern needs a digit right after the opening bracket (optional
    whitespace allowed) and then accepts digits, commas, whitespace and range
    dashes, so "[1]", "[1,2]", "[1, 2]" and "[1-3]" are all matched while
    non-numeric brackets such as "[i]" or "[abc]" are left untouched.

    Returns:
        str: regex which can be used to remove references eg. [1,2], [1, 2],
        [1-3] or [1]
    """
    # \u2013 is the en dash, which is often used for citation ranges
    regex = r"\[\s*\d[\d,\s\-\u2013]*\]"
    return regex

def parse_and_clean_pdf(file):
    """
    Accepts research paper pdf file path and performs parsing and cleaning
    In order for this function to work, the prerequisite is to make sure the
    grobid server is running (bash serve_grobid.sh)

    Once the server is running the function calls the scipdf parser to extract
    string content from the abstract and from each section in the research
    paper. Every piece of text undergoes the same cleaning steps - removing
    urls, flattening line breaks, removing reference nums and stripping
    unwanted white spaces

    Args:
        file (str): research pdf file path

    Returns:
        str: parsed and cleaned pdf string content, abstract first, followed by
        one line per section
    """
    # fetch both patterns once instead of once per section
    url_regex = remove_url_regex()
    reference_regex = remove_reference_num_regex()

    def clean(text):
        """Apply the url / line break / reference cleaning steps to one text"""
        text = re.sub(url_regex, "", text)
        text = text.replace("\n", " ").replace("\r", "")
        text = re.sub(reference_regex, "", text)
        return text.strip()

    file_content = scipdf.parse_pdf_to_dict(file)
    # the abstract gets the same cleaning as the sections; "or" guards against
    # a None abstract, which would otherwise break the join below
    res = [clean(file_content["abstract"] or "")]
    res.extend(clean(section["text"]) for section in file_content["sections"])
    return "\n".join(res)

In [7]:
def process_pdf_data(directory):
    """
    This function accepts a valid directory path which contains all the pdf files
    It then iterates over the research pdf files in the directory (in sorted
    order) and calls the "parse_and_clean_pdf" function to extract paper contents
    as a single text file where paragraphs are separated by a new line

    The text files are written to the "pdf_text" sub-directory of the given
    directory, which is created if it does not exist yet.

    Args:
        directory (str): research pdfs directory path
    """
    pdf_text_path = os.path.join(directory, "pdf_text")
    # without this the first write fails when the output directory is missing
    os.makedirs(pdf_text_path, exist_ok=True)

    def write_content(title, content):
        """
        Store a research paper's content after parsing in the path specified by
        "pdf_text_path"

        Args:
            title (str): research paper name
            content (str): research paper content
        """
        txt_file_path = os.path.join(pdf_text_path, f"{title}.txt")
        # explicit utf-8 so non-ASCII characters in papers do not depend on the
        # platform default encoding
        with open(txt_file_path, 'w', encoding="utf-8") as fd:
            fd.write(content)

    files = sorted(file for file in os.listdir(directory) if file.endswith(".pdf"))
    for file in files:
        content = parse_and_clean_pdf(os.path.join(directory, file))
        filename, _ = os.path.splitext(os.path.basename(file))
        write_content(filename, content)

# run the parsing step on the folder that holds all downloaded pdfs
pdf_directory = os.path.join(os.getcwd(), "data", "pdfs")
process_pdf_data(pdf_directory)