In [1]:
from pypdf import PdfReader
from acl_anthology import Anthology
from io import BytesIO
from tqdm import tqdm
import requests
import re
import os
import sys
import json
import warnings
warnings.filterwarnings('ignore')

from utils import *

In [2]:
# Notebook-wide handle on the ACL Anthology metadata; the functions below call
# `anthology.get(<paper id>)` on it to test whether a paper id exists.
# NOTE(review): `from_repo()` presumably fetches/uses a copy of the official
# Anthology data repository -- confirm network and disk requirements.
anthology = Anthology.from_repo()

In [3]:
# These venues split their main-conference papers into two separate collections,
# "<venue>-long" and "<venue>-short"; both have to be gathered and merged.
divided_venues = ['2021.acl', '2022.acl', '2023.acl']

In [4]:
def get_max_paper_id(venue_index: str, Anthology=None):
    """Return the highest paper number that exists under ``venue_index``.

    Paper ids are probed as ``f'{venue_index}.{i}'`` for ``i = 1, 2, ...``.
    The scan stops at the first ``i`` for which both ``i`` and ``i + 1`` are
    missing, so a single missing number in the middle of a collection does not
    end the scan early.

    Args:
        venue_index: Collection prefix, e.g. ``'2020.acl-main'``.
        Anthology: Object exposing ``get(full_id)``, expected to return ``None``
            (or raise) for an unknown id. When omitted, the notebook-level
            ``anthology`` is looked up at call time. The parameter name is kept
            for backward compatibility.

    Returns:
        The largest paper number found (``0`` if the collection is empty), or
        ``None`` if no end of the collection is detected within ids 1..9999.
    """
    # Resolve the default lazily so that re-creating `anthology` in the notebook
    # is picked up, and so that an explicitly passed object is actually used.
    source = anthology if Anthology is None else Anthology

    # ACL 2023 has a PC's report appended as the last long paper (doesn't follow the formatting)
    # "Program Chairs' Report on Peer Review at ACL 2023", https://aclanthology.org/2023.acl-long.911/
    trailing_non_papers = 1 if venue_index == '2023.acl-long' else 0

    for i in range(1, 10000):
        try:
            current = source.get(f'{venue_index}.{i}')
            following = source.get(f'{venue_index}.{i + 1}')
        except Exception:
            # A failing lookup ends the scan exactly like two missing papers.
            current = following = None
        if current is None and following is None:
            # Clamp so an empty collection never yields a negative count.
            return max(0, i - 1 - trailing_non_papers)
    return None

In [5]:
# Top-level section headings such as "3 Method" or "References". Subsections
# (e.g., 3.1 XXX or A.1 XXX) are deliberately not matched.
_SECTION_HEADING_RE = re.compile(r'((\n|[a-z)\]"]\.|^)([1-9] [A-Z]|References).*[\n$])')
# The formatted proceedings footer that appears on the first page of a paper.
_FIRST_PAGE_FOOTER_RE = re.compile(r'Proceedings of the .* (Conference|Meeting) .*©[0-9]* Association for Computational Linguistics')
# A footer match that ends at or beyond this offset is treated as suspicious and left in place.
_FOOTER_MAX_END_POS = 5000


def _read_pages(reader, paper_tag):
    """Collect per-page text and candidate top-level headings from an opened PDF.

    Returns ``(page_texts, section_names, found_references)`` where
    ``section_names`` always starts with the ``'__START__'`` sentinel.
    """
    page_labels = reader.page_labels
    page_texts, section_names, found_references = [], ['__START__'], False

    for page_idx in range(len(page_labels)):
        page = reader.pages[page_idx]
        label = page_labels[page_idx]
        try:
            text = page.extract_text()
        except Exception:
            print(f"Warning: Failed to extract text from {paper_tag} Page {page_idx+1}. This page is skipped.")
            continue
        # Remove the page number. str is immutable, so the result must be re-bound.
        text = text.removesuffix(label)
        page_texts.append(text)

        # A match looks like ('\n7 Conclusion\n', '\n', '7 C') or ('\nReferences\n', '\n', 'References')
        for name, header, key in _SECTION_HEADING_RE.findall(text):
            if 'et al.,' in name:
                continue  # a citation line, not a heading
            if key == 'References':
                found_references = True
            section_names.append(name.removeprefix(header).strip())

    return page_texts, section_names, found_references


def _strip_first_page_footer(output, paper_tag, enable_notes):
    """Cut the proceedings footer out of ``output`` when it is found near the start."""
    footer = _FIRST_PAGE_FOOTER_RE.search(output)
    if footer is None:
        if enable_notes:
            print(f"Note: Didn't find footer in {paper_tag}. (Usually there is no need to worry since it is often because the first page indeed doesn't contain the footer.) The file will still be processed and included. Please check manually.")
        return output
    if footer.end() < _FOOTER_MAX_END_POS:
        return output[:footer.start()] + output[footer.end():]
    print(f'Warning: File footer matching might be problematic in {paper_tag}. The file will still be processed and included. Please check manually.')
    return output


def _split_by_sections(output, section_names, paper_tag, enable_notes):
    """Cut ``output`` at each heading in ``section_names``.

    Returns ``(pairs, found_references)`` where ``pairs`` is a list of
    ``(section_name, section_text)`` tuples in document order.
    """
    pairs = []
    current_name = section_names[0]
    start_pos = 0
    located_references = False

    for heading in section_names[1:]:
        end_pos = output.find(heading, start_pos)
        if end_pos < 0:
            # The heading was seen in the raw page text but cannot be located in
            # the normalised text (e.g. it ended with a hyphen that was joined
            # away). Skip it instead of slicing with -1, which would corrupt
            # this and every following section.
            continue
        pairs.append((current_name, output[start_pos:end_pos].strip()))
        current_name = heading
        start_pos = end_pos + len(heading)
        # Headings that start with "References" are exactly the ones matched through
        # the "References" alternative; all other headings start with a digit.
        if heading.startswith('References'):
            located_references = True

    last_text_chunk = output[start_pos:]
    if located_references:
        pairs.append((current_name, last_text_chunk.strip()))
        return pairs, True

    # Attempt to search "References " in the last chunk as the approximate start of the reference
    approx_ref_pos = last_text_chunk.find('References ')
    if approx_ref_pos < 0:
        approx_ref_pos = last_text_chunk.find('Acknowledgements ')
    if approx_ref_pos < 0:
        pairs.append((current_name, last_text_chunk.strip()))
        return pairs, False

    if enable_notes:
        print(f'Note: The Reference section of Paper {paper_tag} is approximated by searching in the last chunk. It might be different from the exact position.')
    pairs.append((current_name, last_text_chunk[:approx_ref_pos].strip()))
    pairs.append(("References", last_text_chunk[approx_ref_pos:].strip()))
    return pairs, True


def _store_or_return(out_col, cache_path, direct_write):
    """Write ``out_col`` to ``cache_path`` (returning None) or hand it back to the caller."""
    if not direct_write:
        return out_col
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, 'w', encoding='utf-8') as f2:
        json.dump(out_col, f2, indent=2)
    return None


def extract_full_text_and_save(venue_index: str, divided_venues=divided_venues, enable_notes=False, direct_write=True):
    """Download every paper PDF of a venue, extract its text and split it by section.

    Results are cached in ``data/{venue_index}_extracted_full.json``; when that
    file already exists nothing is downloaded.

    Args:
        venue_index: Collection prefix, e.g. ``'2020.acl-main'`` or ``'2021.acl'``.
        divided_venues: Venues whose papers live in ``-long`` and ``-short``
            sub-collections; for these, both sub-collections are processed and
            concatenated.
        enable_notes: Also print low-priority "Note:" diagnostics.
        direct_write: If True, write the result to the cache file and return
            None. If False, return the result without writing it.

    Returns:
        None when ``direct_write`` is True, otherwise a list of dicts with the
        keys ``'id'``, ``'text'`` and ``'by_chapter'``.
    """
    cache_path = f'data/{venue_index}_extracted_full.json'

    if os.path.exists(cache_path):
        print(f'{venue_index} has already been processed and stored.')
        if direct_write:
            return
        with open(cache_path, 'r', encoding='utf-8') as f3:
            return json.load(f3)

    if venue_index in divided_venues:
        out_col = []
        for suffix in ('-long', '-short'):
            # Forward the caller's settings so notes are not silently dropped.
            out_col = out_col + extract_full_text_and_save(
                venue_index + suffix,
                divided_venues=divided_venues,
                enable_notes=enable_notes,
                direct_write=False,
            )
        return _store_or_return(out_col, cache_path, direct_write)

    print(f'Extracting text for {venue_index} from scratch.')
    max_paper_id = get_max_paper_id(venue_index)
    print(f'max paper id: {max_paper_id}')

    out_col = []
    for j in tqdm(range(1, max_paper_id + 1)):
        if venue_index == '2020.emnlp-main' and j == 401:
            continue  # The page is having a 404 error. this paper might have been retracted?

        paper_tag = f'{venue_index}.{j}'
        url = f'https://aclanthology.org/{paper_tag}.pdf'
        try:
            reader = PdfReader(BytesIO(requests.get(url).content))
        except Exception:
            # `Exception` rather than a bare except so Ctrl-C can still stop the scrape.
            print(f'Warning: Unable to fetch the PDF file of Paper {paper_tag}. The URL is not working and please validate. This paper will be skipped.')
            continue

        text_col, section_names_col, found_ref_flag = _read_pages(reader, paper_tag)

        # divide the full text into each line
        lines = '\n'.join(text_col).strip().split('\n')
        if len(lines) <= 1:
            print(f'Warning: empty output in Paper {paper_tag}; this paper will be skipped.')
            if not found_ref_flag:
                print(f"Warning: didn't find References section in Paper {paper_tag}")
            continue

        # remove the line-end hyphens and replace \n with a space
        output = ''.join(line[:-1] if line.endswith('-') else line + ' ' for line in lines)
        output = _strip_first_page_footer(output, paper_tag, enable_notes)
        text_by_sections, found_ref_flag = _split_by_sections(output, section_names_col, paper_tag, enable_notes)

        if not found_ref_flag:
            print(f"Warning: didn't find References section in Paper {paper_tag}")
        # Only the '__START__' sentinel is present when no heading was recognised.
        if len(section_names_col) == 1:
            print(f'Warning: Failed to recognize sections in Paper {paper_tag}')

        out_col.append({'id': j, 'text': output, 'by_chapter': dict(text_by_sections)})

    return _store_or_return(out_col, cache_path, direct_write)

In [6]:
# Venues to process, in chronological order.
# The trailing comment on each entry is that venue's paper submission deadline.
venue_list = [
    '2020.acl-main',    # Dec 9, 2019
    '2020.emnlp-main',  # Jun 3, 2020
    '2021.naacl-main',  # Nov 23, 2020
    '2021.acl',         # Feb 2, 2021
    '2021.emnlp-main',  # May 17, 2021
    '2022.acl',         # Nov 15, 2021
    '2022.naacl-main',  # Jan 15, 2022
    '2022.emnlp-main',  # Jun 24, 2022
    '2023.acl',         # Dec 15, 2022
    '2023.emnlp-main',  # Jun 23, 2023
]

# For each venue: make sure the extracted-text cache exists, then refresh both
# kinds of matched strings stored alongside it.
for venue_index in venue_list:
    print(f'\n{venue_index}')
    extract_full_text_and_save(venue_index)
    print('Updating matched strings...')
    for match_option in ('model_names', 'LM'):
        update_matched_strings(f'data/{venue_index}_extracted_full.json', path='data/', option=match_option)


2020.acl-main
2020.acl-main has already been processed and stored.
Updating matched strings...

2020.emnlp-main
2020.emnlp-main has already been processed and stored.
Updating matched strings...

2021.naacl-main
2021.naacl-main has already been processed and stored.
Updating matched strings...

2021.acl
2021.acl has already been processed and stored.
Updating matched strings...

2021.emnlp-main
2021.emnlp-main has already been processed and stored.
Updating matched strings...

2022.acl
2022.acl has already been processed and stored.
Updating matched strings...

2022.naacl-main
2022.naacl-main has already been processed and stored.
Updating matched strings...

2022.emnlp-main
2022.emnlp-main has already been processed and stored.
Updating matched strings...

2023.acl
2023.acl has already been processed and stored.
Updating matched strings...

2023.emnlp-main
2023.emnlp-main has already been processed and stored.
Updating matched strings...
