In [45]:
import re
import requests
from io import BytesIO
import PyPDF2
import pdfplumber
import feedparser

# Direct arXiv PDF URLs of the research papers to scan.
# Most entries are plain "<id>.pdf" links; three of them (2206.08811v1,
# 2201.13250v1, 2110.06898v1) carry an explicit version suffix, which ends up
# in the arXiv ID that is later extracted from the URL.
links = [
    "https://arxiv.org/pdf/2208.05958.pdf", 
    "https://arxiv.org/pdf/2208.01863.pdf", 
    "https://arxiv.org/pdf/2206.07559.pdf", 
    "https://arxiv.org/pdf/2206.05302.pdf", 
    "https://arxiv.org/pdf/2206.11888.pdf", 
    "https://arxiv.org/pdf/2206.08811v1.pdf", 
    "https://arxiv.org/pdf/2205.11427.pdf", 
    "https://arxiv.org/pdf/2204.12985.pdf", 
    "https://arxiv.org/pdf/2204.09725.pdf", 
    "https://arxiv.org/pdf/2203.15546.pdf", 
    "https://arxiv.org/pdf/2203.11216.pdf", 
    "https://arxiv.org/pdf/2201.13250v1.pdf", 
    "https://arxiv.org/pdf/2201.05032.pdf", 
    "https://arxiv.org/pdf/2111.06741.pdf", 
    "https://arxiv.org/pdf/2111.10244.pdf", 
    "https://arxiv.org/pdf/2110.08163.pdf", 
    "https://arxiv.org/pdf/2109.11285.pdf", 
    "https://arxiv.org/pdf/2110.06898v1.pdf", 
    "https://arxiv.org/pdf/2110.04236.pdf", 
    "https://arxiv.org/pdf/2109.08401.pdf", 
    "https://arxiv.org/pdf/2109.04842.pdf", 
    "https://arxiv.org/pdf/2109.04840.pdf", 
    "https://arxiv.org/pdf/2109.03745.pdf", 
    "https://arxiv.org/pdf/2109.03687.pdf", 
    "https://arxiv.org/pdf/2107.04411.pdf", 
    "https://arxiv.org/pdf/2106.10055.pdf", 
    "https://arxiv.org/pdf/2106.07485.pdf", 
    "https://arxiv.org/pdf/2104.04352.pdf", 
    "https://arxiv.org/pdf/2103.15470.pdf", 
    "https://arxiv.org/pdf/2103.07960.pdf", 
    "https://arxiv.org/pdf/2103.06720.pdf", 
    "https://arxiv.org/pdf/2102.12846.pdf", 
    "https://arxiv.org/pdf/2102.10984.pdf", 
    "https://arxiv.org/pdf/2101.02240.pdf", 
    "https://arxiv.org/pdf/2012.03755.pdf", 
    "https://arxiv.org/pdf/2012.03756.pdf", 
    "https://arxiv.org/pdf/2011.01125.pdf", 
    "https://arxiv.org/pdf/2009.12361.pdf", 
    "https://arxiv.org/pdf/2009.06551.pdf", 
    "https://arxiv.org/pdf/2008.08694.pdf", 
    "https://arxiv.org/pdf/2007.15957.pdf", 
    "https://arxiv.org/pdf/2007.10515.pdf", 
    "https://arxiv.org/pdf/2005.04147.pdf", 
    "https://arxiv.org/pdf/2005.02975.pdf", 
    "https://arxiv.org/pdf/2001.00862.pdf", 
    "https://arxiv.org/pdf/1910.05168.pdf", 
    "https://arxiv.org/pdf/1910.04735.pdf"
]

# Organisation names to look for in the text extracted from each paper's
# first page. They are matched as case-insensitive substrings, so short
# entries such as "IBM" also match inside longer words.
keywords = ["IBM", "Quantinuum", "Cambridge Quantum"]

def download_pdf(link, timeout=30):
    """Download the PDF at *link* and return it as an in-memory file.

    Args:
        link: URL of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BytesIO`` holding the response body, positioned at offset 0.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(link, timeout=timeout)
    # Fail here instead of handing an HTML error page to the PDF parser.
    response.raise_for_status()
    return BytesIO(response.content)

def extract_arxiv_id_from_pdf_link(link):
    """Return the arXiv ID embedded in a PDF link.

    The ID is the final path segment of *link*, cut off at the first
    occurrence of ``.pdf``; a version suffix such as ``v1`` is kept.
    """
    file_name = link.rsplit('/', 1)[-1]
    identifier, _, _ = file_name.partition('.pdf')
    return identifier

def extract_title_from_arxiv_id(arxiv_id):
    """Look up a paper's title through the arXiv API.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"2208.05958"``.

    Returns:
        The title of the first feed entry on a single line with every run
        of whitespace collapsed to one space, or ``""`` when the feed
        contains no entries.
    """
    # Fetch the metadata from the arXiv API
    api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    feed = feedparser.parse(api_url)

    entries = feed.get('entries')
    if not entries:
        return ""

    # Titles can arrive wrapped over several lines. Dropping only the "\n"
    # left the continuation indent behind as a double space, so collapse
    # all whitespace runs instead.
    return ' '.join(entries[0]['title'].split())

def extract_affiliations(pdf_file):
    """Return the first-page text that follows the presumed author line.

    The author line is found heuristically: it is the first line holding
    either the standalone word "and" or a comma directly after a word
    character, followed by whitespace and another word character. All
    later lines of the first page are joined with single spaces.

    Args:
        pdf_file: Path or binary file-like object accepted by
            ``pdfplumber.open``.

    Returns:
        The joined text after the author line. If no line matches, the
        whole first-page text is returned (joined with spaces). Returns
        ``""`` when the PDF has no pages or the first page has no
        extractable text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        if not pdf.pages:
            return ""
        # extract_text() may yield None for a page without a text layer
        # (e.g. a scanned page); treat that the same as empty text.
        text = pdf.pages[0].extract_text() or ""

    # The text is a plain string, so the PDF can be closed before parsing.
    lines = text.split('\n')

    # NOTE(review): this heuristic also fires on a title line containing
    # "and"; in that case the author names end up in the returned text.
    author_pattern = re.compile(r'\b(?:and|,)\s+\b')
    author_line_index = next(
        (i for i, line in enumerate(lines) if author_pattern.search(line)),
        -1,  # no match: the slice below then starts at index 0
    )

    return ' '.join(lines[author_line_index + 1:])

def contains_keywords(affiliations, keywords):
    """Tell whether any keyword occurs in *affiliations*, ignoring case.

    Matching is plain substring containment; an empty keyword list
    yields ``False``.
    """
    return any(
        needle.lower() in affiliations.lower()
        for needle in keywords
    )

# Number of papers whose first-page text mentions at least one keyword.
matching_papers = 0

for link in links:
    pdf_file = download_pdf(link)
    arxiv_id = extract_arxiv_id_from_pdf_link(link)
    title = extract_title_from_arxiv_id(arxiv_id)
    affiliations = extract_affiliations(pdf_file)

    if contains_keywords(affiliations, keywords):
        print(f"Found {keywords} in the affiliations section: {link}\n\t -- {title}")
        matching_papers += 1
    else:
        print(f"No {keywords} found in the affiliations section: {link}\n\t -- {title}")
        # Dump the extracted text so a miss can be inspected by hand.
        print(affiliations)
        # No `break` here: stopping at the first miss would leave the
        # remaining links unchecked and the tally incomplete.

    # Running tally after each paper.
    print(matching_papers)

Found ['IBM', 'Quantinuum', 'Cambridge Quantum'] in the affiliations section: https://arxiv.org/pdf/2208.05958.pdf
	 -- Efficient recovery of variational quantum algorithms landscapes using  classical signal processing
1
Found ['IBM', 'Quantinuum', 'Cambridge Quantum'] in the affiliations section: https://arxiv.org/pdf/2208.01863.pdf
	 -- Implementing Fault-tolerant Entangling Gates on the Five-qubit Code and  the Color Code
2
Found ['IBM', 'Quantinuum', 'Cambridge Quantum'] in the affiliations section: https://arxiv.org/pdf/2206.07559.pdf
	 -- Bayesian Learning of Parameterised Quantum Circuits
3
Found ['IBM', 'Quantinuum', 'Cambridge Quantum'] in the affiliations section: https://arxiv.org/pdf/2206.05302.pdf
	 -- Predicting Gibbs-State Expectation Values with Pure Thermal Shadows
4
Found ['IBM', 'Quantinuum', 'Cambridge Quantum'] in the affiliations section: https://arxiv.org/pdf/2206.11888.pdf
	 -- Transport of multispecies ion crystals through a junction in an RF Paul  trap
5
Found