In [None]:
import re
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Root directory under which the downloaded PDF files are stored
path = 'D:/Studium/Masterarbeit/PythonProject/data/PDF_data/'

# Longest full path (including the ".pdf" extension) that is written to disk
MAX_PATH_LENGTH = 260

# Characters that are not allowed inside a Windows file or directory name
_RESERVED_CHARS = re.compile(r'[<>:"/\\|?*]')


def _safe_name(text):
    """Return `text` with every character removed that is invalid in a Windows path component."""
    return _RESERVED_CHARS.sub("", text)


def _fit_pdf_path(pdf_path, max_length=MAX_PATH_LENGTH):
    """Return `pdf_path` cut down to at most `max_length` characters, keeping the ".pdf" suffix.

    Paths that already fit are returned unchanged.
    """
    if len(pdf_path) <= max_length:
        return pdf_path
    return pdf_path[:max_length - len(".pdf")] + ".pdf"


# Read the list of links to the CLEF proceedings (one URL per line)
with open("D:/Studium/Masterarbeit/PythonProject/data/links.txt") as file:
    publisher_link_list_doi_new = [line.rstrip() for line in file]

# NOTE(review): only the links at positions 10-12 are processed here;
# drop the slice to download every book in the list.
for book_url in publisher_link_list_doi_new[10:13]:
    resp = requests.get(book_url)
    if not resp.ok:
        print(f"Skipping {book_url}: HTTP {resp.status_code}")
        continue

    # Parse the book page once and reuse the tree for every lookup
    page = BeautifulSoup(resp.text, features="html.parser")
    subtitle_tag = page.find("p", attrs={"data-test": "book-subtitle"})
    toc = page.find("section", attrs={"data-title": "Table of contents"})
    if subtitle_tag is None or toc is None:
        print(f"Skipping {book_url}: book subtitle or table of contents not found")
        continue

    # The book name is the part of the subtitle before the first comma
    book_name = _safe_name(subtitle_tag.text.split(",")[0])
    book_dir = path + "LNCS_Proceedings/" + book_name
    # makedirs also creates missing parents and tolerates an existing directory,
    # while real failures (permissions, invalid names) are no longer hidden
    os.makedirs(book_dir, exist_ok=True)

    # Chapter headings and the publication lists below them.
    # NOTE(review): assumes the n-th <ol> belongs to the n-th heading - confirm against the page layout.
    headings = toc.find_all("h3", attrs={"class": "c-book-part-heading--underline"})
    publication_lists = toc.find_all("ol", attrs={"class": "u-list-reset"})

    for heading, publication_list in zip(headings, publication_lists):
        chapter_name = _safe_name(heading.text)
        chapter_dir = book_dir + "/" + chapter_name
        os.makedirs(chapter_dir, exist_ok=True)

        # All links to individual publications within this chapter
        publication_links = publication_list.find_all(
            "a", attrs={"data-track-action": "ToC link to content page"}
        )

        for link in publication_links:
            href = link.get("href", "")
            # The DOI suffix is the path segment that follows "/chapter/10.1007/"
            doi_match = re.search(r'/chapter/10\.1007/(.+?)(?:/|$)', href)
            if doi_match is None:
                print(f"Skipping link without DOI suffix: {href!r}")
                continue
            doi_suffix = doi_match.group(1)

            # Drop literal italic markers first, then every character that is invalid in a file name
            title = _safe_name(link.text.replace("<i>", "").replace("</i>", ""))
            filename = _fit_pdf_path(chapter_dir + "/" + title + ".pdf")

            # Download first so that a failed request neither leaves an empty file
            # nor stores an error page under a ".pdf" name
            pdf_resp = requests.get(urljoin("https://link.springer.com/content/pdf/10.1007/", doi_suffix))
            if not pdf_resp.ok:
                print(f"Skipping {doi_suffix}: HTTP {pdf_resp.status_code}")
                continue
            with open(filename, 'wb') as f:
                f.write(pdf_resp.content)

In [90]:
# One record per chapter, with the fields in this order:
# year, book subtitle, book title, PDF file name, chapter title, section,
# subsection, DOI, citation count, authors & affiliations
documents_data = []

# Read the list of links to the CLEF proceedings (one URL per line)
with open("D:/Studium/Masterarbeit/PythonProject/data/links.txt") as file:
    publisher_link_list_doi_new = [line.rstrip() for line in file]

for book_url in publisher_link_list_doi_new:
    # Fetch the book page and parse it once; every lookup below reuses this tree
    book_resp = requests.get(book_url)
    book_page = BeautifulSoup(book_resp.text, features="html.parser")

    subtitle_tag = book_page.find("p", attrs={"data-test": "book-subtitle"})
    title_tag = book_page.find("h1", attrs={"data-test": "book-title"})
    if subtitle_tag is None or title_tag is None:
        print(f"Skipping {book_url}: book title or subtitle not found")
        continue

    subtitle = subtitle_tag.text
    book_title = title_tag.text.strip()
    # The book name is the part of the subtitle before the first comma
    book_name = subtitle.split(",")[0]

    # Multi-volume proceedings: append the trailing "Part <roman numeral>" to the name
    if re.search(r"Part [IVXLC]+$", subtitle.strip()):
        book_name += " (" + subtitle.split(",")[-1].strip() + ")"

    # Publication year.
    # NOTE(review): assumes the second identifier item holds the copyright notice - confirm.
    identifier_items = book_page.find_all("li", attrs={"class": "c-article-identifiers__item"})
    year_match = re.search(r"\d{4}", identifier_items[1].text) if len(identifier_items) > 1 else None
    year = year_match.group() if year_match else "Not found"

    # Every link on the book page that points to a chapter
    chapter_links = book_page.find_all("a", attrs={"href": re.compile(r"/chapter/10\.1007/")})

    for link in chapter_links:
        href = link.get("href")
        # The PDF file name is the DOI suffix of the chapter link plus ".pdf"
        pdf_name = re.sub(r"/chapter/10\.1007/", "", href) + ".pdf"

        # Closest preceding part (h3) and sub-part (h4) headings.
        # NOTE(review): the h4 lookup is not limited to the current part, so a chapter in a part
        # without sub-parts may pick up the last sub-part of an earlier part - verify.
        part_heading = link.find_previous("h3", attrs={"data-title": "part-title"})
        subpart_heading = link.find_previous("h4", attrs={"data-title": "subpart-title"})
        section = part_heading.text if part_heading is not None else "Not found"
        subsection = subpart_heading.text if subpart_heading is not None else "Uncategorized"

        # Fetch the chapter page for DOI, citation count and authors.
        # urljoin also copes with links that are already absolute.
        chapter_resp = requests.get(urljoin("https://link.springer.com", href))
        chapter_page = BeautifulSoup(chapter_resp.text, features="html.parser")

        bib_values = chapter_page.find_all("span", attrs={"class": "c-bibliographic-information__value"})
        doi = bib_values[0].text if bib_values else "Not found"

        # NOTE(review): assumes the second metrics item is the citation counter - confirm.
        metric_items = chapter_page.find_all("li", attrs={"class": "app-article-metrics-bar__item"})
        if len(metric_items) > 1:
            citation_count = re.sub("Citations", "", metric_items[1].text).strip()
        else:
            citation_count = "Not found"

        # One entry per <li> of the author/affiliation list: the first <p> is kept verbatim,
        # every further <p> is split on " & " and ", ".
        # NOTE(review): which <p> is the name and which the affiliation depends on the page markup.
        authors_affiliations_list = []
        affiliation_list = chapter_page.find("ol", attrs={"class": "c-article-author-affiliation__list"})
        author_items = affiliation_list.find_all("li") if affiliation_list is not None else []
        for li in author_items:
            author_info = []
            for position, p in enumerate(li.find_all("p")):
                if position == 0:
                    author_info.append(p.text)
                else:
                    author_info.append(re.split(r'\s&\s|,\s', p.text))
            authors_affiliations_list.append(author_info)

        # The author list is the last field of the record, not a record of its own
        documents_data.append([
            year,
            book_name,
            book_title,
            pdf_name,
            link.text,
            section,
            subsection,
            doi,
            citation_count,
            authors_affiliations_list,
        ])

https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1 else, except
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-031-04431-1
https://doi.org/10.1007/978-3-642-23008-0
https://doi.org/10.1007/978-3-642-23008-0 else, except
https://doi.org/10.1007/978-3-642-23008-0
https://doi.org/10.1007/978-3-642-23008-0 else, except
https://doi.org/10.1007/978-3-642-23008-0
https://doi.org/10.1007/978-3-642-23008-0 else, except
https://doi.org/10.1007/978-3-642-23008-0
https://doi.org/10.1007/

In [94]:
# Column labels applied to the scraped records
columns = [
    "PubYear",
    "Book Subtitle",
    "Book Title",
    "Filename",
    "Title",
    "Section",
    "Subsection",
    "DOI",
    "Citation count",
    "Authors & Affiliations",
]

# Build the metadata table from the collected records
publication_metadata_df = pd.DataFrame(data=documents_data, columns=columns)

In [96]:
# Persist the metadata table as a Parquet file; the relative path is resolved
# against the current working directory of the kernel.
publication_metadata_df.to_parquet("../../../data/metadata_LNCS.parquet")