# Processing the papers for the expert validation study

Automatic creation of short introduction texts for the papers.

## Add missing URLs

Start by importing a file containing missing URLs for the papers. The missing URLs were manually added.

In [31]:
import csv
import os
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import json
import requests

In [8]:
def extract_csv_to_list(folderpath, filename):
    """
    Extracts data from a CSV file and returns a list of dictionaries.

    The research questions are read from the "Research Questions (max. 4)"
    column. If that column is absent, the "Research Questions" column is used
    instead. This allows files whose header already carries the shortened key
    (i.e. files created by writing this function's output back to CSV) to be
    loaded again.

    Parameters:
    folderpath (str): The path to the folder containing the CSV file.
    filename (str): The name of the CSV file.

    Returns:
    list[dict]: A list of dictionaries where each dictionary represents a row in the CSV file.
                Each dictionary contains the keys:
                - "Paper Name"
                - "Research Questions"
                - "URL"
                - "Abstract" (if present in the CSV file)

    Raises:
    KeyError: If "Paper Name", "URL" or both research question columns are missing.
    """
    csv_file_path = os.path.join(folderpath, filename)
    # Accepted header names for the research questions, in order of preference
    rq_columns = ("Research Questions (max. 4)", "Research Questions")
    extracted_data = []

    with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)

        for row in reader:
            # Use the first header variant that exists; fall back to the
            # original name so a missing column still raises the same KeyError
            rq_column = next((name for name in rq_columns if name in row), rq_columns[0])
            subset = {
                "Paper Name": row["Paper Name"],
                "Research Questions": row[rq_column],
                "URL": row["URL"],
            }
            # Check if "Abstract" column exists and add it
            if "Abstract" in row:
                subset["Abstract"] = row["Abstract"]

            extracted_data.append(subset)

    return extracted_data


In [35]:
# Folder and name of the CSV file that contains the manually added URLs
folderpath = "csv_files/papers_expert_study"
csv_file = "updated_url_papers_expert_val.csv"
# Load the papers as a list of dictionaries (one dictionary per CSV row)
sampled_papers_list = extract_csv_to_list(folderpath, csv_file)

## Add abstracts

In [None]:
def get_abstracts(url):
    """
    Scrapes abstracts from the given URL using predefined CSS selectors.

    This function sends an HTTP request to the specified URL, parses the HTML content
    using BeautifulSoup, and extracts the abstract text based on a list of possible
    selectors. The selectors target different website structures, including ArXiv,
    IEEE Xplore, and ACM Digital Library.

    Every failure is reported with a printed message and results in an empty
    list, so that a single unreachable page does not abort a batch run.

    Parameters:
    url (str): The URL of the research paper or article from which to extract the abstract.

    Returns:
    list[str]: A list of extracted abstracts (as strings) without repeated entries.
               If the page cannot be fetched or no abstracts are found, an empty
               list is returned.
    """
    headers = {"User-Agent": "Mozilla/5.0"}  # Avoid blocking by the server

    try:
        # The timeout keeps one unresponsive server from stalling the whole run
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as error:
        # Covers connection errors, timeouts and malformed or empty URLs
        print(f"Failed to fetch page: {error}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page, status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Adjust these selectors based on the website structure
    possible_selectors = [
        'section[role="doc-abstract"] div[role="paragraph"]',  # for dl.acm.org
        'div.abstract-text div.col-12 div.u-mb-1 div[xplmathjax]',  # ieeexplore.ieee.org
        'span.abstract.mathjax',
        'div.abstract',
        'p.abstract',
        'section.abstract',
        'span.abstract',
        'blockquote.abstract.mathjax'  # for arxiv.org
    ]

    abstracts = []
    for selector in possible_selectors:
        for elem in soup.select(selector):
            if selector == 'blockquote.abstract.mathjax':
                # Join the text pieces inside the blockquote and drop the "Abstract:" descriptor
                abstract_text = " ".join(
                    text for text in elem.stripped_strings if text != "Abstract:"
                )
            else:
                abstract_text = elem.get_text(strip=True)

            # Several selectors can match the same element (e.g. 'span.abstract.mathjax'
            # and 'span.abstract'), so keep each extracted text only once
            if abstract_text not in abstracts:
                abstracts.append(abstract_text)

    if not abstracts:
        print("Could not scrape abstract")

    return abstracts

In [39]:
def add_abstracts_to_papers(sampled_papers_list):
    """
    Scrapes an abstract for every paper and stores it under the key "Abstract".

    Each dictionary is updated in place with the value that get_abstracts
    returns for its "URL" entry.

    Parameters:
    sampled_papers_list (list[dict]): Papers; every dictionary needs a "URL" key.

    Returns:
    list[dict]: The same list object that was passed in.
    """
    for entry in sampled_papers_list:
        entry["Abstract"] = get_abstracts(entry["URL"])

    return sampled_papers_list

In [29]:
def write_to_csv(data, filename):
    """
    Writes a list of dictionaries to a CSV file inside the 'csv_files/papers_expert_study' folder.

    The header is the union of the keys of all dictionaries, in the order in
    which the keys first appear. A dictionary that lacks one of the keys gets
    an empty cell in that column. If data is empty, no file is written.

    Parameters:
    data (list of dict): A list of dictionaries where each dictionary represents a row in the CSV file.
    filename (str): The name of the CSV file to be created.

    Returns:
    None: The function writes the CSV file and prints a confirmation message upon success.
    """
    folder = 'csv_files/papers_expert_study'

    # Ensure the target directory exists
    os.makedirs(folder, exist_ok=True)

    # Construct the full file path by joining the folder name with the filename
    file_path = os.path.join(folder, filename)

    # Without rows there is no header to derive, so skip writing altogether
    if not data:
        print(f"No data to write to {file_path}")
        return

    # Collect the keys of all rows (not only the first one) so that rows with
    # additional keys do not make DictWriter raise a ValueError
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"Data successfully written to {file_path}")

Scrape the abstract:

In [40]:
# Scrape an abstract for every sampled paper; the dictionaries are updated in place
updated_papers_list = add_abstracts_to_papers(sampled_papers_list)

Failed to fetch page, status code: 418
Could not scrape abstract
Could not scrape abstract
Failed to fetch page, status code: 418
Failed to fetch page, status code: 418
Failed to fetch page, status code: 418
Failed to fetch page, status code: 418
Could not scrape abstract
Failed to fetch page, status code: 418
Failed to fetch page, status code: 418
Failed to fetch page, status code: 418
Failed to fetch page, status code: 418
Could not scrape abstract
Failed to fetch page, status code: 418
Could not scrape abstract
Failed to fetch page, status code: 418
Could not scrape abstract
Could not scrape abstract
Could not scrape abstract
Could not scrape abstract
Could not scrape abstract
Could not scrape abstract
Paper Name: Towards Understanding Fairness and its Composition in Ensemble Machine Learning


KeyError: 'Research Questions (max. 4)'

Create an artifact:

In [52]:
# Save the papers together with their scraped abstracts as a CSV artifact
filename = 'abstracts_papers_expert_val.csv'
write_to_csv(updated_papers_list, filename)

Data successfully written to csv_files/papers_expert_study/abstracts_papers_expert_val.csv


## Create introductory texts for each paper

First, import the manually completed file that contains the missing abstracts:

In [9]:
# Folder that holds the CSV files of the expert validation study
folderpath = "csv_files/papers_expert_study"


In [10]:
# CSV file in which the missing abstracts were completed manually
csv_file = "completed_abstracts_expert_val.csv"

# Load the papers as a list of dictionaries (one dictionary per CSV row)
papers_list = extract_csv_to_list(folderpath, csv_file)

In [None]:
# System prompt for generating the German introductory text of a paper.
# NOTE: the leading whitespace of the continuation lines is part of the string
# and is therefore sent to the model as well.
SYSTEM_PROMPT = """Write a concise introductory text in German for a paper based on the following abstract. 
                The text should provide a brief overview of the main themes and context of the paper without delving into specific methods, 
                results, or contributions. It should be written in an impersonal, third-person perspective (avoid using first-person plural like 
                'we' or 'our'). The tone should remain formal and academic. The introduction should be no longer than 2-3 sentences and should 
                strictly avoid mentioning the paper's contributions, findings, or implications. Focus solely on the broader subject matter and 
                relevance of the research field."""

In [22]:
def post_process(result):
    """
    Parses a JSON document and returns the decoded Python object.

    Parameters:
    result (str): The JSON text to decode.

    Returns:
    The object produced by json.loads (e.g. a dict for a JSON object).
    """
    return json.loads(result)

Send the request to the OpenAI API:

In [None]:
def create_abstract(paper_abstract, SYSTEM_PROMPT):
    """
    Requests an introductory text for a paper abstract from the chat completion endpoint.

    The request is sent as a plain HTTP POST; the API key is read from the
    environment variable OPENAI_API_KEY.

    Parameters:
    paper_abstract (str): The abstract of the paper; sent as the user message.
    SYSTEM_PROMPT (str): The instructions for the model; sent as the system message.

    Returns:
    dict | str: The decoded JSON response if the server answers with status 200,
                otherwise a string of the form "Error <status code>: <response text>".
                Callers have to handle both cases before indexing into the result.

    Raises:
    requests.RequestException: If the request cannot be completed (e.g. connection
                               error or timeout).
    """
    data = {
        "model": "gpt-4o-2024-11-20",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": paper_abstract}
        ]
    }

    url = "http://172.26.92.115/chat_completion"
    api_key = os.environ.get("OPENAI_API_KEY")

    # Send request; the timeout prevents an unresponsive server from blocking forever
    response = requests.post(
        url,
        headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
        json=data,
        timeout=120
    )

    # Check response
    if response.status_code == 200:
        return response.json()  # Return full JSON response
    return f"Error {response.status_code}: {response.text}"


In [24]:
#Debugging
paper_abstract = """[Continuous integration at scale is costly but essential to software development. Various test optimization techniques including test selection and prioritization aim to reduce the cost. Test batching is an effective alternative, but overlooked technique. This study evaluates parallelization’s effect by adjusting machine count for test batching and introduces two novel approaches.
We establish TestAll as a baseline to study the impact of parallelism and machine count on feedback time. We re-evaluate ConstantBatching and introduce DynamicBatching, which adapts batch size based on the remaining changes in the queue. We also propose TestCaseBatching, enabling new builds to join a batch before full test execution, thus speeding up continuous integration. Our evaluations utilize Ericsson’s results and 276 million test outcomes from open-source Chrome, assessing feedback time, execution reduction, and providing access to Chrome project scripts and data.
The results reveal a non-linear impact of test parallelization on feedback time, as each test delay compounds across the entire test queue. ConstantBatching, with a batch size of 4, utilizes up to 72% fewer machines to maintain the actual average feedback time and provides a constant execution reduction of up to 75%. Similarly, DynamicBatching maintains the actual average feedback time with up to 91% fewer machines and exhibits variable execution reduction of up to 99%. TestCaseBatching holds the line of the actual average feedback time with up to 81% fewer machines and demonstrates variable execution reduction of up to 67%. We recommend practitioners use DynamicBatching and TestCaseBatching to reduce the required testing machines efficiently. Analyzing historical data to find the threshold where adding more machines has minimal impact on feedback time is also crucial for resource-effective testing.]"""

# Request the introductory text once. On failure, report the cause and re-raise
# so that the cell fails visibly instead of sending a second, identical request.
try:
    result = create_abstract(paper_abstract, SYSTEM_PROMPT)
except Exception as e:
    print(f"Exception at paper: {e}")
    raise

In [25]:
# Debugging: show only the generated text of the first choice in the response
content = result["choices"][0]["message"]["content"]
print(content)

Die effektive Gestaltung von Continuous Integration (CI) ist ein zentraler Aspekt der Softwareentwicklung, jedoch mit hohen Kosten verbunden. Testoptimierungsmethoden wie Testauswahl, -priorisierung und -bündelung zielen darauf ab, diese Kosten zu senken, insbesondere durch den gezielten Einsatz von Parallelisierung und Ressourcennutzung. Der vorliegende Beitrag untersucht maßgebliche Strategien zur Testbündelung und deren Auswirkungen auf die Feedbackzeiten sowie den Ressourceneinsatz in groß angelegten CI-Systemen.


In [None]:
# Debugging: show the complete response, including its metadata
print(result)

{'choices': [{'finish_reason': 'stop', 'index': 0, 'logprobs': None, 'message': {'content': 'Die effektive Gestaltung von Continuous Integration (CI) ist ein zentraler Aspekt der Softwareentwicklung, jedoch mit hohen Kosten verbunden. Testoptimierungsmethoden wie Testauswahl, -priorisierung und -bündelung zielen darauf ab, diese Kosten zu senken, insbesondere durch den gezielten Einsatz von Parallelisierung und Ressourcennutzung. Der vorliegende Beitrag untersucht maßgebliche Strategien zur Testbündelung und deren Auswirkungen auf die Feedbackzeiten sowie den Ressourceneinsatz in groß angelegten CI-Systemen.', 'function_call': None, 'refusal': None, 'role': 'assistant', 'tool_calls': None}}], 'created': 1739792957, 'id': 'chatcmpl-B1tufWDEToqHnQ2e2ol8Uwi6TuBVP', 'model': 'gpt-4o-2024-11-20', 'object': 'chat.completion', 'service_tier': 'default', 'system_fingerprint': 'fp_a82c03666f', 'usage': {'completion_tokens': 112, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'au

In [None]:
# TO-Do: Finish and use this function after the testing and debugging is finished
def get_introductory_text(papers_list):
    """
    Iterates through papers_list, adds the key 'Introductory text' to each paper
    that has an 'Abstract', and populates it using the create_abstract function.

    Papers without an 'Abstract' key are left unchanged. If the text for a paper
    cannot be created, the error is printed and processing continues with the
    next paper.

    :param papers_list: List of dictionaries containing paper details.
    :return: The same list; its dictionaries are updated in place.
    """
    for paper in papers_list:
        if 'Abstract' not in paper:
            continue

        try:
            # Request a text for this paper's own abstract
            result = create_abstract(paper['Abstract'], SYSTEM_PROMPT)
            # An error string instead of a response dict fails here and is reported below
            paper['Introductory text'] = result["choices"][0]["message"]["content"]
        except Exception as e:
            # Keep the batch running; identify the paper by its name if available
            print(f"Exception at {paper.get('Paper Name', paper)}: {e}")

    return papers_list

In [None]:
# Remove comment to call the function and create introductory texts for each entry
# get_introductory_text(papers_list)

## Extract the properties for the research questions

In [None]:
# System prompt for extracting the measured properties per research question.
# NOTE: the leading whitespace of the continuation lines is part of the string
# and is therefore sent to the model as well.
SYSTEM_PROMPT_PROPERRTIES = """Using the provided abstract and research question, identify the key properties measured to answer each research question. 
                            Examples of such properties include accuracy, usability, reliability, performance, portability, CPU usage, and runtime. 
                            List the relevant properties for each research question separately, ensuring they align with the details in the abstract."""

In [None]:
def extract_properies(paper_abstract, paper_rqs, SYSTEM_PROMPT_PROPERRTIES):
    """
    Requests the properties measured for a paper's research questions from the
    chat completion endpoint.

    The request is sent as a plain HTTP POST; the API key is read from the
    environment variable OPENAI_API_KEY.

    Parameters:
    paper_abstract (str): The abstract of the paper; sent as the first user message.
    paper_rqs (str): The research questions of the paper; sent as the second user message.
    SYSTEM_PROMPT_PROPERRTIES (str): The instructions for the model; sent as the system message.

    Returns:
    dict | str: The decoded JSON response if the server answers with status 200,
                otherwise a string of the form "Error <status code>: <response text>".
                Callers have to handle both cases before indexing into the result.

    Raises:
    requests.RequestException: If the request cannot be completed (e.g. connection
                               error or timeout).
    """
    data = {
        "model": "gpt-4o-2024-11-20",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT_PROPERRTIES},
            {"role": "user", "content": paper_abstract},
            {"role": "user", "content": paper_rqs}
        ]
    }

    url = "http://172.26.92.115/chat_completion"
    api_key = os.environ.get("OPENAI_API_KEY")

    # Send request; the timeout prevents an unresponsive server from blocking forever
    response = requests.post(
        url,
        headers={'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'},
        json=data,
        timeout=120
    )

    # Check response
    if response.status_code == 200:
        return response.json()  # Return full JSON response
    return f"Error {response.status_code}: {response.text}"


In [None]:
# TO-Do: Finish and use this function after the testing and debugging is finished
def get_properties(papers_list):
    """
    Iterates through papers_list, adds the key 'Properties' to each paper that
    has both an 'Abstract' and 'Research Questions', and populates it using the
    extract_properies function.

    Papers that lack one of the two keys are left unchanged. If the properties
    for a paper cannot be extracted, the error is printed and processing
    continues with the next paper.

    :param papers_list: List of dictionaries containing paper details.
    :return: The same list; its dictionaries are updated in place.
    """
    for paper in papers_list:
        # Both keys must be tested explicitly: "'Abstract' and 'Research Questions' in paper"
        # would only test the second key
        if 'Abstract' not in paper or 'Research Questions' not in paper:
            continue

        try:
            result = extract_properies(
                paper['Abstract'], paper['Research Questions'], SYSTEM_PROMPT_PROPERRTIES
            )
            # An error string instead of a response dict fails here and is reported below
            paper['Properties'] = result["choices"][0]["message"]["content"]
        except Exception as e:
            # Keep the batch running; identify the paper by its name if available
            print(f"Exception at {paper.get('Paper Name', paper)}: {e}")

    return papers_list

## Save everything in a csv file

In [None]:
def write_to_csv(data, filename):
    """
    Writes a list of dictionaries to a CSV file inside the 'csv_files/papers_expert_study' folder.

    The header is the union of the keys of all dictionaries, in the order in
    which the keys first appear. A dictionary that lacks one of the keys gets
    an empty cell in that column. If data is empty, no file is written.

    Parameters:
    data (list of dict): A list of dictionaries where each dictionary represents a row in the CSV file.
    filename (str): The name of the CSV file to be created.

    Returns:
    None: The function writes the CSV file and prints a confirmation message upon success.
    """
    folder = 'csv_files/papers_expert_study'

    # Ensure the target directory exists
    os.makedirs(folder, exist_ok=True)

    # Construct the full file path by joining the folder name with the filename
    file_path = os.path.join(folder, filename)

    # Without rows there is no header to derive, so skip writing altogether
    if not data:
        print(f"No data to write to {file_path}")
        return

    # Collect the keys of all rows (not only the first one) so that rows with
    # additional keys do not make DictWriter raise a ValueError
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"Data successfully written to {file_path}")
