# Sampling of the rated research for the expert validation of the research questions

In [233]:
import os
import csv
import random

In [234]:
def create_rated_papers_dict(folder_path):
    """
    Loads every rated-papers CSV file in a folder and groups the rows by domain.

    The files are read in sorted filename order. `os.listdir` returns directory
    entries in arbitrary order, so without sorting the order of the papers inside
    each domain list could differ between runs and machines.

    Parameters:
    folder_path (str): Path of the folder that contains the '.csv' files.
                       Entries without the '.csv' suffix are ignored.

    Returns:
    dict: Maps each value found in the 'Domain' column to a list of rows. Each row is a
          dictionary that keeps only the columns listed in `columns_to_include` that are
          present in the file it was read from.

    Raises:
    KeyError: If a row comes from a file that has no 'Domain' column.
    """
    data_dict = {}

    # List of columns to include in the output
    columns_to_include = [
        'Domain',
        'Knowledge-seeking vs. Eval',
        'Nerd factor/zu spezifisch',
        'Validation Nerd Factor',
        'Distinguished',
        'Bucket ID',
        'Paper Name',
        'Research Questions (max. 4)',
        'URL'
    ]

    # Sorted iteration keeps the resulting paper order deterministic
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
            for row in csv.DictReader(csvfile):
                # Keep the relevant columns only
                filtered_row = {col: row[col] for col in columns_to_include if col in row}

                # Group the row under its domain, creating the list on first use
                data_dict.setdefault(row['Domain'], []).append(filtered_row)

    return data_dict


In [235]:
def print_papers_per_domain(data_dict):
    """
    Prints all papers grouped by domain, one line per paper.

    Parameters:
    data_dict (dict): Maps domain names to lists of paper dictionaries. Every paper
                      must contain the key 'Paper Name'.
    """
    separator = "=" * 40

    for domain in data_dict:
        print(f"Domain: {domain}")
        print(separator)

        for entry in data_dict[domain]:
            # The paper name always leads the line; all remaining fields follow
            # in the order in which they are stored in the dictionary.
            parts = [f"Paper Name: {entry['Paper Name']}"]
            parts.extend(
                f"{field}: {content}"
                for field, content in entry.items()
                if field != 'Paper Name'
            )
            print(", ".join(parts))

        print("\n")


In [236]:
def filter_non_specific_papers(data_dict):
    """
    Keeps only the papers whose 'Bucket ID' starts with 'B'.

    Parameters:
    data_dict (dict): Maps domain names to lists of paper dictionaries.

    Returns:
    dict: A new dictionary with the same structure. Domains without any matching paper
          are left out. The paper dictionaries themselves are not copied.
    """
    filtered_data_dict = {}

    for domain, papers in data_dict.items():
        # `or ''` also covers a 'Bucket ID' that is present but None, which is what
        # csv.DictReader produces for rows that have fewer fields than the header.
        filtered_papers = [
            paper for paper in papers
            if (paper.get('Bucket ID') or '').startswith('B')
        ]

        # Only domains that still contain papers are added to the result
        if filtered_papers:
            filtered_data_dict[domain] = filtered_papers

    return filtered_data_dict


In [237]:
def filter_specific_papers(data_dict):
    """
    Keeps only the papers whose 'Validation Nerd Factor' is exactly '1' or '-1'.

    Parameters:
    data_dict (dict): Maps domain names to lists of paper dictionaries.

    Returns:
    dict: A new dictionary with the same structure. Domains without any matching paper
          are left out.
    """
    specific_ratings = ('1', '-1')
    result = {}

    for domain in data_dict:
        matches = []
        for entry in data_dict[domain]:
            if entry.get('Validation Nerd Factor') in specific_ratings:
                matches.append(entry)

        # Domains that end up empty are not part of the result
        if matches:
            result[domain] = matches

    return result


In [238]:
#Debugging
def print_paper_info(data):
    """
    Prints name, distinguished flag and bucket ID of every paper, grouped by domain.

    Parameters:
    data (dict): Maps domain names to lists of paper dictionaries. A missing field is
                 printed as 'N/A'.
    """
    shown_fields = ('Paper Name', 'Distinguished', 'Bucket ID')
    divider = '-' * 50

    for domain in data:
        print(f"Domain: {domain}")
        for entry in data[domain]:
            for field in shown_fields:
                print(f"  {field}: {entry.get(field, 'N/A')}")
        print(divider)

In [239]:
# Debugging
def print_sampled_papers(sampled_papers):
    """
    Prints one summary line per sampled paper; the research questions are left out.

    Parameters:
    sampled_papers (list): A list of sampled papers where each paper is a dictionary
                           that contains all fields named in `shown_fields`.
    """
    # Fields that appear on each line, in output order
    shown_fields = (
        'Domain',
        'Paper Name',
        'Bucket ID',
        'Distinguished',
        'Nerd factor/zu spezifisch',
        'Validation Nerd Factor',
    )

    if sampled_papers:
        for position, entry in enumerate(sampled_papers, start=1):
            summary = ", ".join(f"{field}: {entry[field]}" for field in shown_fields)
            print(f"Paper {position}: {summary}")
    else:
        # Nothing to list
        print("No papers were sampled.")

In [240]:
# Folder with the CSV files of the rated papers (relative path).
folder_path = 'csv_files/rated_papers'
# Load the rows of all CSV files in that folder, grouped by their 'Domain' column.
all_rated_papers = create_rated_papers_dict(folder_path)

In [241]:
# Papers whose 'Bucket ID' starts with 'B'.
non_specific_papers = filter_non_specific_papers(all_rated_papers)
# Papers whose 'Validation Nerd Factor' is '1' or '-1'.
specific_papers = filter_specific_papers(all_rated_papers)

## Sampling of non-specific papers

Non-specific papers are papers whose topics and research questions the authors consider easy to understand.

In [244]:
def sample_non_specific_papers(non_specific_papers, bucket_ids=None):
    """
    Picks up to three papers from each bucket, following a fixed priority order.

    The selection is deterministic: within each group the candidates are taken in the
    order in which they appear in `non_specific_papers`; no random choice is involved.
    - 2 non-distinguished papers: first from agreed papers, then from disagreed papers
    - 1 distinguished paper: prioritizing agreed papers, then disagreed papers,
      then non-distinguished agreed papers, finally non-distinguished disagreed papers

    A paper counts as "agreed" when both 'Nerd factor/zu spezifisch' and
    'Validation Nerd Factor' equal "0" after stripping whitespace; every other paper
    counts as "disagreed". 'Distinguished' is compared with the exact strings
    'TRUE' and 'FALSE'.

    Side effect: every selected paper is removed from the lists inside
    `non_specific_papers`, so calling the function again on the same dictionary
    returns a different selection.

    Parameters:
    non_specific_papers (dict): Dictionary of domain names to paper lists. Each paper
                                needs the keys 'Bucket ID' and 'Distinguished'.
    bucket_ids (iterable of str, optional): Bucket IDs to sample from, in this order.
                                Defaults to "B1" through "B14".

    Returns:
    list: Flat list of sampled papers (3 papers per bucket if available)
    """
    if bucket_ids is None:
        bucket_ids = [f"B{i}" for i in range(1, 15)]

    def has_agreed_ratings(paper):
        # `or ""` keeps a rating that is present but None from crashing on .strip()
        return ((paper.get("Nerd factor/zu spezifisch") or "").strip() == "0" and
                (paper.get("Validation Nerd Factor") or "").strip() == "0")

    def remove_paper(paper):
        # Removes the paper from the first domain list that contains it
        for papers in non_specific_papers.values():
            if paper in papers:
                papers.remove(paper)
                break

    sampled_papers = []

    for bucket_id in bucket_ids:
        bucket_papers = [
            paper for papers in non_specific_papers.values()
            for paper in papers if paper['Bucket ID'] == bucket_id
        ]
        if not bucket_papers:
            continue

        # Split papers by agreement of the two ratings
        papers_agreed = [p for p in bucket_papers if has_agreed_ratings(p)]
        papers_disagreed = [p for p in bucket_papers if not has_agreed_ratings(p)]

        # Two non-distinguished papers: agreed ones first, disagreed ones as fill-up
        selected = [p for p in papers_agreed if p['Distinguished'] == 'FALSE'][:2]
        if len(selected) < 2:
            needed = 2 - len(selected)
            fill_up = [p for p in papers_disagreed if p['Distinguished'] == 'FALSE']
            selected.extend(fill_up[:needed])

            if len(selected) < 2:
                print(f"Error: Could not find enough non-distinguished papers for bucket {bucket_id}")

        # Third paper: the first candidate of the first non-empty priority group.
        # Papers that were already selected above are never picked a second time.
        priority_groups = (
            (papers_agreed, 'TRUE'),
            (papers_disagreed, 'TRUE'),
            (papers_agreed, 'FALSE'),
            (papers_disagreed, 'FALSE'),
        )
        third_paper = None
        for group, distinguished in priority_groups:
            third_paper = next(
                (p for p in group
                 if p['Distinguished'] == distinguished and p not in selected),
                None,
            )
            if third_paper is not None:
                break

        if third_paper is None:
            print(f"Error: Could not find a suitable third paper for bucket {bucket_id}")

        # Remove sampled papers from the pool
        for paper in selected:
            remove_paper(paper)

        if third_paper is not None:
            remove_paper(third_paper)
            selected.append(third_paper)

        sampled_papers.extend(selected)

    return sampled_papers

In [245]:
# Note: the call removes the sampled papers from `non_specific_papers`. Re-running this
# cell without first re-running the filtering cell therefore yields a different sample.
sampled_non_specific_papers = sample_non_specific_papers(non_specific_papers)

Error: Could not find a suitable third paper for bucket B5


## Sampling of specific papers

Specific papers are papers which contain topics and research questions labeled as too specific by the authors.

In [248]:
def sample_papers_by_domain(specific_papers, rng=None):
    """
    Randomly selects one "Knowledge-seeking" and one "Evaluation" paper per domain,
    removing them from the original dataset.

    If a domain has no knowledge-seeking paper but more than one evaluation paper, a
    second evaluation paper is selected in its place. A domain can therefore contribute
    zero, one or two papers.

    Parameters:
    specific_papers (dict): A dictionary where keys are domain names and values are lists
                            of paper dictionaries. The papers have been labeled as too
                            specific. The lists are modified in place: every selected
                            paper is removed.
    rng (random.Random, optional): Source of randomness; any object with a `choice`
                            method works. Pass e.g. `random.Random(42)` for a reproducible
                            sample. Defaults to the global `random` module.

    Returns:
    list: A flat list of sampled papers; each one is a copy of the selected dictionary.
    """
    chooser = random if rng is None else rng
    sampled_papers = []

    for domain, papers in list(specific_papers.items()):
        knowledge_seeking_papers = [paper for paper in papers if paper.get("Knowledge-seeking vs. Eval") == "Knowledge-seeking"]
        evaluation_papers = [paper for paper in papers if paper.get("Knowledge-seeking vs. Eval") == "Evaluation"]

        if not knowledge_seeking_papers:
            print(f"No knowledge-seeking papers for domain: {domain}")
        if not evaluation_papers:
            print(f"No evaluation papers for domain: {domain}")

        def take(candidates):
            # Picks one candidate, removes it from the domain's paper list (this
            # modifies the original data) and from the candidate list, returns a copy.
            picked = chooser.choice(candidates)
            papers.remove(picked)
            candidates.remove(picked)
            return picked.copy()

        if knowledge_seeking_papers:
            sampled_papers.append(take(knowledge_seeking_papers))
        elif len(evaluation_papers) > 1:
            # No knowledge-seeking paper available: use an additional evaluation paper
            sampled_papers.append(take(evaluation_papers))
            print(f"Knowledge-seeking paper replaced with evaluation paper for domain: {domain}")

        if evaluation_papers:
            sampled_papers.append(take(evaluation_papers))

    return sampled_papers  # Returns a flat list of sampled papers


In [249]:
# Seed the global random number generator so that the random selection below is
# repeatable on every run (given the same input papers in the same order).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
# Note: the call removes the sampled papers from `specific_papers`.
sampled_specific_papers = sample_papers_by_domain(specific_papers)

No knowledge-seeking papers for domain: Evolution
No knowledge-seeking papers for domain: Requirements and modeling
Knowledge-seeking paper replaced with evaluation paper for domain: Requirements and modeling
No knowledge-seeking papers for domain: Human and social aspects


In [252]:
# Debugging: print one summary line per sampled specific paper.
print_sampled_papers(sampled_specific_papers)

Paper 1: Domain: AI and software engineering, Auto-coding, Paper Name: Towards Understanding Fairness and its Composition in Ensemble Machine Learning, Bucket ID: -, Distinguished: FALSE, Nerd factor/zu spezifisch: -1, Validation Nerd Factor: 1
Paper 2: Domain: AI and software engineering, Auto-coding, Paper Name: An Empirical Study on Noisy Label Learning for Program Understanding, Bucket ID: -, Distinguished: FALSE, Nerd factor/zu spezifisch: -1 (NLL?), Validation Nerd Factor: 1
Paper 3: Domain: Testing and analysis, Paper Name: AST-Probe: Recovering abstract syntax trees from hidden representations of pre-trained language models, Bucket ID: -, Distinguished: FALSE, Nerd factor/zu spezifisch: -1, Validation Nerd Factor: 1
Paper 4: Domain: Testing and analysis, Paper Name: Mutation-based Fault Localization of Deep Neural Networks, Bucket ID: -, Distinguished: TRUE, Nerd factor/zu spezifisch: -1
(end-to end fault loc.?, mutation selection konnte ich mir einigermaßen herleiten --> hab a

## Save the sampled papers in a CSV file

In [254]:
def update_paper_urls(sampled_non_specific_papers, csv_file_path="csv_files/session and papers all conferences.csv"):
    """
    Updates the 'URL' field in the `sampled_non_specific_papers` list with the URLs from the provided CSV file.

    Papers are matched to CSV rows by name; leading and trailing whitespace is ignored on
    both sides. Papers without a match keep their current 'URL' value and a warning is
    printed.

    Parameters:
    sampled_non_specific_papers (list): A list of sampled papers. Each paper is a dictionary containing various fields, including 'Paper Name' as the identifier. The dictionaries are modified in place.
    csv_file_path (str): The path to the CSV file that contains paper names (column 'name') and corresponding URLs (column 'url').

    Returns:
    list: The same `sampled_non_specific_papers` list with updated 'URL' fields for the corresponding papers.
    """

    # Map each (whitespace-stripped) paper name to its URL
    paper_urls = {}

    # newline='' is what the csv module expects for file objects, so that line breaks
    # embedded in quoted fields are handled by the csv reader itself.
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            paper_name = (row.get('name') or '').strip()
            url = row.get('url')
            if paper_name and url:
                paper_urls[paper_name] = url

    for paper in sampled_non_specific_papers:
        raw_name = paper.get('Paper Name')  # The identifier used for matching
        paper_name = (raw_name or '').strip()

        if not paper_name:  # Skip papers without a valid 'Paper Name'
            print("Warning: Skipping paper with missing or invalid 'Paper Name'.")
            continue

        if paper_name in paper_urls:
            paper['URL'] = paper_urls[paper_name]
        else:
            print(f"Warning: URL not found for paper with name '{raw_name}'")

    return sampled_non_specific_papers

In [255]:
def combine_and_sort_papers(sampled_specific_papers, sampled_non_specific_papers):
    """
    Merges both lists of sampled papers into one new list ordered by domain.

    Papers without a 'Domain' key are sorted as if their domain were the empty string.
    Papers of the same domain keep their relative order (specific papers first).

    Parameters:
    sampled_specific_papers (list): A list of sampled specific papers.
    sampled_non_specific_papers (list): A list of sampled non-specific papers.

    Returns:
    list: A combined and sorted list of papers by domain.
    """
    def domain_of(entry):
        return entry.get("Domain", "")

    merged = sampled_specific_papers + sampled_non_specific_papers
    merged.sort(key=domain_of)
    return merged


In [256]:
def write_papers_to_csv(papers_list, output_csv_file="csv_files/papers_expert_study/sampled_papers_expert_val.csv"):
    """
    Writes the list of sampled papers into a CSV file.

    The header is the union of the keys of all papers, in the order in which the keys
    first appear. A paper that lacks one of the columns gets an empty cell there. The
    folder of the output file is created if it does not exist yet.

    Parameters:
    papers_list (list): A list of sampled papers. Each paper is a dictionary containing various fields.
    output_csv_file (str): The path where the CSV file will be saved. Default is
                           'csv_files/papers_expert_study/sampled_papers_expert_val.csv'.

    Returns:
    None
    """
    # Check if the list is empty
    if not papers_list:
        print("Warning: The list of papers is empty. No CSV file will be created.")
        return

    # Collect the columns from every paper, not only the first one: csv.DictWriter raises
    # a ValueError for a row that contains a key which is missing from `fieldnames`.
    fieldnames = list(dict.fromkeys(key for paper in papers_list for key in paper))

    # Make sure the target folder exists before opening the file for writing
    output_dir = os.path.dirname(output_csv_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_csv_file, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(papers_list)

    print(f"CSV file '{output_csv_file}' has been created successfully.")

Include all sampled papers in one list.

In [257]:
# One flat list with the specific and the non-specific sample, ordered by domain.
all_sampled_papers = combine_and_sort_papers(sampled_specific_papers, sampled_non_specific_papers)

Add URLs to the sampled papers.

In [260]:
# update_paper_urls sets the 'URL' field of the paper dictionaries in place and returns
# the same list, so the name is simply bound to that list again.
all_sampled_papers = update_paper_urls(all_sampled_papers, "csv_files/session and papers all conferences.csv")

Write the papers into a CSV file.

In [261]:
# No path is passed, so the file is written to the default `output_csv_file` of
# write_papers_to_csv.
write_papers_to_csv(all_sampled_papers)

CSV file 'csv_files/papers_expert_val.csv' has been created successfully.
