# Sampling of RQs

In [1]:
from collections import defaultdict
import csv
import pprint
import random
import os

In [2]:
def parse_papers_csv(csv_file):
    """
    Parses a CSV file containing information about research papers and organizes it into a structured dictionary.

    Args:
        csv_file (str): The path to the CSV file containing the paper data. The CSV file should have the following columns:
            - 'Domain': The domain or category of the paper.
            - 'Paper Name': The title of the paper.
            - 'Authors': A comma-separated list of authors.
            - 'Distinguished': A string indicating whether the paper is distinguished ('True' or 'False').
            - 'URL' (optional): The paper's URL. When the column is absent or the cell is empty,
              the placeholder 'No URL available' is used instead.

    Returns:
        defaultdict: A nested dictionary where each domain key maps to a dictionary containing:
            - 'papers': A list of dictionaries, each representing a paper with the following keys:
                - 'name' (str): The title of the paper.
                - 'authors' (str): A comma-separated string of author names (whitespace around each name stripped).
                - 'distinguished' (bool): True only when the 'Distinguished' cell is exactly 'True'.
                - 'url' (str): The value of the 'URL' column, or 'No URL available'.

    Raises:
        KeyError: If one of the required columns is missing from the CSV header.
        OSError: If the file cannot be opened.
    """
    merged_dict_all = defaultdict(lambda: {'papers': []})

    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # Normalise "a ,b,  c" to "a, b, c".
            authors = ', '.join(author.strip() for author in row['Authors'].split(','))

            paper_entry = {
                'name': row['Paper Name'],
                'authors': authors,
                'distinguished': row['Distinguished'] == 'True',
                # The original referenced an undefined name `url` here. Read the
                # optional 'URL' column (the CSV writers in this file emit it) and
                # fall back to the documented placeholder.
                'url': row.get('URL') or 'No URL available',
            }

            merged_dict_all[row['Domain']]['papers'].append(paper_entry)

    return merged_dict_all

In [3]:
def print_papers_by_domain(merged_dict_all):
    """
    Dumps every domain and its papers to stdout for visual inspection.

    For each domain a "Domain: ..." header is printed, followed by the title
    and the distinguished flag of every paper in that domain, and finally a
    blank separator. Authors and URLs are intentionally not printed.

    Args:
        merged_dict_all (dict): Maps a domain name to a dictionary whose
            'papers' entry is a list of paper dictionaries. Only the 'name'
            and 'distinguished' keys of each paper are read.
    """
    for domain in merged_dict_all:
        print(f"Domain: {domain}")
        entries = merged_dict_all[domain]['papers']
        for entry in entries:
            print(f"  Paper Name: {entry['name']}")
            print(f"    Distinguished: {entry['distinguished']}")
        # Separator between two domains.
        print("\n")

In [4]:
# Load the full paper pool, grouped by domain, from the CSV at this relative path.
csv_file = 'csv_files/papers_by_domain.csv'
merged_dict_all = parse_papers_csv(csv_file)

In [6]:
#print_papers_by_domain(merged_dict_all)

Debugging:

In [20]:
def count_papers(paper_dict):
    """
    Returns how many papers are stored across all domains.

    Args:
        paper_dict (dict): Maps a domain name to a dictionary; the papers of a
            domain are expected under its "papers" key. Domains without that
            key contribute nothing to the count.

    Returns:
        int: Sum of the lengths of all "papers" lists.
    """
    return sum(
        len(entry["papers"])
        for entry in paper_dict.values()
        if "papers" in entry
    )



In [21]:
# Count the total number of papers
total_papers = count_papers(merged_dict_all)
print(total_papers)  

921


In [22]:
# Paper titles whose presence in the parsed pool is verified by the
# are_papers_in_dict(...) call further down. Titles are matched as exact strings.
papers_to_remove = [
    "Improving the Learning of Code Review Successive Tasks with Cross-Task Knowledge Distillation",
    "Can Machine Learning Pipelines Be Better Configured?",
    "Benchmarking Robustness of AI-enabled Multi-sensor Fusion Systems: Challenges and Opportunities",
    "An Empirical Study on Noisy Label Learning for Program Understanding",
    "EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning",
    "Harnessing Neuron Stability to Improve DNN Verification",
    "Towards Finding Accounting Errors in Smart Contracts",
    "Detecting Blocking Errors in Go Programs using Localized Abstract Interpretation",
    "NeuRI: Diversifying DNN Generation via Inductive Rule Inference",
    "Is unsafe an Achilles' Heel? A Comprehensive Study of Safety Requirements in Unsafe Rust Programming",
    "Generating Realistic and Diverse Tests for LiDAR-Based Perception Systems",
    "Provably Tightest Linear Approximation for Robustness Verification of Sigmoid-like Neural Networks",
    "Modularizing while Training: a New Paradigm for Modularizing DNN Models",
    "A Highly Scalable, Hybrid, Cross-Platform Timing Analysis Framework Providing Accurate Differential Throughput Estimation via Instruction-Level Tracing",
    "Code Search is All You Need? Improving Code Suggestions with Code Search",
    "GrACE: Language Models Meet Code Edits",
    "Using Deep Learning to Automatically Improve Code Readability",
    "When Neural Code Completion Models Size up the Situation: Attaining Cheaper and Faster Completion through Dynamic Model Inference",
    "Sibyl: Improving Software Engineering Tools with SMT Selection",
    "HyperAST: Enabling Efficient Analysis of Software Histories at Scale",
    "DeepScaler: Holistic Autoscaling for Microservices Based on Spatiotemporal GNN with Adaptive Graph Learning",
    "On Using GUI Interaction Data to Improve Text Retrieval-based Bug Localization",
    "Repeated Builds During Code Review: An Empirical Study of the OpenStack Community",
    "A Large-Scale Empirical Study on Semantic Versioning in Golang Ecosystem",
    "A Qualitative Study on the Implementation Design Decisions of Developers",
    "Analyzing and Debugging Normative Requirements via Satisfiability Checking",
    "Compiler Testing using Template Java Programs",
    "ReFAIR: Toward a Context-Aware Recommender for Fairness Requirements Engineering",
    "TRIAD: Automated Traceability Recovery based on Biterm-enhanced Deduction of Transitive Links among Artifacts",
    "BSHUNTER: Detecting and Tracing Defects of Bitcoin Scripts",
    "A Longitudinal Study of Student Contributions to OSS vs. OSS4SG with a Lightweight Intervention",
    "Do I Belong? Modeling Sense of Virtual Community Among Linux Kernel Contributors",
    "GenderMag Improves Discoverability in the Field, Especially for Women",
    "[Remote] Understanding the topics and challenges of GPU programming by classifying and analyzing Stack Overflow posts",
    "Towards Greener Yet Powerful Code Generation via Quantization: An Empirical Study",
    "How Are Paid and Volunteer Open Source Developers Different? A Study of the Rust Project",
    "Mate! Are You Really Aware? An Explainability-Guided Testing Framework for Robustness of Malware Detectors",
    "Lejacon: A Lightweight and Efficient Approach to Java Confidential Computing on SGX",
    "Static Application Security Testing (SAST) Tools for Smart Contracts: How Far Are We?",
    "Leveraging Practitioners' Feedback to Improve a Security Linter",
    "Automated Black-box Testing of Mass Assignment Vulnerabilities in RESTful APIs",
    "ViaLin: Path-Aware Dynamic Taint Analysis for Android"
]

In [23]:
def are_papers_in_dict(paper_dict, paper_names):
    """
    Reports, for every requested title, whether a paper with exactly that name is in the pool.

    Besides returning the lookup table, one "<title>: Found" or
    "<title>: Not Found" line (followed by a blank line) is printed per
    distinct title.

    Args:
        paper_dict (dict): Maps a domain name to a dictionary; papers are read
            from its "papers" list. Domains without that key are ignored.
        paper_names (list): Titles to look up.

    Returns:
        dict: Title -> True when a paper with that exact name exists, else False.
    """
    # Collect every known title once so each lookup is a set membership test.
    known_titles = set()
    for entry in paper_dict.values():
        if "papers" not in entry:
            continue
        known_titles.update(paper["name"] for paper in entry["papers"])

    result = {title: title in known_titles for title in paper_names}

    for title, exists in result.items():
        print(f"{title}: {'Found' if exists else 'Not Found'}\n")

    return result

# Verify that every title in papers_to_remove matches a paper in the parsed pool.
are_papers_in_dict(merged_dict_all, papers_to_remove)

Improving the Learning of Code Review Successive Tasks with Cross-Task Knowledge Distillation: Found

Can Machine Learning Pipelines Be Better Configured?: Found

Benchmarking Robustness of AI-enabled Multi-sensor Fusion Systems: Challenges and Opportunities: Found

An Empirical Study on Noisy Label Learning for Program Understanding: Found

EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning: Found

Harnessing Neuron Stability to Improve DNN Verification: Found

Towards Finding Accounting Errors in Smart Contracts: Found

Detecting Blocking Errors in Go Programs using Localized Abstract Interpretation: Found

NeuRI: Diversifying DNN Generation via Inductive Rule Inference: Found

Is unsafe an Achilles' Heel? A Comprehensive Study of Safety Requirements in Unsafe Rust Programming: Found

Generating Realistic and Diverse Tests for LiDAR-Based Perception Systems: Found

Provably Tightest Linear Approximation for Robustness Verification of Sigmoid-like 

{'Improving the Learning of Code Review Successive Tasks with Cross-Task Knowledge Distillation': True,
 'Can Machine Learning Pipelines Be Better Configured?': True,
 'Benchmarking Robustness of AI-enabled Multi-sensor Fusion Systems: Challenges and Opportunities': True,
 'An Empirical Study on Noisy Label Learning for Program Understanding': True,
 'EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning': True,
 'Harnessing Neuron Stability to Improve DNN Verification': True,
 'Towards Finding Accounting Errors in Smart Contracts': True,
 'Detecting Blocking Errors in Go Programs using Localized Abstract Interpretation': True,
 'NeuRI: Diversifying DNN Generation via Inductive Rule Inference': True,
 "Is unsafe an Achilles' Heel? A Comprehensive Study of Safety Requirements in Unsafe Rust Programming": True,
 'Generating Realistic and Diverse Tests for LiDAR-Based Perception Systems': True,
 'Provably Tightest Linear Approximation for Robustness Verif

# Sampling of the Papers

## Sample 6 papers per domain (3 distinguished and 3 not distinguished)

In [24]:
def sample_papers(merged_dict_all, per_group=3):
    """
    Randomly samples papers per domain, balanced between distinguished and non-distinguished papers.

    For every domain, up to `per_group` distinguished papers and up to
    `per_group` non-distinguished papers are drawn without replacement. With
    the default of 3 this yields up to 6 papers per domain.

    Args:
        merged_dict_all (dict): A dictionary where each domain maps to a dictionary containing a list of
            papers under the 'papers' key. Each paper needs a 'distinguished' entry.
        per_group (int, optional): Maximum number of papers to draw from each of the two groups
            (distinguished / non-distinguished) per domain. Defaults to 3.

    Returns:
        dict: A dictionary with the same domain keys, where each domain maps to a list holding the sampled
              distinguished papers first, followed by the sampled non-distinguished papers. A group with
              fewer than `per_group` papers contributes all of its papers.

    Raises:
        ValueError: If `per_group` is negative (raised by random.sample).
    """
    sampled_papers = {}

    for domain, domain_data in merged_dict_all.items():
        papers = domain_data['papers']

        # Split the domain's papers by their distinguished flag.
        distinguished_papers = [paper for paper in papers if paper['distinguished']]
        non_distinguished_papers = [paper for paper in papers if not paper['distinguished']]

        # random.sample raises if asked for more items than available, so cap
        # the request at the group size.
        sampled_distinguished = random.sample(
            distinguished_papers, min(per_group, len(distinguished_papers))
        )
        sampled_non_distinguished = random.sample(
            non_distinguished_papers, min(per_group, len(non_distinguished_papers))
        )

        sampled_papers[domain] = sampled_distinguished + sampled_non_distinguished

    return sampled_papers


In [25]:
def write_sampled_papers_to_csv(sampled_papers, filename='sampled_papers.csv'):
    """
    Saves sampled papers as a CSV file inside the 'csv_files' directory.

    The directory is created relative to the current working directory when
    it does not exist yet. An existing file of the same name is overwritten.

    Args:
        sampled_papers (dict): Maps a domain name to a list of paper
            dictionaries with the keys 'name', 'authors', 'distinguished' and 'url'.
        filename (str, optional): File name inside 'csv_files'. Defaults to 'sampled_papers.csv'.

    Output format:
        - First row: Domain, Paper Name, Authors, Distinguished, URL.
        - One row per sampled paper, in the iteration order of `sampled_papers`.
    """
    target_dir = 'csv_files'
    os.makedirs(target_dir, exist_ok=True)
    filepath = os.path.join(target_dir, filename)

    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        out = csv.writer(file)
        out.writerow(['Domain', 'Paper Name', 'Authors', 'Distinguished', 'URL'])
        # Rows are produced lazily so they are written one by one.
        out.writerows(
            [domain, entry['name'], entry['authors'], entry['distinguished'], entry['url']]
            for domain, entries in sampled_papers.items()
            for entry in entries
        )

In [26]:
"""This has become obsolete since the current Sampling strategy is the one that includes the removal of the already sampled papers"""

"""sampled_papers = sample_papers(merged_dict_all)
for domain, papers in sampled_papers.items():
    print(f"Domain: {domain}")
    for paper in papers:
        print(f"  Paper Name: {paper['name']}")
        #print(f"    Authors: {paper['authors']}")
        print(f"    Distinguished: {paper['distinguished']}")
        #print(f"    URL: {paper['url']}")
    print("\n")  """

'sampled_papers = sample_papers(merged_dict_all)\nfor domain, papers in sampled_papers.items():\n    print(f"Domain: {domain}")\n    for paper in papers:\n        print(f"  Paper Name: {paper[\'name\']}")\n        #print(f"    Authors: {paper[\'authors\']}")\n        print(f"    Distinguished: {paper[\'distinguished\']}")\n        #print(f"    URL: {paper[\'url\']}")\n    print("\n")  '

In [27]:
"""write_sampled_papers_to_csv(sampled_papers, 'sampled_papers.csv')"""

"write_sampled_papers_to_csv(sampled_papers, 'sampled_papers.csv')"

## Sampling including the removal of the already sampled papers
Remove the papers that have already been sampled and labeled.

The following papers have been removed from the pool to avoid the sampling of duplicate papers (after Sampling Round 3). The remaining papers are saved inside current_papers_pool.csv.

In [None]:
# Titles of papers that were already sampled and labeled in earlier rounds.
# NOTE(review): several titles occur more than once in this list (e.g.
# "Improving the Learning of Code Review Successive Tasks ...",
# "Compiler Testing using Template Java Programs", "Generative Type Inference
# for Python"), so len(papers_to_remove) overstates the number of distinct papers.
# NOTE(review): "AFour-Year Study of Student Contributions ..." below differs
# from "A Longitudinal Study of Student Contributions ..." used in the earlier
# list and again later in this one — confirm which spelling matches the CSV,
# since titles are compared as exact strings.
papers_to_remove = [
    "Improving the Learning of Code Review Successive Tasks with Cross-Task Knowledge Distillation",
    "Can Machine Learning Pipelines Be Better Configured?",
    "Benchmarking Robustness of AI-enabled Multi-sensor Fusion Systems: Challenges and Opportunities",
    "An Empirical Study on Noisy Label Learning for Program Understanding",
    "EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning",
    "Harnessing Neuron Stability to Improve DNN Verification",
    "Towards Finding Accounting Errors in Smart Contracts",
    "Detecting Blocking Errors in Go Programs using Localized Abstract Interpretation",
    "NeuRI: Diversifying DNN Generation via Inductive Rule Inference",
    "Is unsafe an Achilles' Heel? A Comprehensive Study of Safety Requirements in Unsafe Rust Programming",
    "Generating Realistic and Diverse Tests for LiDAR-Based Perception Systems",
    "Provably Tightest Linear Approximation for Robustness Verification of Sigmoid-like Neural Networks",
    "Modularizing while Training: a New Paradigm for Modularizing DNN Models",
    "A Highly Scalable, Hybrid, Cross-Platform Timing Analysis Framework Providing Accurate Differential Throughput Estimation via Instruction-Level Tracing",
    "Code Search is All You Need? Improving Code Suggestions with Code Search",
    "GrACE: Language Models Meet Code Edits",
    "Using Deep Learning to Automatically Improve Code Readability",
    "When Neural Code Completion Models Size up the Situation: Attaining Cheaper and Faster Completion through Dynamic Model Inference",
    "Sibyl: Improving Software Engineering Tools with SMT Selection",
    "HyperAST: Enabling Efficient Analysis of Software Histories at Scale",
    "DeepScaler: Holistic Autoscaling for Microservices Based on Spatiotemporal GNN with Adaptive Graph Learning",
    "On Using GUI Interaction Data to Improve Text Retrieval-based Bug Localization",
    "Repeated Builds During Code Review: An Empirical Study of the OpenStack Community",
    "A Large-Scale Empirical Study on Semantic Versioning in Golang Ecosystem",
    "A Qualitative Study on the Implementation Design Decisions of Developers",
    "Analyzing and Debugging Normative Requirements via Satisfiability Checking",
    "Compiler Testing using Template Java Programs",
    "ReFAIR: Toward a Context-Aware Recommender for Fairness Requirements Engineering",
    "TRIAD: Automated Traceability Recovery based on Biterm-enhanced Deduction of Transitive Links among Artifacts",
    "BSHUNTER: Detecting and Tracing Defects of Bitcoin Scripts",
    "AFour-Year Study of Student Contributions to OSS vs. OSS4SG with a Lightweight Intervention",
    "Do I Belong? Modeling Sense of Virtual Community Among Linux Kernel Contributors",
    "GenderMag Improves Discoverability in the Field, Especially for Women",
    "[Remote] Understanding the topics and challenges of GPU programming by classifying and analyzing Stack Overflow posts",
    "Towards Greener Yet Powerful Code Generation via Quantization: An Empirical Study",
    "How Are Paid and Volunteer Open Source Developers Different? A Study of the Rust Project",
    "Mate! Are You Really Aware? An Explainability-Guided Testing Framework for Robustness of Malware Detectors",
    "Lejacon: A Lightweight and Efficient Approach to Java Confidential Computing on SGX",
    "Static Application Security Testing (SAST) Tools for Smart Contracts: How Far Are We?",
    "Leveraging Practitioners' Feedback to Improve a Security Linter",
    "Automated Black-box Testing of Mass Assignment Vulnerabilities in RESTful APIs",
    "ViaLin: Path-Aware Dynamic Taint Analysis for Android",
    "Improving the Learning of Code Review Successive Tasks with Cross-Task Knowledge Distillation",
    "Boosting the Revealing of Detected Violations in Deep Learning Testing: A Diversity-Guided Method",
    "CARGO: AI-Guided Dependency Analysis for Migrating Monolithic Applications to Microservices Architecture",
    "Reusing Deep Neural Network Models through Model Re-engineering",
    "Using an LLM to Help With Code Understanding",
    "Flexible and Optimal Dependency Management via Max-SMT",
    "Speeding up SMT Solving via Compiler Optimization",
    "Mutation-based Fault Localization of Deep Neural Networks",
    "EndWatch: A Practical Method for Detecting Non-Termination in Real-World Software",
    "The Plastic Surgery Hypothesis in the Era of Large Language Models",
    "Accelerating Continuous Integration with Parallel Batch Testing",
    "Testing the Limits: Unusual Text Inputs Generation for Mobile App Crash Detection with Large Language Model",
    "Generative Type Inference for Python",
    "TraStrainer: Adaptive Sampling for Distributed Traces with System Runtime State",
    "FAIR: Flow Type-Aware Pre-Training of Compiler Intermediate Representations",
    "The Smelly Eight: An Empirical Study on the Prevalence of Code Smells in Quantum Computing",
    "SkCoder: A Sketch-based Approach for Automatic Code Generation",
    "CodeGen4Libs: A Two-Stage Approach for Library-Oriented Code Generation",
    "DeepScaler: Holistic Autoscaling for Microservices Based on Spatiotemporal GNN with Adaptive Graph Learning",
    "Hard to Read and Understand Pythonic Idioms? DeIdiom and Explain Them in Non-Idiomatic Equivalent Code",
    "Has My Release Disobeyed Semantic Versioning? Static Detection Based On Semantic Differencing",
    "Semantic GUI Scene Learning and Video Alignment for Detecting Duplicate Video-based Bug Reports",
    "Developer-Intent Driven Code Comment Generation",
    "Dependency-Induced Waste in Continuous Integration: An Empirical Study of Unused Dependencies in the NPM Ecosystem",
    "Analyzing and Debugging Normative Requirements via Satisfiability Checking",
    "Compiler Testing using Template Java Programs",
    "Detecting Smart Home Automation Application Interferences with Domain Knowledge",
    "Groundhog: An Automated Accessibility Crawler for Mobile Apps",
    "Automatically Detecting Visual Bugs in HTML5 <canvas> Games",
    "Generating Critical Test Scenarios for Autonomous Driving Systems via Influential Behavior Patterns",
    "Causal Relationships and Programming Outcomes: A Transcranial Magnetic Stimulation Experiment",
    "GenderMag Improves Discoverability in the Field, Especially for Women",
    "Property-Based Testing in Practice",
    "Understanding the Impact of APIs Behavioral Breaking Changes on Client Applications",
    "How does Simulation-based Testing for Self-driving Cars match Human Perception?",
    "A Case Study of Developer Bots: Motivations, Perceptions, and Challenges",
    "Compatible Remediation on Vulnerabilities from Third-Party Libraries for Java Projects",
    "[Remote] TransRacer: Function Dependence-Guided Transaction Race Detection for Smart Contracts",
    "Lejacon: A Lightweight and Efficient Approach to Java Confidential Computing on SGX",
    "FuzzSlice: Pruning False Positives in Static Analysis Warnings through Function-Level Fuzzing",
    "Does data sampling improve deep learning-based vulnerability detection? Yeas! and Nays!",
    "Fine-grained Commit-level Vulnerability Type Prediction By CWE Tree Structure",
    "LExecutor: Learning-Guided Execution",
    "Improving the Learning of Code Review Successive Tasks with Cross-Task Knowledge Distillation",
    "Benchmarking Robustness of AI-enabled Multi-sensor Fusion Systems: Challenges and Opportunities",
    "Compressing Pre-trained Models of Code into 3 MB",
    "PyEvolve: Automating Frequent Code Changes in Python ML Systems",
    "[Remote] CodeMark: Imperceptible Watermarking for Code Datasets against Neural Code Completion Models",
    "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    "Understanding and Detecting On-the-Fly Configuration Bugs",
    "ProveNFix: Temporal Property guided Program Repair",
    "PyTy: Repairing Static Type Errors in Python",
    "MuAkka: Mutation Testing for Actor Concurrency in Akka Using Real-World Bugs",
    "HTFuzz: Heap Operation Sequence Sensitive Fuzzing",
    "Generative Type Inference for Python",
    "FAIR: Flow Type-Aware Pre-Training of Compiler Intermediate Representations",
    "Recommending Analogical APIs via Knowledge Graph Embedding",
    "On the Usage of Continual Learning for Out-of-Distribution Generalization in Pre-trained Language Models of Code",
    "Studying and Understanding the Tradeoffs Between Generality and Reduction in Software Debloating",
    "Decomposing Software Verification Using Distributed Summary Synthesis",
    "Only diff is Not Enough: Generating Commit Messages Leveraging Reasoning and Action of Large Language Model",
    "Sibyl: Improving Software Engineering Tools with SMT Selection",
    "HyperAST: Enabling Efficient Analysis of Software Histories at Scale",
    "On Using GUI Interaction Data to Improve Text Retrieval-based Bug Localization",
    "UpCy: Safely Updating Outdated Dependencies",
    "OSSFP: Precise and Scalable C/C++ Third-Party Library Detection using Fingerprinting Functions",
    "Compiler Testing using Template Java Programs",
    "Analyzing and Debugging Normative Requirements via Satisfiability Checking",
    "Detecting Smart Home Automation Application Interferences with Domain Knowledge",
    "SmartCoCo: Checking Comment-code Inconsistency in Smart Contracts via Constraint Propagation and Binding",
    "TRIAD: Automated Traceability Recovery based on Biterm-enhanced Deduction of Transitive Links among Artifacts",
    "Testability Refactoring in Pull Requests: Patterns and Trends",
    "“STILL AROUND”: Experiences and Survival Strategies of Veteran Women Software Developers",
    "A Longitudinal Study of Student Contributions to OSS vs. OSS4SG with a Lightweight Intervention",
    "Property-Based Testing in Practice",
    "Semi-Automatic, Inline and Collaborative Web Page Code Curations",
    "AI-assisted Code Authoring at Scale: Fine-tuning, deploying, and mixed methods evaluation",
    "\"We Feel Like We're Winging It\": A Study on Navigating Open-Source Dependency Abandonment",
    "Static Application Security Testing (SAST) Tools for Smart Contracts: How Far Are We?",
    "Attention! Your Copied Data is Under Monitoring: A Systematic Study of Clipboard Usage in Android Apps",
    "Compatible Remediation on Vulnerabilities from Third-Party Libraries for Java Projects",
    "Silent Vulnerable Dependency Alert Prediction with Vulnerability Key Aspect Explanation",
    "An Empirical Study of Data Disruption by Ransomware Attacks",
    "TAINTMINI: Detecting Flow of Sensitive Data in Mini-Programs with Static Taint Analysis"
]

In [1]:
def remove_sampled_papers_from_pool(current_pool_dict, sampled_papers_dict):
    """
    Drops every already-sampled paper from the pool, in place.

    Papers are matched by exact name and only within the same domain. A
    domain whose paper list becomes empty is deleted from the pool. Domains
    that appear only in `sampled_papers_dict` are ignored. Nothing is returned.

    Args:
        current_pool_dict (dict): Pool to shrink; maps a domain name to a
            dictionary holding that domain's papers under the 'papers' key.
        sampled_papers_dict (dict): Maps a domain name to a list of sampled
            paper dictionaries (each with a 'name' key).
    """
    for domain, drawn in sampled_papers_dict.items():
        if domain in current_pool_dict:
            # Names drawn for this domain, as a set for fast membership tests.
            drawn_names = set()
            for entry in drawn:
                drawn_names.add(entry['name'])

            pool_entry = current_pool_dict[domain]
            remaining = [
                candidate for candidate in pool_entry['papers']
                if candidate['name'] not in drawn_names
            ]
            pool_entry['papers'] = remaining

            # An exhausted domain is removed from the pool altogether.
            if len(remaining) == 0:
                del current_pool_dict[domain]

In [30]:
def sample_remaining_papers(merged_dict_all, target_domains=('domain_dependability_security',)):
    """
    Returns all papers of the selected domains in random order, distinguished papers first.

    Only the domains that still need papers are processed. By default that is
    'domain_dependability_security'; pass `target_domains` to select others.
    Target domains that are not present in `merged_dict_all` are skipped.

    Args:
        merged_dict_all (dict): A dictionary where each domain maps to a dictionary containing a list of
            papers under the 'papers' key. Each paper needs a 'distinguished' entry.
        target_domains (iterable of str, optional): Names of the domains to sample from.
            Defaults to ('domain_dependability_security',).

    Returns:
        dict: A dictionary containing only the target domains found in `merged_dict_all`, where each domain
             maps to a list of ALL its papers: the distinguished papers in random order first, followed by
             the non-distinguished papers in random order. The input lists are not modified.
    """
    sampled_papers = {}

    for domain in target_domains:
        if domain not in merged_dict_all:
            continue

        all_papers = merged_dict_all[domain]['papers']
        # The comprehensions build new lists, so shuffling them leaves the pool untouched.
        distinguished_papers = [paper for paper in all_papers if paper['distinguished']]
        non_distinguished_papers = [paper for paper in all_papers if not paper['distinguished']]

        # Shuffle each group on its own so distinguished papers stay in front.
        random.shuffle(distinguished_papers)
        random.shuffle(non_distinguished_papers)

        sampled_papers[domain] = distinguished_papers + non_distinguished_papers

    return sampled_papers

In [3]:
def write_dict_to_csv(paper_dict, filename):
    """
    Dumps a domain -> papers dictionary to a CSV file.

    The file is overwritten if it exists. The first row is the header
    Domain, Paper Name, Authors, Distinguished, URL; every following row
    describes one paper. Domains without a "papers" key are skipped.

    Args:
        paper_dict (dict): Maps a domain name to a dictionary whose "papers"
            list holds paper dictionaries with the keys "name", "authors",
            "distinguished" and "url".
        filename (str): Path of the CSV file to write.
    """
    columns = ['Domain', 'Paper Name', 'Authors', 'Distinguished', 'URL']

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        out = csv.writer(file)
        out.writerow(columns)
        # Rows are generated lazily and written one after another.
        out.writerows(
            [domain, entry["name"], entry["authors"], entry["distinguished"], entry["url"]]
            for domain, data in paper_dict.items()
            if "papers" in data
            for entry in data["papers"]
        )

In [13]:
def count_papers_in_dict(paper_dict):
    """
    Adds up the sizes of all values of a domain -> papers mapping.

    Unlike a pool dictionary, the values here are expected to be the paper
    lists themselves (not dictionaries with a "papers" key).

    Args:
        paper_dict (dict): Maps a domain name to a sized collection of papers.

    Returns:
        int: Total number of papers over all domains.
    """
    return sum(len(entries) for entries in paper_dict.values())

In [28]:
# Load the pool of papers that are still available for sampling.
current_pool_csv_file = 'csv_files/current_papers_pool.csv'
current_papers_pool = parse_papers_csv(current_pool_csv_file)

Debugging:

In [29]:
# Pool size before this sampling round.
count_papers(current_papers_pool)

327

In [31]:
# Earlier rounds sampled from every domain; kept for reference only:
#sampled_papers = sample_papers(current_papers_pool)
#write_sampled_papers_to_csv(sampled_papers, 'sampled_papers.csv')

## After multiple rounds of sampling, only some domains are still missing papers,
## so sample only from the domains that sample_remaining_papers targets.
## NOTE(review): that function currently targets only 'domain_dependability_security',
## not two domains — confirm this is the intended selection.
sampled_papers = sample_remaining_papers(current_papers_pool)
# Written to csv_files/sampled_papers.csv (overwrites a previous round's file).
write_sampled_papers_to_csv(sampled_papers, 'sampled_papers.csv')

Debugging (count the number of the sampled papers which are to be removed from the pool):

In [32]:
# sampled_papers maps domain -> list of papers (no 'papers' key), hence
# count_papers_in_dict rather than count_papers.
count_papers_in_dict(sampled_papers)

68

In [33]:
# Drop the just-sampled papers from the in-memory pool. This mutates
# current_papers_pool in place; domains left without papers are removed.
remove_sampled_papers_from_pool(current_papers_pool, sampled_papers)

Debugging (count the number of papers that remain in the pool after the removal of the sampled papers):

In [34]:
# Pool size after the removal; expected to equal the earlier pool size minus
# the number of sampled papers.
count_papers(current_papers_pool)

259

In [35]:
# Persist the reduced pool. NOTE: this overwrites the same file the pool was
# loaded from, so re-running the sampling cells shrinks the stored pool again.
write_dict_to_csv(current_papers_pool, 'csv_files/current_papers_pool.csv')