# Sampling of RQs

In [26]:
from collections import defaultdict
import csv
import pprint
import random
import os

In [None]:
def parse_papers_csv(csv_file):
    """
    Parse a CSV file of research papers into a dictionary grouped by domain.

    Args:
        csv_file (str): Path to the CSV file. The file must start with a header row containing the columns:
            - 'Domain': The domain or category of the paper.
            - 'Paper Name': The title of the paper.
            - 'Authors': A comma-separated list of authors.
            - 'Distinguished': 'True' marks a distinguished paper. The value is compared
              case-insensitively and surrounding whitespace is ignored; any other value
              (including a missing one on a short row) is treated as not distinguished.

    Returns:
        defaultdict: Maps each domain to a dictionary with a single key:
            - 'papers': A list of dictionaries, one per CSV row, in file order, with the keys:
                - 'name' (str): The title of the paper.
                - 'authors' (str): The author names joined with ', ' after stripping whitespace.
                - 'distinguished' (bool): Whether the paper is distinguished.
                - 'url' (str): Always the placeholder 'No URL available'.
        Because the result is a defaultdict, indexing it with an unknown domain creates
        an empty {'papers': []} entry; use ``in`` or ``.get`` to probe without inserting.

    Raises:
        KeyError: If one of the required columns is missing from the header.
    """
    merged_dict_all = defaultdict(lambda: {'papers': []})

    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            domain = row['Domain']
            paper_name = row['Paper Name']

            # Normalise spacing around each author name ("A ,B" -> "A, B").
            authors = ', '.join(author.strip() for author in row['Authors'].split(','))

            # Spreadsheet exports commonly write TRUE/true or pad the cell, and
            # DictReader yields None for a column missing from a short row; an
            # exact == 'True' comparison would silently misclassify those.
            distinguished = (row['Distinguished'] or '').strip().lower() == 'true'

            merged_dict_all[domain]['papers'].append({
                'name': paper_name,
                'authors': authors,
                'distinguished': distinguished,
                'url': 'No URL available',
            })

    return merged_dict_all

In [None]:
def print_papers_by_domain(merged_dict_all):
    """
    Print every paper, grouped under its domain, as an indented plain-text listing.

    For each domain a "Domain: ..." line is printed, followed by the title and the
    distinguished flag of each of its papers, followed by two empty lines.

    Args:
        merged_dict_all (dict): Maps each domain to a dictionary whose 'papers' entry is a
            list of paper dictionaries. Only these keys of a paper are read:
                - 'name' (str): The title of the paper.
                - 'distinguished' (bool): Whether the paper is distinguished.
            Other keys (such as 'authors' and 'url') are ignored by this listing.
    """
    for domain_name, bucket in merged_dict_all.items():
        print(f"Domain: {domain_name}")

        for entry in bucket['papers']:
            title_line = f"  Paper Name: {entry['name']}"
            print(title_line)
            flag_line = f"    Distinguished: {entry['distinguished']}"
            print(flag_line)

        # print() appends its own newline, so this leaves two blank lines between domains.
        print("\n")

In [23]:
# Load all papers, grouped by domain. The path is relative to the current
# working directory, so the notebook must be run from the directory that
# contains csv_files/.
csv_file = 'csv_files/papers_by_domain.csv'
merged_dict_all = parse_papers_csv(csv_file)

In [24]:
# List every parsed paper (title and distinguished flag) under its domain.
print_papers_by_domain(merged_dict_all)

Domain: domain_ai_se
  Paper Name: One Adapter for All Programming Languages? Adapter Tuning for Multilingual Tasks in Software Engineering
    Distinguished: False
  Paper Name: CCRep: Learning Code Change Representations via Pre-Trained Code Model and Query Back
    Distinguished: False
  Paper Name: Keeping Pace with Ever-Increasing Data: Towards Continual Learning of Code Intelligence Models
    Distinguished: False
  Paper Name: Automating Code-Related Tasks Through Transformers: The Impact of Pre-training
    Distinguished: False
  Paper Name: Log Parsing with Prompt-based Few-shot Learning
    Distinguished: False
  Paper Name: Retrieval-Based Prompt Selection for Code-Related Few-Shot Learning
    Distinguished: False
  Paper Name: An Empirical Study of Pre-Trained Model Reuse in the Hugging Face Deep Learning Model Registry
    Distinguished: False
  Paper Name: ContraBERT: Enhancing Code Pre-trained Models via Contrastive Learning
    Distinguished: False
  Paper Name: Are Hu

# Sampling of the Papers

## Sample up to 6 papers per domain (3 distinguished and 3 non-distinguished, where available)

In [None]:
def sample_papers(merged_dict_all, per_group=3, seed=None):
    """
    Randomly sample papers per domain, balanced between distinguished and non-distinguished.

    Args:
        merged_dict_all (dict): Maps each domain to a dictionary whose 'papers' entry is a
            list of paper dictionaries; each paper must have a 'distinguished' key.
        per_group (int, optional): Maximum number of papers to draw from each of the two
            groups (distinguished / non-distinguished) in every domain. Defaults to 3,
            i.e. up to 6 papers per domain.
        seed (optional): If given, sampling uses a private ``random.Random(seed)`` so the
            same input and seed always yield the same sample. If None (the default), the
            module-level ``random`` generator is used.

    Returns:
        dict: Same domain keys as the input, in the same order. Each domain maps to a list
              holding the sampled distinguished papers followed by the sampled
              non-distinguished papers. A group with fewer than ``per_group`` papers
              contributes all of its papers.

    Raises:
        ValueError: If ``per_group`` is negative.
    """
    if per_group < 0:
        raise ValueError("per_group must be non-negative")

    # A dedicated generator keeps a seeded run reproducible without touching the
    # global random state; without a seed, fall back to the shared module functions.
    rng = random if seed is None else random.Random(seed)

    sampled_papers = {}

    for domain, domain_data in merged_dict_all.items():
        # Partition the domain's papers in a single pass.
        distinguished_papers = []
        non_distinguished_papers = []
        for paper in domain_data['papers']:
            if paper['distinguished']:
                distinguished_papers.append(paper)
            else:
                non_distinguished_papers.append(paper)

        # Cap the sample size at the group size: random.sample raises if asked
        # for more items than the population holds.
        sampled_distinguished = rng.sample(
            distinguished_papers, min(per_group, len(distinguished_papers))
        )
        sampled_non_distinguished = rng.sample(
            non_distinguished_papers, min(per_group, len(non_distinguished_papers))
        )

        sampled_papers[domain] = sampled_distinguished + sampled_non_distinguished

    return sampled_papers


In [None]:
def write_sampled_papers_to_csv(sampled_papers, filename='sampled_papers.csv', output_dir='csv_files'):
    """
    Write sampled papers to a CSV file, one row per paper.

    Args:
        sampled_papers (dict): Maps each domain to a list of paper dictionaries with the keys
            'name', 'authors', 'distinguished' and 'url'.
        filename (str, optional): Name of the CSV file to write. Defaults to 'sampled_papers.csv'.
        output_dir (str, optional): Directory the file is written into; it is created if it
            does not exist. Defaults to 'csv_files'.

    Behavior:
        - Overwrites any existing file at ``output_dir/filename``.
        - Writes the header row 'Domain', 'Paper Name', 'Authors', 'Distinguished', 'URL'.
        - Writes each paper as a row in that column order, domain by domain.

    Raises:
        KeyError: If a paper dictionary lacks one of the expected keys.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)

    header = ['Domain', 'Paper Name', 'Authors', 'Distinguished', 'URL']

    # newline='' lets the csv module control line endings itself.
    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)

        for domain, papers in sampled_papers.items():
            writer.writerows(
                [domain, paper['name'], paper['authors'], paper['distinguished'], paper['url']]
                for paper in papers
            )

In [20]:
# Draw the per-domain sample and echo it for inspection. The result is kept in
# the notebook-level name `sampled_papers`. Sampling is random, so re-running
# this cell draws a new sample.
sampled_papers = sample_papers(merged_dict_all)
# Unlike merged_dict_all, each domain here maps directly to a list of papers
# (there is no intermediate 'papers' key).
for domain, papers in sampled_papers.items():
    print(f"Domain: {domain}")
    for paper in papers:
        print(f"  Paper Name: {paper['name']}")
        #print(f"    Authors: {paper['authors']}")
        print(f"    Distinguished: {paper['distinguished']}")
        #print(f"    URL: {paper['url']}")
    print("\n")  

Domain: domain_ai_se
  Paper Name: Benchmarking Robustness of AI-enabled Multi-sensor Fusion Systems: Challenges and Opportunities
    Distinguished: True
  Paper Name: Can Machine Learning Pipelines Be Better Configured?
    Distinguished: True
  Paper Name: CARGO: AI-Guided Dependency Analysis for Migrating Monolithic Applications to Microservices Architecture
    Distinguished: True
  Paper Name: Uncovering the Causes of Emotions in Software Developer Communication Using Zero-shot LLMs
    Distinguished: False
  Paper Name: Natural Is The Best: Model-Agnostic Code Simplification for Pre-trained Large Language Models
    Distinguished: False
  Paper Name: Patching Weak Convolutional Neural Network Models through Modularization and Composition
    Distinguished: False


Domain: domain_testing_analysis
  Paper Name: Mutation-based Fault Localization of Deep Neural Networks
    Distinguished: True
  Paper Name: PHYFU: Fuzzing Modern Physics Simulation Engines
    Distinguished: True
  P

In [28]:
# Save the current contents of `sampled_papers` as csv_files/sampled_papers.csv,
# overwriting any earlier file of that name.
write_sampled_papers_to_csv(sampled_papers, 'sampled_papers.csv')