In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv
import json
import PyPDF2

In [None]:
import os
import json
import PyPDF2

# Function to extract PDFs from a folder
def extract_pdfs_from_folder(pdf_folder):
    """Return the paths of the PDF files located directly inside ``pdf_folder``.

    Args:
        pdf_folder: Directory to scan. Sub-directories are not searched.

    Returns:
        A list of paths (``pdf_folder`` joined with each file name), sorted by
        file name so repeated runs process the papers in the same order.

    Raises:
        OSError: If ``pdf_folder`` cannot be listed (for example, it does not exist).
    """
    pdf_files = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(pdf_folder)):
        file_path = os.path.join(pdf_folder, file_name)
        # Compare the extension case-insensitively so "paper.PDF" is not missed,
        # and skip directories that merely happen to be named "*.pdf".
        if file_name.lower().endswith('.pdf') and os.path.isfile(file_path):
            pdf_files.append(file_path)
    return pdf_files

# Function to extract the title from the PDF (first few lines of the first page)
def extract_title_from_pdf(pdf_text):
    """Guess a paper's title: the first line of ``pdf_text`` that is not blank.

    The chosen line is returned with its surrounding whitespace removed. When
    every line is blank (or the text is empty) the placeholder
    "Unknown Title" is returned instead.
    """
    stripped_lines = (raw_line.strip() for raw_line in pdf_text.splitlines())
    # next() yields the first non-empty candidate, or the fallback if none exists
    return next((candidate for candidate in stripped_lines if candidate), "Unknown Title")

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Value handed to ``PyPDF2.PdfReader`` (the notebook passes a
            file path).

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        The string is empty when no page yielded text. ``None`` is returned
        when the PDF could not be read or parsed; the error is printed.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = []
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next page.
        return "\n".join(page_texts)
    except KeyError as e:
        print(f"Error processing {pdf_file}: {e}")
        return None  # Return None if there's an issue with the PDF
    except Exception as e:
        print(f"Unexpected error processing {pdf_file}: {e}")
        return None  # Handle any other exceptions

# Directory that holds the PDF files to process
pdf_folder = r"/path/to/pdf/folder"


# Build one {"title", "content"} record for every PDF that yields text
papers = []
pdf_files = extract_pdfs_from_folder(pdf_folder)
for pdf_file in pdf_files:
    text = extract_text_from_pdf(pdf_file)
    if not text:
        # Both None and an empty string are treated as a failed extraction
        print(f"Failed to extract text from: {pdf_file}")
        continue
    title = extract_title_from_pdf(text)  # Title is guessed from the extracted text
    papers.append({"title": title, "content": text})
    print(f"Extracted text and title from: {pdf_file}")

# Persist the records as JSONL: one JSON object per line
with open('extracted_test_papers_new_NEWEST.jsonl', 'w', encoding='utf-8') as f:
    for paper in papers:
        # Encoding with errors='ignore' drops characters UTF-8 cannot encode
        # (lone surrogates) before the line is written
        safe_text = json.dumps(paper, ensure_ascii=False).encode('utf-8', errors='ignore').decode('utf-8')
        f.write(f"{safe_text}\n")

print("Extracted text from all PDFs has been saved")


In [None]:
pip install tiktoken


In [None]:
def load_papers_from_jsonl(file_path):
    """Load one JSON object per line from a JSONL file.

    Blank lines are ignored. Lines that are not valid JSON are skipped and
    reported with their line number. File-level problems are reported on
    stdout rather than raised.

    Args:
        file_path: Path of the JSONL file to read (decoded as UTF-8).

    Returns:
        A list with the decoded value of every valid line, in file order.
        The list is empty when the file is missing, and partial when reading
        failed midway.
    """
    papers = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    continue  # A blank line is not a record and not an error
                try:
                    papers.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping line {line_number} due to JSON decoding error: {e}")
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")
    else:
        # Report success only when the file was actually read to the end
        print(f"Loaded {len(papers)} papers from {file_path}.")
    return papers

# JSONL file written by the extraction step
jsonl_file_path = "extracted_test_papers_new_NEWEST.jsonl"

# Read every stored paper back into memory
papers = load_papers_from_jsonl(jsonl_file_path)

# Optional preview: only a handful of papers, and only the start of each one,
# so the cell output stays readable
PREVIEW_PAPER_COUNT = 5
PREVIEW_CHAR_COUNT = 500
for i, paper in enumerate(papers[:PREVIEW_PAPER_COUNT]):
    print(f"Paper {i+1}:")
    print(f"Title: {paper.get('title', 'No title provided')}")
    print("Content:")
    print(paper.get('content', 'No content provided')[:PREVIEW_CHAR_COUNT])
    print("-" * 50)  # Visual divider between papers


In [None]:
import json
from transformers import GPT2Tokenizer

def load_papers_from_jsonl(file_path):
    """Load one JSON object per line from a JSONL file.

    Blank lines are ignored and malformed lines are skipped individually, so a
    single bad line no longer discards every paper that follows it.

    Args:
        file_path: Path of the JSONL file to read (decoded as UTF-8).

    Returns:
        A list with the decoded value of every valid line, in file order.
        The list is empty when the file cannot be opened; the error is printed
        instead of raised.
    """
    papers = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    continue  # A blank line is not a record and not an error
                try:
                    papers.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping line {line_number} due to JSON decoding error: {e}")
    except Exception as e:
        print(f"Error loading or processing file: {e}")
    return papers

def calculate_total_tokens(papers):
    """Count GPT-2 tokens in the ``content`` of every paper.

    A per-paper line with the title and token count is printed as a side effect.

    Args:
        papers: Iterable of dicts. ``content`` is the text that gets tokenized;
            ``title`` is used only for the printed progress line. Both keys
            are optional.

    Returns:
        The sum of the token counts over all papers (0 for an empty input).
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # Loading GPT-2 tokenizer
    total_tokens = 0
    for paper in papers:
        # A missing or null content counts as empty text instead of crashing the tokenizer
        content = paper.get('content') or ''
        tokens = tokenizer(content, add_special_tokens=False)['input_ids']  # Tokenize without adding special tokens
        total_tokens += len(tokens)
        # .get keeps a record without a title from aborting the whole count with KeyError
        print(f"Paper: {paper.get('title', 'No title provided')} - Tokens: {len(tokens)}")
    return total_tokens

# JSONL file holding the extracted papers
jsonl_file_path = "extracted_test_papers_new_NEWEST.jsonl"

# Read the papers back in, then count the tokens across all of them and report the sum
papers = load_papers_from_jsonl(jsonl_file_path)
total_tokens = calculate_total_tokens(papers)
print(f"Total number of tokens in the dataset: {total_tokens}")


In [None]:

import os
import json
from openai import OpenAI
from dotenv import load_dotenv



# Load environment variables from the local "api_key.env" file, then build the
# OpenAI client from the OPENAI_API_KEY variable so the key is not hardcoded here.
load_dotenv("api_key.env")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ACM Computing Classification System (CCS) taxonomy as used by this notebook.
# Shape: a list of {"high_level_domain": str, "subdomains": [...]} dicts, where
# each subdomain entry is {"subdomain": str, "examples": [str, ...]}.
# NOTE: a few example strings occur under more than one subdomain ("Reliability",
# "Denial-of-service attacks", "Firewalls", "Collaborative and social computing
# systems and tools"), so a flat dict keyed by example can keep only one parent.
acm_ccs_structure = [
    {
        "high_level_domain": "General and reference",
        "subdomains": [
            {
                "subdomain": "Document types",
                "examples": [
                    "Surveys and overviews", "Reference works", "General conference proceedings",
                    "Biographies", "General literature", "Computing standards, RFCs and guidelines"
                ]
            },
            {
                "subdomain": "Cross-computing tools and techniques",
                "examples": [
                    "Reliability", "Empirical studies", "Measurement", "Metrics",
                    "Evaluation", "Experimentation", "Estimation", "Design",
                    "Performance", "Validation", "Verification"
                ]
            }
        ]
    },
    {
        "high_level_domain": "Computer systems organization",
        "subdomains": [
            {"subdomain": "Architectures", "examples": ["Serial architectures", "Parallel architectures", "Distributed architectures", "Other architectures"]},
            {"subdomain": "Embedded and cyber-physical systems", "examples": ["Sensor networks", "Robotics", "Sensors and actuators", "System on a chip", "Embedded systems"]},
            {"subdomain": "Real-time systems", "examples": ["Real-time operating systems", "Real-time languages", "Real-time system specification", "Real-time system architecture"]},
            {"subdomain": "Dependable and fault-tolerant systems and networks", "examples": ["Reliability", "Availability", "Maintainability and maintenance", "Processors and memory architectures", "Secondary storage organization", "Redundancy", "Fault-tolerant network topologies"]}
        ]
    },
    {
        "high_level_domain": "Networks",
        "subdomains": [
            {"subdomain": "Network architectures", "examples": ["Network design principles", "Programming interfaces"]},
            {"subdomain": "Network protocols", "examples": ["Network protocol design", "Protocol correctness", "Link-layer protocols", "Network layer protocols", "Transport protocols", "Session protocols", "Presentation protocols", "Application layer protocols", "OAM protocols", "Cross-layer protocols", "Network File System (NFS) protocol"]},
            {"subdomain": "Network components", "examples": ["Intermediate nodes", "Physical links", "Middle boxes / network appliances", "End nodes", "Wireless access points, base stations and infrastructure", "Logical nodes"]},
            {"subdomain": "Network algorithms", "examples": ["Data path algorithms", "Control path algorithms", "Network economics"]},
            {"subdomain": "Network performance evaluation", "examples": ["Network performance modeling", "Network simulations", "Network experimentation", "Network performance analysis", "Network measurement"]},
            {"subdomain": "Network properties", "examples": ["Network security", "Network range", "Network structure", "Network dynamics", "Network reliability", "Network mobility", "Network manageability", "Network privacy and anonymity"]},
            {"subdomain": "Network services", "examples": ["Naming and addressing", "Cloud computing", "Location based services", "Programmable networks", "In-network processing", "Network management", "Network monitoring"]},
            {"subdomain": "Network types", "examples": ["Network on chip", "Home networks", "Storage area networks", "Data center networks", "Wired access networks", "Cyber-physical networks", "Mobile networks", "Overlay and other logical network structures", "Wireless access networks", "Ad hoc networks", "Public Internet", "Packet-switching networks"]}
        ]
    },
    {
        "high_level_domain": "Software and its engineering",
        "subdomains": [
            {"subdomain": "Software organization and properties", "examples": ["Contextual software domains", "Software system structures", "Software functional properties", "Extra-functional properties"]},
            {"subdomain": "Software notations and tools", "examples": ["General programming languages", "Formal language definitions", "Compilers", "Context specific languages", "System description languages", "Development frameworks and environments", "Software configuration management and version control systems", "Software libraries and repositories", "Software maintenance tools"]},
            {"subdomain": "Software creation and management", "examples": ["Designing software", "Software development process management", "Software development techniques", "Software verification and validation", "Software post-development issues", "Collaboration in software development", "Search-based software engineering"]}
        ]
    },
    {
        "high_level_domain": "Information systems",
        "subdomains": [
            {"subdomain": "Data management systems", "examples": ["Database design and models", "Data structures", "Database management system engines", "Query languages", "Database administration", "Information integration", "Middleware for databases"]},
            {"subdomain": "Information storage systems", "examples": ["Information storage technologies", "Record storage systems", "Storage replication", "Storage architectures", "Storage management"]},
            {"subdomain": "Information systems applications", "examples": ["Enterprise information systems", "Collaborative and social computing systems and tools", "Spatial-temporal systems", "Decision support systems", "Mobile information processing systems", "Process control systems", "Multimedia information systems", "Data mining", "Digital libraries and archives", "Computational advertising", "Computing platforms"]},
            {"subdomain": "World Wide Web", "examples": ["Web searching and information discovery", "Online advertising", "Web mining", "Web applications", "Web interfaces", "Web services", "Web data description languages"]},
            {"subdomain": "Information retrieval", "examples": ["Document representation", "Information retrieval query processing", "Users and interactive retrieval", "Retrieval models and ranking", "Retrieval tasks and goals", "Evaluation of retrieval results", "Search engine architectures and scalability", "Specialized information retrieval"]}
        ]
    },
    {
        "high_level_domain": "Security and privacy",
        "subdomains": [
            {"subdomain": "Cryptography", "examples": ["Key management", "Public key (asymmetric) techniques", "Symmetric cryptography and hash functions", "Cryptanalysis and other attacks", 
            "Information-theoretic techniques", "Mathematical foundations of cryptography"]},
            {"subdomain": "Formal methods and theory of security", "examples": ["Trust frameworks", "Security requirements", "Formal security models", "Logic and verification"]},
            {"subdomain": "Security services", "examples": ["Authentication", "Access control", "Pseudonymity, anonymity and untraceability", "Privacy-preserving protocols", 
            "Digital rights management", "Authorization"]},
            {"subdomain": "Intrusion/anomaly detection and malware mitigation", "examples": ["Malware and its mitigation", "Intrusion detection systems", "Social engineering attacks"]},
            {"subdomain": "Security in hardware", "examples": ["Tamper-proof and tamper-resistant designs", "Embedded systems security", "Hardware security implementation", 
            "Hardware attacks and countermeasures", "Hardware reverse engineering"]},
            {"subdomain": "Systems security", "examples": ["Operating systems security", "Browser security", "Distributed systems security", "Information flow control", 
            "Denial-of-service attacks", "Firewalls", "Vulnerability management", "File system security"]},
            {"subdomain": "Network security", "examples": ["Security protocols", "Web protocol security", "Mobile and wireless security", "Denial-of-service attacks", "Firewalls"]},
            {"subdomain": "Database and storage security", "examples": ["Data anonymization and sanitization", "Management and querying of encrypted data", 
            "Information accountability and usage control", "Database activity monitoring"]},
            {"subdomain": "Software and application security", "examples": ["Software security engineering", "Web application security", "Social network security and privacy",
            "Domain-specific security and privacy architectures", "Software reverse engineering"]},
            {"subdomain": "Human and societal aspects of security and privacy", "examples": ["Economics of security and privacy", "Social aspects of security and privacy",
            "Privacy protections", "Usability in security and privacy"]}
        ]
    },
    {
        "high_level_domain": "Human-centered computing",
        "subdomains": [
            {"subdomain": "Human computer interaction (HCI)", "examples": ["HCI design and evaluation methods", "Interaction paradigms", "Interaction devices", "HCI theory, concepts and models", "Interaction techniques", "Interactive systems and tools", "Empirical studies in HCI"]},
            {"subdomain": "Interaction design", "examples": ["Interaction design process and methods", "Interaction design theory, concepts and paradigms", "Empirical studies in interaction design", "Systems and tools for interaction design"]},
            {"subdomain": "Collaborative and social computing", "examples": ["Collaborative and social computing theory, concepts and paradigms", "Collaborative and social computing design and evaluation methods", "Collaborative and social computing systems and tools", "Empirical studies in collaborative and social computing", "Collaborative and social computing devices"]},
            {"subdomain": "Ubiquitous and mobile computing", "examples": ["Ubiquitous and mobile computing theory, concepts and paradigms", "Ubiquitous and mobile computing systems and tools", "Ubiquitous and mobile devices", "Ubiquitous and mobile computing design and evaluation methods", "Empirical studies in ubiquitous and mobile computing"]},
            {"subdomain": "Visualization", "examples": ["Visualization techniques", "Visualization application domains", "Visualization systems and tools", "Visualization theory, concepts and paradigms", "Empirical studies in visualization", "Visualization design and evaluation methods"]},
            {"subdomain": "Accessibility", "examples": ["Accessibility theory, concepts and paradigms", "Empirical studies in accessibility", "Accessibility design and evaluation methods", "Accessibility technologies", "Accessibility systems and tools"]}
        ]
    },
    {
        "high_level_domain": "Computing methodologies",
        "subdomains": [
            {"subdomain": "Symbolic and algebraic manipulation", "examples": ["Symbolic and algebraic algorithms", "Computer algebra systems", "Representation of mathematical objects"]},
            {"subdomain": "Parallel computing methodologies", "examples": ["Parallel algorithms", "Parallel programming languages"]},
            {"subdomain": "Artificial intelligence", "examples": ["Natural language processing", "Knowledge representation and reasoning", "Planning and scheduling", "Search methodologies", "Control methods", "Philosophical/theoretical foundations of artificial intelligence", "Distributed artificial intelligence", "Computer vision"]},
            {"subdomain": "Machine learning", "examples": ["Learning paradigms", "Learning settings", "Machine learning approaches", "Machine learning algorithms", "Cross-validation"]},
            {"subdomain": "Modeling and simulation", "examples": ["Model development and analysis", "Simulation theory", "Simulation types and techniques", "Simulation support systems", "Simulation evaluation"]},
            {"subdomain": "Computer graphics", "examples": ["Animation", "Rendering", "Image manipulation", "Graphics systems and interfaces", "Image compression", "Shape modeling"]},
            {"subdomain": "Distributed computing methodologies", "examples": ["Distributed algorithms", "Distributed programming languages"]},
            {"subdomain": "Concurrent computing methodologies", "examples": ["Concurrent programming languages", "Concurrent algorithms"]}
        ]
    },
    {
        "high_level_domain": "Social and professional topics",
        "subdomains": [
            {"subdomain": "Professional topics", "examples": ["Computing industry", "Management of computing and information systems", "History of computing", "Computing education", "Computing and business", "Computing profession"]},
            {"subdomain": "Computing / technology policy", "examples": ["Intellectual property", "Privacy policies", "Censorship", "Surveillance", "Commerce policy", "Network access control", "Computer crime", "Government technology policy", "Medical information policy"]},
            {"subdomain": "User characteristics", "examples": ["Race and ethnicity", "Religious orientation", "Gender", "Sexual orientation", "People with disabilities", "Geographic characteristics", "Cultural characteristics", "Age"]}
        ]
    }
]

# Pretty-printed (4-space indent) JSON text rendering of the structure above
acm_json = json.dumps(acm_ccs_structure, indent=4)


In [None]:
def build_sub_subdomain_map(acm_structure):
    """Flatten an ACM CCS structure into a lookup keyed by lower-cased example.

    Args:
        acm_structure: Iterable of dicts with "high_level_domain" and
            "subdomains"; each subdomain dict has "subdomain" and, optionally,
            "examples".

    Returns:
        A dict mapping each example (lower-cased) to
        {"high_level_domain": ..., "subdomain": ...}. When the same example
        appears more than once, the last occurrence wins.
    """
    flat = {}
    for entry in acm_structure:
        top_name = entry["high_level_domain"]
        for child in entry["subdomains"]:
            child_name = child["subdomain"]
            # update() consumes the pairs in order, so later duplicates overwrite earlier ones
            flat.update(
                (label.lower(), {"high_level_domain": top_name, "subdomain": child_name})
                for label in child.get("examples", [])
            )
    return flat


In [None]:
import json

def generate_system_prompt(paper, task, dataset_names):
    title = paper['title']
    content = paper['content']

    # Extract title first and then reuse in other tasks
    if task == "title":
        return f"""
        You are tasked with extracting the title of the provided cybersecurity paper.
        
        Guidelines and Rules:
        \t1. The title is often at the top of the first page.
        \t2. Extract the title in its entirety.
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "title": "Title of the paper here"
        }}

        Your response: """

    elif task == "authors_name":
        return f"""
        You are tasked with extracting the authors' names from the provided cybersecurity paper.
        
        Guidelines and Rules:
        \t1. The authors' names are usually listed directly below the title.
        \t2. Extract all the authors, separated by commas.
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "authors": "Comma-separated list of authors' names here"
        }}

        Your response: """

    elif task == "conference_name":
        return f"""
        You are tasked with extracting the conference name where the paper was presented.
        
        Guidelines and Rules:
        \t1. The conference name is usually found at the top or bottom of the first page.
        \t2. Use the short form (USS, NDSS, ACSAC, SP, CCS) if applicable.
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "conference": "Short form of conference name (USS, NDSS, ACSAC, SP, CCS)"
        }}

        Your response: """

    elif task == "published_year":
        return f"""
        You are tasked with extracting the year of publication from the provided cybersecurity paper.
        
        Guidelines and Rules:
        \t1. The year of publication is usually found near the conference name or at the bottom of the first page.
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "year": "Year of publication here"
        }}
        
        Your response: """

    elif task == "school_institution":
        return f"""
        You are tasked with extracting the school or institution name(s) associated with the authors of the provided cybersecurity paper.
        
        Guidelines and Rules:
        \t1. The institution or school is often listed near the authors' names, either directly below or in the footer of the first page.
        \t2. Extract all institutions mentioned, separated by commas if there are multiple.
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "school": "Comma-separated list of schools/institutions"
        }}
        
        Your response: """
        
    elif task == "domain":
        acm_json = json.dumps(acm_ccs_structure, indent=4)

        subdomain_map = {}
        for domain in acm_ccs_structure:
            for sub in domain["subdomains"]:
                for example in sub.get("examples", []):
                    subdomain_map[example] = {
                        "high_level_domain": domain["high_level_domain"],
                        "subdomain": sub["subdomain"]
                    }
        flat_map_json = json.dumps(subdomain_map, indent=4)

        return f"""
        You are tasked with identifying the **ACM Computing Classification System (CCS)** research domains for the following cybersecurity paper titled: \"{title}\".

        Your job is to return the correct **\"high_level_domain\"** and its corresponding **\"subdomain\"**, based on the paper’s content and the ACM CCS structure
        provided below.
        Guideline and Rules:

        t1. **Strictly follow the ACM CCS structure** below.
        \t- For each domain, you MUST use:
        \t- **high_level_domain**: One of the official ACM categories (e.g., \"Security and privacy\").
        \t- **subdomain**: Choose ONLY from the subdomains under the high-level domain as defined in the ACM structure.
        t2.  Never invent sub-subdomains. The ACM CCS structure you are given ends at the subdomain level.
        \t- Even if the paper mentions a sub-subdomain (like \"Usability in security and privacy\"), your job is to **map it to the correct subdomain**
        from the ACM structure.
        \t- Use the mapping guide provided below under <SUB_SUBDOMAIN_MAPPING> to assist with this mapping task.
        \t3. You are allowed to return **multiple (high_level_domain, subdomain)** pairs if the paper covers more than one domain.
        \t4. Special Rule:
        \t- If the conference is **NDSS** **Usenix** or **IEEE S&P**, always set **high_level_domain** as \"Security and privacy\" (if relevant) 
        and identify the correct **subdomain** based on content. 
        \t- For papers from **NDSS** determine the **subdomain** by reading the paper content, typically falling under "Network security" or "Systems security"
        depending on the focus. 

        Json Output Examples:
        Example #1 : For the paper titled **"(Un)informed Consent: Studying GDPR Consent Notices in the Field"** with the following CCS concepts:

        • Security and privacy → Usability in security and privacy
        • Human-centered computing → Empirical studies in interaction design
        • Social and professional topics → Government technology policy

        You should respond with:

        ```json
        [
          {{
            "high_level_domain": "Security and privacy",
            "subdomain": "Human and societal aspects of security and privacy"
          }},
        {{
            "high_level_domain": "Human-centered computing",
            "subdomain": "Interaction design"
          }},
          {{
          "high_level_domain": "Social and professional topics",
            "subdomain": "Computing / technology policy"
          }}
        ]
        ```

        Here is the full ACM CCS structure:
        <ACM_CCS_START>
        {acm_json}
        <ACM_CCS_END>

        Sub-subdomain mapping to subdomains:
        <SUB_SUBDOMAIN_MAPPING>
        {flat_map_json}
        <SUB_SUBDOMAIN_MAPPING_END>

        Start of Paper Content:
        {content}
        End of Paper Content:

        Your response:"""

    elif task == "dataset_name":
        datasets = [
            {
                "unique_id": "null",  # Set to null if not found in existing datasets
                "dataset_name": "Name of the first dataset you find",
                "contributors": "Comma separated string of contributors names for the first dataset you find",
                "doi": "DOI for the first dataset you find. If not available, this should be an empty string.",
                "url": "URL link to the first dataset you find. If not available, this should be an empty string.",
            },
            {
                "unique_id": "null",  # Set to null if not found in existing datasets
                "dataset_name": "Name of the second dataset you find",
                "contributors": "Comma separated string of contributors names for the second dataset you find",
                "doi": "DOI for the second dataset you find. If not available, this should be an empty string.",
                "url": "URL link to the second dataset you find. If not available, this should be an empty string.",
            }
        ]
        datasets_json = json.dumps({"datasets": datasets}, indent=4)
        
        return f"""
        You are tasked with identifying and extracting datasets from the cybersecurity paper titled "{title}".
        
        Guidelines and Rules:
        **STRICTLY FOLLOW ALL GUIDELINES**
        \t1.**Definition of a Dataset**:
        \t- A dataset is a named collection of data used for experiments, evaluation, training, testing, or comparison.
        \t- Examples: CICIDS2017, UNSW-NB15, CAIDA, Alexa top 1 million, HDFS, etc. In one paper they can use as much dataset they want, 
        for instance; if they have used 10 dataset so return all 10 dataset in output.
        \t- Custom-created datasets by the authors also count if explicitly mentioned as such.
        \t- Datasets can be mentioned explicitly by name (e.g., "We use UNSW-NB15") or implicitly (e.g., "we use the dataset from [25]" 
        if reference [25] clearly points to a dataset).

        \t2. **Be Comprehensive & Systematic:**
        \t- Carefully read the entire paper content (including references and methodology sections).
        \t- Identify every dataset mentioned, not just the first few. If you find 10 datasets, list all 10.
        \t- If the same dataset is mentioned multiple times under slightly different names (e.g., "HDFS dataset", "HDFS logs"), consider them 
        as referring to the same dataset.
        
        \t3. **Real Example from a Paper**:
        Consider the ACM CCS paper "Recompose Event Sequences vs. Predict Next Events: A Novel Anomaly Detection Approach for Discrete Event Logs"
        as an example:
        \t- Introduction: "DabLog achieves 97.18% and 80.25% F1 scores in evaluation upon HDFS system logs and UNSW-NB15 traffic logs..."
        \t- Motivation section: "Both methods were evaluated upon the same HDFS dataset [38, 39]..."
        \t- Evaluation section: "We evaluate DabLog with two datasets: UNSW-NB15 traffic logs [29] and HDFS console logs [39]..."
        
        From these mentions, they clearly identify two datasets:
        \t- "HDFS"
        \t- "UNSW-NB15"
        
        In such a scenario, both "HDFS" and "UNSW-NB15" must be returned.
        
        \t4. **Consider Reference-Based Mentions**:
        \t- If the paper references a dataset indirectly, for example, "the same HDFS dataset [38, 39]," then check references. If these references are known sources 
        for the HDFS dataset, include it.
        \t- For Example: in the paper "DoubleX: Statically Detecting Vulnerable Data Flows" author(s) have clearly mentioned 
        "To evaluate DoubleX false negatives, we consider the dataset of vulnerable extensions released by Somé with EmPoWeb. His paper [72] provides a list of extension 
        IDs and corresponding vulnerabilities. Of the 171 Chrome extensions he reported as vulnerable in 2019, 82 still existed on March 16, 2021." So which mean they 
        used this **Chrome extensions dataset** for DoubleX evaluation.
        \t5. **No Guessing or Inferring**:
        
        \t- Do not guess or infer a dataset if it's not explicitly mentioned.
        \t- Attacks, vulnerabilities, software tools, protocols, or platforms are not datasets.
        \t- If after thoroughly reviewing the paper and references you find no dataset mentioned, return 'null':
        
         {{
           "datasets": null
         }}

        \t6.**Do not confuse datasets with other elements**:
        \t- Vulnerability Codes: These are vulnerability codes, so be vigilant. Examples include "CVE-2019-14815", "CVE-2016-4997", and "CVE-2017-9074". Be vigilant with this information, For Example: In the paper "Automated Bug Hunting With Data-Driven Symbolic RootCause Analysis" authors haven't used any dataset, instead CVE (Common Vulnerabilities and Exposures) is used as part of the analysis, particularly focusing on specific vulnerabilities and their contexts. However, CVE is not treated as a "dataset" in the conventional sense, as it serves more as a standardized catalog for identifying known security vulnerabilities. So return this paper's output as 'null' .
        \t\t- Again CVE are not 'datasets', A CVE (Common Vulnerabilities and Exposures) is a publicly disclosed cybersecurity vulnerability or exposure in a software or hardware system. Each CVE is assigned a unique identifier (CVE ID) and is documented in a centralized database to help organizations track, assess, and address security flaws.
        \t- Attacks: These are attack techniques, not datasets. Examples include "SQL injection", "DDoS", and "Phishing".
        \t- Bugs: These represent software flaws or defects. Examples include "software flaws" and "defects".
        \t- Kernel Modules: These are components of the OS, not datasets. Examples include "ipv6.ko" and "nf_tables.ko".
        \t- Network Protocols: These are communication protocols. Examples include "TCP", "UDP", "IPv4", and "IPv6".
        \t- Software Libraries or Packages: These are tools or resources, not datasets. Examples include "libc.so" and "openssl".
        \t- Standalone Applications and Benchmark Suites: SPEC CPU2006, NGINX, and PostgreSQL are not software libraries or packages.
        \t\t- SPEC CPU2006 is a benchmark suite used to evaluate CPU and memory performance across standardized tasks, primarily for research and testing purposes. Fo example in the paper "VIP: Safeguard Value Invariant Property for Thwarting Critical Memory Corruption Attacks", no dataset is used , which mean you will return "null" output, don't consider **SPEC CPU2006** as a dataset.
        \t\t- NGINX is a web server application commonly used to handle HTTP requests, serve static content, and balance load across servers.
        \t\t- PostgreSQL is a standalone database management system (DBMS) that manages data storage, retrieval, and complex querying.
        \t\t- ObliviSync is a secure file synchronization and backup system based on write-only ORAM techniques. It evaluates performance using realistic file size distributions without relying on traditional datasets.
        \t\t\t- Example:The paper "ObliviSync: Practical Oblivious File Backup and Synchronization" evaluates a system for secure file synchronization and backup but does not rely on traditional datasets. So technically, they haven't used any dataset, so it should return 'null'.
        \t- Permissioned Distributed Ledger Platform: Corda is a distributed ledger platform developed by R3 for businesses, focusing on privacy, efficiency, and regulatory compliance. Unlike public blockchains, Corda uses a permissioned network, ensuring that only authorized parties can participate and view transactions. Corda achieves privacy through point-to-point communication and a unique notary system that prevents double-spending without broadcasting transactions. Its modular design supports smart contracts and can be tailored to different industries, making it suitable for applications in finance, healthcare, supply chain, and more. 
        \t- Artifact: Sometimes authors release and share their own artifact; don't confuse it with a dataset.
        \t- Raspberry Pi: Raspberry Pi is a small, affordable computer, often used for educational purposes, DIY projects, and experiments in computing, robotics, and IoT, not a dataset. For example, in the paper "Indistinguishability Prevents Scheduler Side Channels in Real-Time Systems" no dataset is used, which means you will return "null" output; don't consider **Raspberry Pi** as a dataset.
        
        \t7. For each dataset, identify:
        \t- **Name** of the dataset.
        \t- **Contributors** (authors or creators).
        \t- **DOI** The DOI of the dataset (if available)
        \t- **URL** The URL link of the dataset (if available)
        \t\t- Look for DOIs and URLs in the reference section, especially for **custom-created but public** datasets. In these cases, the contributors are usually the authors of the paper, and they often explicitly mention sharing links to platforms like GitHub or other repositories. Be sure to check for these, but do not include any random GitHub or other links—only include links where the authors explicitly state that they have shared their datasets. Remain vigilant in confirming this information. 
        \t\t- If no dataset is found, return:
         
         {{
           "datasets": null
         }}

        
        - **Null Cases** Examples (**Check these examples thoroughly before returning the output for the same paper(s) mentioned below**):

        Example 1: The paper "Indistinguishability Prevents Scheduler Side Channels in Real-Time Systems" is not a 
        dataset-related paper, which means the authors haven't used any dataset, so return the output as 'null'. And other tasks like 
        **dataset_analysis_combined** , **dataset_categories** and **dataset_usage** will be 'null' too.
        Json Output:
        
        {{
           "datasets": null
         }}
    
        Example 2: In the paper "ZKCPlus: Optimized Fair-exchange Protocol Supporting Practical and Flexible Data Exchange", the authors haven't used any dataset, so return the output as 'null'. And other tasks like **dataset_analysis_combined** , **dataset_categories** and **dataset_usage** will be 'null' too.
        
        Json Output:
          
        {{
           "datasets": null
         }}

         Example 3: In the paper "DPGen: Automated Program Synthesis for Differential Privacy", the authors haven't used any dataset, so return the output as 'null'. And other tasks like **dataset_analysis_combined** , **dataset_categories** and **dataset_usage** will be 'null' too.
         
         Json Output:
           
         {{
           "datasets": null
         }}
         
         Example 4: In the paper "A Security Framework for Distributed Ledgers", the authors haven't used any dataset, so return the output as 'null'. And other tasks like **dataset_analysis_combined** , **dataset_categories** and **dataset_usage** will be 'null' too.
         
         Json Output:
           
         {{
           "datasets": null
         }}

         
         - Other Examples:
         
         Example 1: In the paper "WristPrint: Characterizing User Re-identification Risks from Wrist-worn Accelerometry Data", the author used two **public** datasets like "mORAL" and "WISDM".
 {{
        "datasets": [
            {{
                "unique_id": "null",
                "dataset_name": "mORAL",
                "contributors": "Sayma Akther, Nazir Saleheen, Shahin Alan Samiei, Vivek Shetty, Emre Ertin, Santosh Kumar",
                "doi": "",
                "url": ""
            }},
            {{
                "unique_id": "null",
                "dataset_name": "WISDM",
                "contributors": "Gary M Weiss",
                "doi": "",
                "url": "https://www.uci.edu/ml/datasets/wisdm+smartphone+and+smartwatch+activity+and+biometrics+dataset"
            }}
        ]
    }}

           Example 2: In the paper "A Lightweight IoT Cryptojacking Detection Mechanism in Heterogeneous Smart Home Networks", a **public** dataset like "Network traffic for machine learning classification" or "Benign dataset" and a **Custom-created datasets but public** dataset are used.
    {{
        "datasets": [
            {{
                "unique_id": "null",
                "dataset_name": "Iot cryptojacking",
                "contributors": "Ege Tekiner, Abbas Acar, A. Selcuk Uluagac",
                "doi": "",
                "url": "https://github.com/cslfiu/IoTCryptojacking"
            }},
            {{
                "unique_id": "null",
                "dataset_name": "Benign Dataset",
                "contributors": "Víctor Labayen Guembe, Eduardo Magaña, Daniel Morató, Mikel Izal",
                "doi": "10.17632/5pmnkshffm.1",
                "url": "https://data.mendeley.com/datasets/5pmnkshffm/1"
            }}
        ]
    }}
        
        Here is the list of existing datasets:
        <Existing dataset(s) start>
        {existing_datasets}
        <Existing dataset(s) stop>

        Here is the paper:
        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your output must be returned in valid JSON format:

        {datasets_json}
    
        Your response: """

    
    elif task == "dataset_analysis_combined":
        
        return f"""

        You are tasked with identifying the **availability**, **labeling_type**, and **dataset_type** for each dataset extracted in the **dataset_name** task for the cybersecurity paper titled "{title}".

        ### **Guidelines and Rules:**

        \t1. For each dataset, you will identify the following:
        \t- **availability**: Whether the dataset is 'public', 'proprietary', 'restricted', 'Custom-created datasets, not shared', 'Custom-created datasets but public', or 'Custom-created datasets, but restricted'.
        \t\t- **Public** are freely available for download (e.g., datasets hosted on websites like Kaggle, GitHub, or institutional repositories). 
        These datasets existed before the research and were not curated specifically by the authors.
        - Example: in the paper "Black-box Adversarial Attacks on Commercial Speech Platforms 
        with Minimal Information", the datasets used are publicly available. The output should look like this:
        Json Output:

         {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Common Voice",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Song",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "LibriSpeech",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Voxceleb dataset",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }}
        ]
    }} 

         - Example: in the paper "AHEAD: Adaptive Hierarchical Decomposition for Range Query under Local Differential Privacy", the datasets used are publicly available. The output should look like this:

         Json Output:

         {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Salaries",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "blackfriday",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Loan",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Financial",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
            }}
        ]
    }} 
         
        
        \t\t- **Proprietary** are those owned outright by an organization and are typically not accessible to the public, there is no route given to access them. 
        For instance, many internal company records, commercial databases, or market research datasets are considered proprietary because the owning entity does not make them accessible to others.
        \t\t- **Restricted** are accessible only under specific conditions (e.g., requiring permission, collaboration, DUA, or  licensing). 
        - Example 1: The HCUP dataset available at [https://hcup-us.ahrq.gov/tech_assist/centdist.jsp] is classified as a restricted dataset. 
        Although the data is derived from real-world healthcare information, access is granted only under specific conditions, such as requiring permission
        through a data use agreement, ensuring that the sensitive nature of the data is properly managed.
        \t\t- **Custom-created datasets, not shared** are generated specifically for the research project and are not shared publicly. For example the custom created dataset in the paper **(Un)informed Consent: Studying GDPR Consent Notices in the Field** is not shared so return it as **Custom-created datasets, not shared**.
        \t\t- **Custom-created datasets but public** are custom datasets created by the authors but shared publicly.
        \t\t- **Custom-created datasets, but restricted** are Custom-Restricted datasets created by authors but shared with access restrictions.
         -Note: Sometimes, authors who create custom datasets may explicitly mention in their paper or dataset documentation 
        (URL/DOI or citation details) that, due to the large size of the dataset, they are unable to share it online but can provide access upon request
        (don't make random guess). For example, in the paper "Towards Precise Reporting of Cryptographic Misuses", the authors mentioned in their GitHub link: 
        'Our original datasets consist of a data set of **3,489 open-source Android apps obtained from F-Droid**, and a data set of **1,437 firmwares** 
        collected from 6 vendors. Due to the large size of the two datasets (APK dataset: 49 GB, firmware dataset: 21 GB), it is difficult to share them online. 
        If you are interested in obtaining the original **F-Droid**dataset and **firmware** dataset, please contact us.'. 
        
        -Json output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "3489 open-source Android apps",
                "availability": "Custom-created datasets, but restricted",
                "labeling_type": "labeled",
                "dataset_type": "realistic"
            }},
            {{
                "dataset_name": "1,437 firmware dataset",
                "availability": "Custom-created datasets, but restricted",
                "labeling_type": "not mentioned",
                "dataset_type": "realistic"
            }}
        ]
    }}
        
        - Example 1: if authors download data (e.g., APK files or malware samples) from platforms like VirusTotal, then apply filtering, labeling, or feature extraction to create a tailored dataset, the resulting dataset is custom-created. While the original source (e.g. VirusTotal) can be cited, the curated dataset is distinct from the original collection and should be classified as 'Custom-created datasets, not shared' or 'Custom-created datasets but public', depending on whether the authors shared it publicly. Like in the paper **EIGER: Automated IOC Generation for Accurate and Interpretable Endpoint Malware Detection** they have collected 162K Malware Samples from VirusTotal and Benign public sources of free Windows software but didnt shared their dataset so return it as **Custom-created datasets, not shared**. Identify this correctly, the output should look like this:
        -Json Output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "162K Malware Samples from VirusTotal",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "realistic"
            }},
            {{
                "dataset_name": "Benign public sources of free Windows software",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "realistic"
            }},
            {{
                "dataset_name": "Hybrid Analysis Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "realistic"
            }}
        ]
    }}
        
        - Example 2: in the paper "C3PO: Large-Scale Study of Covert Monitoring of C&C Servers via Over-Permissioned Protocol Infiltration" where they collected **200,000 malware samples** over 15 years, identify this dataset as **Custom-created datasets, not shared**, since the author(s) did not mention sharing this dataset with the community. Identify it correctly, the output should look like this:
        -Json Output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "200k Malware Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "realistic"
            }}
        ]
    }}
        
        - Example 3: in the paper "Deterrence of Intelligent DDoS via Multi-Hop Traffic Divergence", the authors collected a **49.8 TB real dataset from a department at Tsinghua campus network**; identify this and return it as **Custom-created datasets, not shared**. Identify it correctly. The output should look like this:
        
        -Json Output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Tsinghua Network Traffic Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "Not Mentioned",
                "dataset_type": "realistic"
            }}
        ]
    }}
        - Example 4: In the paper "High Fidelity Data Reduction for Big Data Security Dependency Analyses", the dataset was collected from a real enterprise environment for one month, which makes it a custom-created dataset. However, the authors didn't mention sharing it, so return it as **Custom-created datasets, not shared**, 
        and the **labeling_type** wasn't mentioned either, so return it as **Not Mentioned**. Identify it correctly, The output should look like this:
        
        -Json Output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Enterprise Security Dependency Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "Not Mentioned",
                "dataset_type": "real-world"
            }}
        ]
    }}
        
        - Example 5: in the paper "This Sneaky Piggy Went to the Android Ad Market: Misusing Mobile Sensors for Stealthy Data Exfiltration" the datasets used are collected from **4.5K of the most popular apps**, **Two typing datasets** and **one typing dataset**; all are **Custom-created datasets, not shared**. The output should look like this:
         -Json Output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "4.5K Popular Apps Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Two Typing Datasets",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
            }},
            {{
                "dataset_name": "One Typing Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
            }}
        ]
    }}
        -  Example 6: in the paper "BAPM: Block Attention Profiling Model for Multi-tab Website Fingerprinting Attacks on Tor" has created and used following datasets, The output should look like this:
        - Json Output:
        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Close World Multi-Tab Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
            }},
            {{
                "dataset_name": "Open World Multi-Tab Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
            }},
            {{
                "dataset_name": "Three-Tab Dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
            }},
            {{
                "dataset_name": "real world dataset",
                "availability": "Custom-created datasets but public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }}
        ]
    }}
    
         - Example 7: in the paper "PDiff: Semantic-based Patch Presence Testing for Downstream Kernels" the datasets used are both custom-created, but one is **Custom-created datasets, not shared** and another is **Custom-created datasets but public**. The output should look like this:
         
         -Json Output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "CVE dataset",
                "availability": "Custom-created datasets, not shared",
                "labeling_type": "Not mentioned",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Kernel Image dataset",
                "availability": "Custom-created datasets but public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }}
        ]
    }}
      
        \t- **labeling_type**: Determine the labeling status of the dataset.
        \t\t- **labeled**:  A dataset is considered labeled if the paper or the dataset’s official documentation (accessed via DOI, URL, or citation details) explicitly 
        states that data points have labels or categories.  
        \t\t\t- Example conditions:
        - The paper says “We manually labeled the dataset.”
        - The dataset’s website or documentation includes label files or describes classes/categories for each data point.
        - If labeling is confirmed from external sources (DOI/URL/citation), specify how it was identified, e.g., "labeled (via citation details)" 
        or "labeled (found via URL)".
        - For image datasets, if classes, annotations, or bounding boxes are mentioned, consider them as labeled.
        \t\t- **unlabeled**: A dataset is considered unlabeled only if the paper or dataset documentation **explicitly states** that it has no labels or is unlabeled.  
        \t\t\t- For example, if the paper says, “The dataset is completely unlabeled,” or “We have no ground-truth labels,” then mark it as **unlabeled**.
        \t\t\t- If discovered via an external source (DOI/URL/citation) that explicitly says it’s unlabeled, note that as "unlabeled (found via URL)" or similar.
        \t\t-**hybrid**: A dataset is considered hybrid if it explicitly contains both labeled and unlabeled data.  
        \t\t\t- For instance, if the paper says, “The dataset includes 10,000 labeled samples and 100,000 unlabeled samples,” return **hybrid**.
        \t\t\t- If the dataset’s documentation (DOI/URL/citation) mentions both labeled and unlabeled subsets, also mark it as **hybrid**.
        \t\t- **re-labeled**: If the paper explicitly states that they took an existing dataset and re-annotated or re-labeled it for their study, return **re-labeled**.
        \t\t\t- For example, if it says, “We took the UNSW-NB15 dataset and re-labeled the events according to our criteria,” return **re-labeled**.
        \t\t- **not mentioned**: If after thoroughly checking the paper, its references, and any accessible DOI/URL information, you cannot find any mention of labeling 
        status (no explicit mention of labeling, unlabeled status, hybrid, or re-labeling), return **not mentioned**.  
        \t\t\t- Use **not mentioned** if:
        - The paper never states anything about labeling.
        - The dataset’s official sources (DOI/URL) do not mention labeling.
        - No external citation details clarify the labeling status.

        -**Consistent Example Using HDFS**:
        Suppose the paper mentions the HDFS dataset and references [38,39] for its original introduction:
        \t\t\t- The paper itself doesn’t state whether HDFS is labeled or unlabeled.  
        \t\t\t- The instructions say you can use citation details (i.e., papers [38,39]) to learn about the dataset’s labeling. 
        \t\t\t- After checking the referenced papers (assuming you have "web access" through the citation details-i.e., you can infer what the source papers are known for):
        - If the HDFS dataset source paper (Xu et al., SOSP ’09) mentions that the dataset consists of system logs classified by event types or that it is 
        commonly known that HDFS data is often annotated with specific event types, you can conclude it is **labeled** (e.g., "labeled (via citation details)"
        if found in the referenced paper).
        - If the source says explicitly it’s unlabeled logs (just raw logs without event types) and you confirm it from citation details, return 
        **unlabeled (via citation details)**.
        - If you find both labeled and unlabeled samples mentioned in the original dataset source, return **hybrid**.
        - If the paper or the reference does not clarify labeling at all and the dataset’s official documentation (if available) is not accessible, return **not mentioned**.
        - If the paper says “We re-labeled the HDFS dataset to fit our classification scheme,” return **re-labeled**.
        \t- **dataset_type**: Determine the type of dataset.
        \t\t- **Real-world**: The dataset is directly collected from a real-world system or environment without significant preprocessing. Examples include raw network traffic logs or unaltered user interaction data or a complete packet capture (PCAP) file from a corporate network during a normal workday.
        \t\t- **realistic**: Data simulating real-world scenarios, but collected in controlled or lab environments to mimic actual conditions. This may involve preprocessing or specific configurations. Example: A network traffic dataset collected from real systems but heavily anonymized or filtered for privacy or anonymized DNS logs or cleaned financial transaction data.
        \t\t- **synthetic**: The dataset is completely generated using simulations, models, or algorithms without 
        any direct data from real-world systems. Examples include simulated attack traffic or algorithmically generated 
        synthetic images, such as the **SYMTCP**.
        \t\t\t- Example: Datasets generated through symbolic execution (e.g., in the SYMTCP project) 
        are considered **synthetic** used in the paper "SYMTCP: Eluding Stateful Deep Packet Inspection 
        with Automated Discrepancy Discovery", The output should look like this:
        - Json Output:
        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "SYMTCP dataset",
                "availability": "Custom-created datasets but public",
                "labeling_type":"labeled",
                "dataset_type": "synthetic"
            }}
        ]
    }}
        \t\t\t- In the paper "Preparing Network Intrusion Detection Deep Learning Models with Minimal Data Using Adversarial Domain Adaptation", they have used two benchmark datasets; one is **hybrid** and another is **synthetic**.
        - Another json Example:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "UNSW-NB15",
                "availability": "public",
                "labeling_type":"labeled",
                "dataset_type": "hybrid"
            }},
            {{
                "dataset_name": "NSL-KDD",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
            }}
        ]
    }}
            
        
        
        \t- **hybrid**:  The dataset combines both real-world and synthetic elements. For example, "UNSW-NB15" dataset which is considered a "hybrid dataset", encompassing both real-world and synthetic elements.
        \t\t- For example: in the paper "Filtering DDoS Attacks from Unlabeled Network Traffic Data Using Online Deep Learning", they have used two datasets "CICIDS2017" and "CAIDA2007". Be vigilant with "CICIDS2017" dataset whenever you found it in any paper make sure to return it's **dataset_type** as **realistic**. The output should look like this:
        -Json Output:

        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "CICIDS2017",
                "availability": "public",
                "labeling_type":"labeled",
                "dataset_type": "realistic"
            }},
            {{
                "dataset_name": "CAIDA UCSD DDoS Attack 2007",
                "availability": "public",
                "labeling_type": "unlabeled",
                "dataset_type": "realistic"
            }}
        ]
    }}
        
        \t2. Ensure that the dataset names match the ones extracted from the **dataset_name** task.
        \t3. If no dataset is found in the **dataset_name** task, leave **dataset_analysis_combined** task 'null'. Be vigilant.
        
        **Null cases Examples**
        Example 1 (using the same example used in **dataset_name** task): In the paper "ZKCPlus: Optimized Fair-exchange Protocol Supporting Practical and Flexible Data Exchange", the authors haven't used any dataset, so return the output as 'null'. And other tasks like **dataset_analysis_combined** , **dataset_categories** and **dataset_usage** will be 'null' too.
        
        - Json Output:

        {{

        "dataset_analysis_combined": null
    }}

        Example 2 (using the same example used in **dataset_name** task): The paper "Indistinguishability Prevents Scheduler Side Channels in Real-Time Systems" is not a dataset-related paper, which means the authors haven't used any dataset, so return the output as 'null'. And other tasks like **dataset_analysis_combined** , **dataset_categories** and **dataset_usage** will be 'null' too.

        - Json Output:

        {{

        "dataset_analysis_combined": null
    }}
    
        Here are the datasets extracted earlier:
        {dataset_names}

        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your output must be returned only in valid JSON format using the following structure:
         
         {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Exact dataset name from the **dataset_name** task",
                "availability": "Extracted availability status (**public**, **proprietary**, **restricted**, **Custom-created datasets, not shared**, **Custom-created datasets but public**, or **Custom-created datasets, but restricted**)",
                "labeling_type": "Extracted labeling type (**labeled**, **unlabeled**, **hybrid**, **re-labeled**, or **not mentioned**)",
                "dataset_type": "Extracted dataset type (**real-world**,**realistic**, **synthetic**, **hybrid**)"
            }}
        ]
    }}

    Output Examples:

    **CAREFULLY CHECK THESE PAPERS BEFORE RETURNING THE OUTPUT**
    

        - Example 1: For the paper "Differentially Private Publishing of High-dimensional Data".
        -Json Output:

         {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Netflix",
                "availability": "public",
                "labeling_type":"labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Transaction",
                "availability": "public",
                "labeling_type":"labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Movielens",
                "availability": "public",
                "labeling_type":"labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Document",
                "availability": "public",
                "labeling_type":"unlabeled (found via URL)",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "AOL",
                "availability": "public",
                "labeling_type":"unlabeled (found via URL)",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Kosarak",
                "availability": "public",
                "labeling_type": "unlabeled (found via URL)",
                "dataset_type": "real-world"
            }}
          ]
        }}
        - Example 2: For the paper "Recompose Event Sequences vs. Predict Next Events: 
        A Novel Anomaly Detection Approach for Discrete Event Logs", be aware with **HDFS dataset**.
        -Json Output:

             {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "UNSW-NB15",
                "availability": "public",
                "labeling_type":"labeled",
                "dataset_type": "hybrid"
            }},
            {{
                "dataset_name": "HDFS dataset",
                "availability": "public",
                "labeling_type": "labeled (via citation details)",
                "dataset_type": "realistic"
            }},
           ]
         }}
            - Example 3: "Model Extraction Attacks on Graph Neural Networks: Taxonomy and Realisation".
            - Json Output:

             {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Cora",
                "availability": "public",
                "labeling_type":"labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Pubmed",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Citeseer",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "real-world"
            }}
           ]
        }}
        - Example 4: For the paper "Continuous Release of Data Streams under both Centralized and Local Differential Privacy", the output should look like:
        -Json Output:
             {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "DNS",
                "availability": "public",
                "labeling_type":"unlabeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Fare",
                "availability": "restricted",
                "labeling_type":"unlabeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "Kosarak",
                "availability": "public",
                "labeling_type":"unlabeled",
                "dataset_type": "real-world"
            }},
            {{
                "dataset_name": "POS",
                "availability": "public",
                "labeling_type": "unlabeled",
                "dataset_type": "real-world"
            }},
           ]
        }}

        - Example 5: As an AI assitant, you have web access so if a dataset is cited is from another work, note its title, contributors, and source publication details for other tasks like **labeling_type**, **availability** or **dataset_type** to extract details by web searching from these citations. For Example:
        - "Vassil Panayotov, Guoguo Chen, Daniel Povey, and Sanjeev Khudanpur. 2015. Librispeech: an ASR corpus based on public domain audiobooks. In Proc. of ICASSP."
        - Example: in the paper "MineSweeper: An In-depth Look into Drive-by Cryptocurrency Mining and Its Defense" the author custom-created only one dataset and the stated "We ran 50 Docker containers in parallel for one week mid-March 2018 to collect data from Alexa’s Top 1 Million websites (as of February 28, 2018)". 
        Also the shared this dataset with the community **https://github.com/vusec/minesweeper**. The output should look like this:
        
        Json Output:
         {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "collected dataset(minesweeper)",
                "availability": "Custom-created datasets but public",
                "labeling_type": "labeled(via URL)",
                "dataset_type": "real-world"
            }},
        ]
    }}
        - Example (c) in the paper "Secure Multi-party Computation of Differentially Private Heavy Hitters", two datasets are used, the output should look like this:
        Json Output:
        
        {{
        "dataset_analysis_combined": [
            {{
                "dataset_name": "Zipf distribution",
                "availability": "public",
                "labeling_type": "labeled",
                "dataset_type": "synthetic"
        
            }},
                "dataset_name": "Online retail dataset",
                "availability": "public",
                "labeling_type": "unlabeled",
                "dataset_type": "real-world"
            }}
           ]
        }} 

         Your response: """

    
    elif task == "dataset_categories":
        
        return f"""

        You are tasked with identifying the specific categories and subcategories of datasets extracted from the **dataset_name** task used in the cybersecurity paper titled "{title}".

        Clarification:
        The **dataset_categories** refers specifically to **what the dataset consists of** or contains, not how it is used in the research. 
        Focus on the dataset's inherent characteristics and contents.
        Note: These categories are derived from the taxonomy outlined in the USENIX paper "Cybersecurity Research Datasets: Taxonomy and Empirical Analysis" by 
        Zheng et al., which provides a structured framework for categorizing cybersecurity datasets. Additionally, a new category for multimedia 
        data has been added based on evolving research needs.

        Guidelines and Rules:

        \t1. By **dataset_categories**, we mean identifying whether a dataset belongs to the following major categories and their subcategories:

        **Major Categories and Subcategories**:

        \t\t- **Attacker-Related**:
        \t\t  1. **Attacks**: Datasets containing information about malicious actions performed to harm systems (e.g., CICIDS2017, Kitsune etc).
        \t\t  2. **Vulnerabilities**: Datasets capturing weaknesses in systems or software that attackers can exploit (e.g., CVE databases or Open Source 
        Vulnerability Database as a dataset).
        \t\t  3. **Exploits**: Technical methods or tools used to execute attacks, such as exploit scripts or frameworks.
        \t\t  4. **Cybercrime Infrastructures**: Datasets capturing illegal operations and tools, such as botnets, marketplaces, or malware delivery.
        \t\t  5. **Malware**: is a curated collection of data samples that contain malicious software (malware) or artifacts derived from it.
        Raw binaries or executables (e.g., .exe, .apk, .elf files), or Network traffic generated by malware (PCAP files, DNS queries, C2 communications) or etc.

        \t\t- **Defender Artifacts**:
        \t\t  1. **Alerts**: Logs or outputs from defensive systems like intrusion detection systems or firewalls.
        \t\t  2. **Configurations**: Information on setup and configurations of defense systems (e.g., SSL certificate settings).

        \t\t- **User & Organizational Characteristics**:
        \t\t  1. **User Activities**: Data on the behavior of users or organizations (e.g., social media activity, browsing logs).
        \t\t  2. **User Attitudes**: Survey data capturing opinions or sentiments on cybersecurity topics.
        \t\t  3. **User Attributes**: Characteristics of users or organizations (e.g., demographic profiles or organizational metadata).

        \t\t- **Macro-Level Internet Characteristics**:
        \t\t  1. **Applications**: Data on Internet services or products (e.g., website rankings, mobile apps).
        \t\t  2. **Network Traces**: Packet-level traffic data or network activity logs.
        \t\t  3. **Topology**: Information on the structure of the Internet, such as AS relationships or routing paths.
        \t\t  4. **Benchmarks**: contain information about measurements of Internet performance, such as upload/download speed or end-to-end 
        network reliability. For example in the paper "Tackling bufferbloat in 3G/4G networks"  Jiang and Wang constructed a dataset that measured 
        3G/4G network performance in the US and Korea.
        \t\t  5. **Adverse Events**: Data on disruptions or outages, like failures caused by misconfigurations.

        \t\t- **Visual and Multimedia Data** (New Category):
        \t\t  1. **Image Datasets**: Datasets containing static visual data for tasks like classification, recognition, or detection (e.g., CIFAR-10, MNIST).
        \t\t  2. **Video Datasets**: Datasets containing dynamic visual data for tasks like motion tracking or behavior analysis (e.g., UCF101, Kinetics).
        \t\t  3. **Audio Datasets**: Datasets containing audio data (e.g., SpeechCommand, LibriSpeech)..
        \t\t  4. **Multimodal Datasets**:  Datasets combining different types of data (e.g., images and text or audio and visual) for tasks like cross-modal 
        retrieval (e.g., Voxceleb).
        \t\t  5. **Synthetic Media Datasets**: Artificially generated, any form of media datasets.
        \t\t\t - For Example: **Typing Motion Dataset (Two-Handed & One-Handed Typing)** used in the paper 'This Sneaky Piggy Went to the Android Ad Market:
        Misusing Mobile Sensors for Stealthy Data Exfiltration'  which is artificially generated and falls under this sub-category.

        \t\t- **Others-catchall** (New Category):
        \t\t 1.**others**: If no **dataset** fall under above given category return **others**, that's are new catch-all
        category.
    

        \t2. **Examples for Clarity**:
        \t\t- A dataset like **Netflix Ratings** used in privacy studies should be categorized under **User & Organizational Characteristics** -> **User Activities**.
        \t\t- A dataset like **CAIDA AS Relationships**, which captures Internet topology data, should be categorized under **Macro-Level Internet Characteristics** -> **Topology**.
        \t\t- A dataset like **CIFAR-10**, used for image classification, should be categorized under **Visual and Multimedia Data** -> **Image Datasets**.

        \t3. **Do Not Confuse with Domain**:
        \t\t- **Domain** refers to the high-level research area (e.g., IoT, malware analysis).
        \t\t- **Dataset Categories** focus exclusively on the dataset's inherent characteristics (e.g., attacks, vulnerabilities, defender artifacts).

        \t5. **Null Cases**:
        \t\t. If no dataset found in **dataset_name** task, leave **dataset_categories** as null.

        Here are the datasets extracted earlier:
        
        {dataset_names}

        ### Output Structure:

        The output must strictly follow this JSON structure:
              
      {{
      
      "dataset_categories": [
            {{
                "dataset_name": "Kitsune",
                "category": "attacker_related",
                "subcategory": "attacks",
                "attacker_related_items":[
                    {{"name": "Fuzzing"}},
                    {{"name": "ARP MitM"}},
                    {{"name": "SSDP Flood"}},
                    {{"name": "SYN DoS"}},
                    {{"name": "Mirai Botnet"}}
                    ]
                }},
                
                {{
                "dataset_name": "CIFAR-10",
                "category": "visual_and_multimedia_data",
                "subcategory": "image_datasets",
                "visual_data_items": [
                    {{"name": "Image Classification"}}
                    ]
                }}
              ]
           }}

    ### Output Example:
    - For the **Kitsune** dataset, which contains nine attacks such as:
        1. OS Scan
        2. Fuzzing
        3. Video Injection
        4. ARP MitM
        5. Active Wiretap
        6. SSDP Flood
        7. SYN DoS
        8. SSL Renegotiation
        9. Mirai Botnet

      If the paper only utilizes attacks 2, 4, 6, 7, and 9, the output should list only those attacks.
      
      <Start of Paper Content>
      {content}
      <End of Paper Content>
      
      Your response: """

        
    
    
    else:
        raise ValueError("Invalid task")

Runner (Processing) Script

In [None]:
import os
import json
import openai

# Module-level OpenAI client used by process_papers_for_tasks below. The API key
# is read from the OPENAI_API_KEY environment variable (None when it is unset).
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Incremental saving
def save_incremental_results(results, output_file="results_incremental.jsonl"):
    """Append results to a JSONL file, one single-key object per line.

    Args:
        results: Mapping of paper title -> result payload for that paper.
        output_file: Path of the JSONL file. It is opened in append mode, so
            lines written by earlier calls are kept.
    """
    with open(output_file, "a", encoding="utf-8") as sink:
        # Each line has the shape {"<paper title>": <result>}
        sink.writelines(
            json.dumps({name: payload}) + "\n"
            for name, payload in results.items()
        )

# Load previously saved results
def load_saved_results(output_file="results_incremental.jsonl"):
    """Load results written by earlier runs so processing can resume.

    Every line of ``output_file`` is expected to be a JSON object mapping a
    paper title to its result. Blank lines, lines that are not valid JSON (for
    example a line cut short by an interrupted run) and lines that are not a
    JSON object are skipped with a message instead of aborting the resume.

    Args:
        output_file: Path of the incremental JSONL file.

    Returns:
        tuple: ``(saved_titles, saved_results)``. ``saved_titles`` is the set of
        titles found; ``saved_results`` maps title -> result, the last
        occurrence winning when a title appears on several lines. Both are
        empty when the file does not exist.
    """
    saved_titles = set()
    saved_results = {}
    try:
        with open(output_file, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    result = json.loads(line)
                except json.JSONDecodeError as exc:
                    print(f"Skipping malformed line {line_number} in {output_file}: {exc}")
                    continue
                if not isinstance(result, dict):
                    print(f"Skipping non-object line {line_number} in {output_file}.")
                    continue
                saved_titles.update(result)   # dict iteration yields the titles
                saved_results.update(result)
    except FileNotFoundError:
        print(f"No saved results found in {output_file}. Starting fresh.")
    return saved_titles, saved_results

# Function to process papers for multiple tasks
def process_papers_for_tasks(papers, tasks, start_index=0, output_file="results_incremental.jsonl",
                             processed_titles=None):
    """Run every task prompt against every paper and save results paper by paper.

    For each paper the tasks are executed in order. The response stored for the
    ``"dataset_name"`` task of the same paper (``None`` until that task has run)
    is passed to ``generate_system_prompt`` as its third argument. After all
    tasks of a paper have run, that paper's results are appended to
    ``output_file``.

    Args:
        papers: Sized collection of dicts, each with a ``'title'`` key; the whole
            dict is handed to ``generate_system_prompt``.
        tasks: Ordered task names.
        start_index: Offset applied to the counters in the progress message only.
        output_file: JSONL file the per-paper results are appended to.
        processed_titles: Optional collection of titles to skip. When omitted,
            the module-level ``processed_titles`` is used if it exists (the
            previous behaviour), otherwise nothing is skipped up front.

    Returns:
        dict: title -> {task: response text, or ``"error: <message>"`` when the
        API call raised} for the papers processed by this call.
    """
    if processed_titles is None:
        # Backward compatible: earlier versions read this name from module scope.
        processed_titles = globals().get("processed_titles") or ()
    # Private copy so titles finished during this call are skipped too; the
    # extracted title is not guaranteed to be unique across PDFs.
    already_done = set(processed_titles)

    task_results = {}
    total = len(papers) + start_index
    for i, paper in enumerate(papers, start=start_index):
        paper_title = paper['title']
        print(f"Processing paper {i + 1}/{total}: {paper_title}")

        # Skip already processed papers
        if paper_title in already_done:
            print(f"Skipping already processed paper: {paper_title}")
            continue

        task_results[paper_title] = {}

        # Process each task for the paper
        for task in tasks:
            user_prompt = generate_system_prompt(paper, task, task_results[paper_title].get("dataset_name", None))
            try:
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "user", "content": user_prompt}
                    ],
                    temperature=0.2,
                    max_tokens=5000
                )
                response_text = response.choices[0].message.content
                print(f"Result for {task}: {response_text}")
                task_results[paper_title][task] = response_text
            except Exception as e:
                # Best effort: record the failure for this task and continue with the next one
                print(f"Error processing {task} for paper {i + 1}: {e}")
                task_results[paper_title][task] = f"error: {str(e)}"

        # Save incremental results
        save_incremental_results({paper_title: task_results[paper_title]}, output_file)
        already_done.add(paper_title)

    return task_results


# Resume state: titles and results that earlier runs already wrote to disk
output_file = "results_incremental.jsonl"
processed_titles, processed_results = load_saved_results(output_file)

# Papers that still have to be sent to the model
remaining_papers = [
    paper
    for paper in test_papers
    if paper['title'] not in processed_titles
]

# Tasks to run, in this order; the list also defines the CSV column order below
tasks = [
    "title", "authors_name", "conference_name", "published_year",
    "school_institution", "domain",
    "dataset_name", "dataset_analysis_combined", "dataset_categories",
]

# Process what is left, then merge the new results into the saved ones
all_results = process_papers_for_tasks(
    remaining_papers,
    tasks,
    start_index=len(processed_titles),
    output_file=output_file,
)
processed_results.update(all_results)

# Destination files for the merged results
output_csv = "results_merged_full_final.csv"
output_jsonl = "results_merged_full_final.jsonl"

# CSV: one row per paper, one column per task
import csv
with open(output_csv, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(tasks)  # Header row
    for title, results in processed_results.items():
        values = (results.get(task, "No result") for task in tasks)
        # dict/list values are serialized so that every cell is a plain string
        row = [
            json.dumps(value) if isinstance(value, (dict, list)) else value
            for value in values
        ]
        writer.writerow(row)

# JSONL: one {title: results} object per line
with open(output_jsonl, "w", encoding="utf-8") as jsonl_file:
    for title, results in processed_results.items():
        jsonl_file.write(json.dumps({title: results}) + "\n")

print(f"Final results saved to {output_csv} and {output_jsonl}.")


DATA ANALYSIS AND VISUALIZATION

In [None]:
import csv
import json
import ast
import re
import chardet
import matplotlib.pyplot as plt

# 1) Detect CSV encoding
# chardet only inspects the first 100 kB. It reports "ascii" when non-ASCII text
# appears only later in the file, and None when it cannot decide (e.g. an empty
# file). ASCII is a strict subset of UTF-8 and the runner cell writes this CSV
# as UTF-8, so both cases are read as UTF-8; otherwise errors="replace" below
# would silently turn later non-ASCII characters into U+FFFD.
file_path = "results_merged_full_final.csv"
with open(file_path, "rb") as f:
    enc = chardet.detect(f.read(100000))["encoding"]
print(f"Detected Encoding: {enc}")
if enc is None or enc.lower() == "ascii":
    enc = "utf-8"

# 2) Prepare a small parser that strips ```json``` fences then tries json.loads or ast.literal_eval
# The optional leading \s* matters: without it, whitespace in front of an opening
# fence let a "\s*```" alternative consume the backticks and leave a stray "json".
_fence_re = re.compile(r"\s*```(?:json)?\s*")

def parse_record(raw: str) -> dict:
    """Parse one CSV cell holding a model response into a dict.

    Fence markers (``` or ```json) are removed, then the text is parsed as JSON
    and, failing that, as a Python literal. If the text cannot be parsed at all
    (typically because the model wrapped the JSON in prose), the span from the
    first ``{`` to the last ``}`` is tried the same way.

    Args:
        raw: Cell text; ``None`` or ``""`` yields ``{}``.

    Returns:
        The parsed dict, or ``{}`` when nothing parses or the parsed value is
        not a dict.
    """
    if not raw:
        return {}

    def _load(text):
        """Return (parsed_ok, value): JSON first, Python literal second."""
        try:
            return True, json.loads(text)
        except json.JSONDecodeError:
            pass
        try:
            return True, ast.literal_eval(text)
        except Exception:
            return False, None

    s = _fence_re.sub("", raw).strip()
    ok, obj = _load(s)
    if not ok:
        # Only reached when the full text is unparseable, so anything that
        # parsed before this fallback existed still gives the same result.
        start, end = s.find("{"), s.rfind("}")
        if 0 <= start < end:
            ok, obj = _load(s[start:end + 1])
    return obj if isinstance(obj, dict) else {}

# 3) Count how many papers have a non-empty `datasets` list
dataset_related_count = 0
non_dataset_related_count = 0

with open(file_path, "r", encoding=enc, errors="replace") as f:
    reader = csv.DictReader(f)
    for row in reader:
        raw_ds = row.get("dataset_name", "")
        rec = parse_record(raw_ds)
        ds_list = rec.get("datasets")
        if isinstance(ds_list, list) and len(ds_list) > 0:
            dataset_related_count += 1
        else:
            # Unparseable cells end up here as well (parse_record returns {})
            non_dataset_related_count += 1

# 4) Calculate and print percentages
total = dataset_related_count + non_dataset_related_count
if total == 0:
    # A CSV without data rows would otherwise raise ZeroDivisionError below
    # and hand plt.pie two zero-sized wedges.
    print(f"No rows found in {file_path}; nothing to report.")
else:
    print(f"Dataset-Related Papers: {dataset_related_count} ({dataset_related_count/total*100:.1f}%)")
    print(f"Non-Dataset-Related Papers: {non_dataset_related_count} ({non_dataset_related_count/total*100:.1f}%)")

    # 5) Plot pie chart
    labels = ["Dataset-Related", "Non-Dataset-Related"]
    sizes  = [dataset_related_count, non_dataset_related_count]
    colors = ["#4CAF50", "#FF5733"]

    plt.figure(figsize=(6,6))
    plt.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90, colors=colors)
    plt.title("Distribution of Dataset-Related vs Non-Dataset-Related Papers")
    plt.axis("equal")
    plt.show()


In [None]:
!pip install chardet

In [None]:
import csv
import json
import ast
from collections import Counter
import matplotlib.pyplot as plt
import chardet

file_path = "results_merged_full_final.csv"
with open(file_path, "rb") as raw:
    detected = chardet.detect(raw.read(100000))
encoding_used = detected["encoding"]
print(f"Detected Encoding: {encoding_used}")
# Only the first 100 kB are sampled, so chardet can answer "ascii" for a UTF-8
# file whose non-ASCII text comes later, or None when it cannot decide. ASCII is
# a subset of UTF-8, so reading as UTF-8 is safe in both cases and keeps
# errors="replace" below from corrupting later characters.
if encoding_used is None or encoding_used.lower() == "ascii":
    encoding_used = "utf-8"

#  2) helper to strip fences and backticks
def strip_fences(s: str) -> str:
    """Remove Markdown fence markers (``` or ```json) while keeping the content.

    Markers are removed from the start and end of each line. A line holding only
    a marker disappears. A line that also carries content, such as a one-line
    ```json {...} ``` response, keeps that content instead of being dropped as a
    whole.

    Args:
        s: Raw text, possibly wrapped in a fenced block. Falsy values (``None``,
            ``""``) are returned unchanged.

    Returns:
        The text without fence markers and without surrounding whitespace.
    """
    if not s:
        return s
    fence = "```"
    kept = []
    for line in s.splitlines():
        text = line.strip()
        if not (text.startswith(fence) or text.endswith(fence)):
            kept.append(line)  # ordinary line: keep it untouched
            continue
        if text.startswith(fence):
            text = text[len(fence):]
            if text.startswith("json"):  # language tag of the opening fence
                text = text[len("json"):]
        if text.endswith(fence):
            text = text[:-len(fence)]
        text = text.strip()
        if text:
            kept.append(text)
    return "\n".join(kept).strip()

#  3) helper to parse the field 
def parse_conference_field(raw: str) -> str:
    """Extract the conference code from a raw CSV cell.

    The cell is trimmed and passed through ``strip_fences``. The result is
    parsed as JSON and then as a Python literal; the first parse that yields a
    dict containing ``"conference"`` supplies the return value. If neither
    does, the cleaned text itself is returned with surrounding quotes and
    spaces removed.

    Args:
        raw: Cell text, or ``None``.

    Returns:
        The conference code as a string; ``""`` when ``raw`` is ``None``.
    """
    if raw is None:
        return ""
    text = strip_fences(raw.strip())

    # (parser, exception that means "not this format") in the order they are tried
    attempts = (
        (json.loads, json.JSONDecodeError),
        (ast.literal_eval, Exception),
    )
    for parse, not_this_format in attempts:
        try:
            candidate = parse(text)
        except not_this_format:
            continue
        if isinstance(candidate, dict) and "conference" in candidate:
            return str(candidate["conference"]).strip()

    # Bare value such as "CCS" or 'CCS'
    return text.strip("'\" ")

# 4) read CSV and count
# Venue code -> consolidated conference name. Built once, outside the row loop;
# codes not listed here are counted under their own name.
mapping = {
    "USS": "USENIX", "USENIX ATC": "USENIX", "ATC": "USENIX",
    "FAST":"USENIX", "NSDI":"USENIX", "OSDI":"USENIX",
    "NDSS":"NDSS", "MADWeb":"NDSS",
    "ACSAC":"ACSAC","AISCC":"ACSAC",
    "TPDS":"SP","TASLP":"SP",
    "CCS":"CCS","ASIA CCS":"CCS"
}
conference_counter = Counter()
total_rows = 0

with open(file_path, "r", encoding=encoding_used, errors="replace") as cf:
    reader = csv.DictReader(cf)
    for row in reader:
        total_rows += 1
        raw = row.get("conference_name", "")
        conf = parse_conference_field(raw)
        if conf:
            conf_group = mapping.get(conf, conf)
            conference_counter[conf_group] += 1
        else:
            # NOTE(review): rows without any conference value are attributed to
            # "SP" - confirm this default is intended.
            conference_counter["SP"] += 1

# 5) report & plot
print(f"\nProcessed rows: {total_rows}")
print("Papers per consolidated conference:")
for conf, cnt in conference_counter.items():
    print(f"  {conf:10s}: {cnt}")

# visualize
labels = list(conference_counter.keys())
counts = list(conference_counter.values())

plt.figure(figsize=(10,6))
plt.bar(labels, counts)
plt.xticks(rotation=45, ha="right")
plt.xlabel("Conference")
plt.ylabel("Number of Papers")
plt.title("Total Papers per Consolidated Conference")
plt.tight_layout()
plt.show()


In [None]:
import csv
import json
import ast
import chardet
import matplotlib.pyplot as plt
from collections import Counter

file_path = "results_merged_full_final.csv"

# 1) Detect encoding
# chardet sees only the first 100 kB: it may answer "ascii" for a UTF-8 file
# whose non-ASCII text comes later, or None when undecided. Both are read as
# UTF-8 (a superset of ASCII) so errors="replace" below does not mangle text.
with open(file_path, "rb") as f:
    enc = chardet.detect(f.read(100000))["encoding"]
print("Detected Encoding:", enc)
if enc is None or enc.lower() == "ascii":
    enc = "utf-8"

# 2) Helper to strip any ```json``` fences and backticks, then parse
def clean_and_parse(raw):
    """Parse a CSV cell holding a model response into a dict.

    Every ```json and ``` marker is removed, then the text is parsed with
    ``json.loads`` and, failing that, ``ast.literal_eval``. When the text as a
    whole cannot be parsed (for example the JSON is surrounded by prose), the
    span from the first ``{`` to the last ``}`` is tried the same way.

    Args:
        raw: Cell text; falsy values yield ``{}``.

    Returns:
        The parsed dict, or ``{}`` when parsing fails or the parsed value is
        not a dict.
    """
    if not raw:
        return {}

    unparsed = object()  # sentinel: "could not parse", distinct from a parsed None

    def attempt(text):
        """Parse as JSON, then as a Python literal; return the sentinel on failure."""
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass
        try:
            return ast.literal_eval(text)
        except Exception:
            return unparsed

    # remove any backtick fences
    s = raw.replace("```json", "").replace("```", "").strip()
    obj = attempt(s)
    if obj is unparsed:
        # Fallback used only for text that did not parse at all, so inputs that
        # parsed before keep their result.
        first, last = s.find("{"), s.rfind("}")
        if 0 <= first < last:
            obj = attempt(s[first:last + 1])
    return obj if isinstance(obj, dict) else {}

# 3) Count dataset-related papers per year
yearly_dataset_papers = Counter()

with open(file_path, "r", encoding=enc, errors="replace") as f:
    reader = csv.DictReader(f)
    for row in reader:
        # parse year
        ydict = clean_and_parse(row.get("published_year", ""))
        year = ydict.get("year")
        # normalize to int; surrounding whitespace such as "2019 " is tolerated
        if isinstance(year, str) and year.strip().isdigit():
            year = int(year.strip())
        elif not isinstance(year, int):
            continue  # skip if we can't get a valid year

        # parse datasets
        ddict = clean_and_parse(row.get("dataset_name", ""))
        datasets = ddict.get("datasets")
        if isinstance(datasets, list) and datasets:
            yearly_dataset_papers[year] += 1

# 4) Prepare & plot
years  = sorted(yearly_dataset_papers)
counts = [yearly_dataset_papers[y] for y in years]

if years:
    plt.figure(figsize=(12,6))
    plt.plot(years, counts, marker="o", linestyle="-")
    plt.xlabel("Year")
    plt.ylabel("Number of Dataset-Related Papers")
    plt.title("Temporal Trends in Dataset Usage (2015-2024)")
    plt.xticks(range(min(years), max(years)+1), rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()
else:
    # min()/max() raise ValueError on an empty sequence, so there is nothing to plot
    print("No dataset-related papers with a valid year found; skipping plot.")

# 5) Print the final counts
print("\nYearly Dataset Paper Counts:")
for y in years:
    print(f"{y}: {yearly_dataset_papers[y]} papers")


Dataset Summary File (paper-dataset instances)

In [None]:
import csv, json, re, ast

# CSV read by process_csv_to_summary below (the merged per-paper results file).
INPUT = "results_merged_full_final.csv"
# CSV written by process_csv_to_summary: one row per dataset entry of each paper.
OUTPUT = "datasets_summary_updated_final_new.csv"

#  helpers
def between_backticks(s: str) -> str:
    """Return the text inside the first triple-backtick fence of ``s``.

    An optional ``json`` tag after the opening fence and whitespace next to
    either fence are left out. A string without a fence is returned unchanged;
    anything that is not a string yields ``""``.
    """
    if isinstance(s, str):
        fenced = re.search(r"```(?:json)?\s*(.+?)\s*```", s, flags=re.S)
        if fenced is None:
            return s
        return fenced.group(1)
    return ""

def parse_any(cell):
    """Parse a cell as JSON or, failing that, as a Python literal.

    The cell is converted to ``str``, reduced to the content of its backtick
    fence (if any) and trimmed before parsing.

    Returns:
        The parsed value, or ``None`` when ``cell`` is ``None`` or neither
        parser accepts the text.
    """
    if cell is None:
        return None
    text = between_backticks(str(cell)).strip()
    # Strict JSON first, then Python literals (single quotes, etc.)
    for parser in (json.loads, ast.literal_eval):
        try:
            return parser(text)
        except Exception:
            continue
    return None

def get_title(cell):
    """Return the paper title stored in a cell.

    Uses the ``"title"`` value when the cell parses to a dict containing that
    key. Otherwise the cell text (fence removed, trimmed) is returned, or
    ``"N/A"`` when that text is empty.
    """
    parsed = parse_any(cell)
    if not (isinstance(parsed, dict) and "title" in parsed):
        # fallback: plain string without backticks
        plain = between_backticks(str(cell or "")).strip()
        return plain if plain else "N/A"
    return str(parsed["title"]).strip()

def get_year(cell):
    """Return the ``"year"`` value of a parsed cell as a trimmed string, else ``"N/A"``."""
    parsed = parse_any(cell)
    has_year = isinstance(parsed, dict) and "year" in parsed
    return str(parsed["year"]).strip() if has_year else "N/A"

def get_conference(cell):
    """Return the ``"conference"`` value of a parsed cell as a trimmed string, else ``"N/A"``."""
    parsed = parse_any(cell)
    if not isinstance(parsed, dict):
        return "N/A"
    if "conference" not in parsed:
        return "N/A"
    return str(parsed["conference"]).strip()

def norm_name(s: str) -> str:
    """Aggressively normalize a dataset name so it can be used as a join key.

    The name is lower-cased and every character other than ``a-z`` and ``0-9``
    is removed. Non-string values (the names come from parsed model output and
    may be numbers or lists) are converted with ``str`` first instead of
    raising ``AttributeError``.

    Args:
        s: Dataset name; any falsy value yields ``""``.

    Returns:
        The normalized key.
    """
    if not s:
        return ""
    return re.sub(r"[^a-z0-9]+", "", str(s).lower())

def ensure_list(x):
    """Return ``x`` itself when it is a list; otherwise return a new empty list."""
    if not isinstance(x, list):
        return []
    return x

# main
def process_csv_to_summary(input_csv, output_csv):
    """Flatten the per-paper results CSV into one output row per dataset.

    For each input row the ``dataset_name`` cell supplies the dataset entries
    (list under ``"datasets"``). The ``dataset_analysis_combined`` and
    ``dataset_categories`` cells are indexed by normalized dataset name and
    joined onto those entries. Values that cannot be found are written as
    ``"N/A"``.

    Args:
        input_csv: Path of the merged results CSV (read as UTF-8).
        output_csv: Path of the summary CSV to write.
    """
    fieldnames = [
        "title", "dataset_name", "contributors", "doi", "url",
        "availability", "labeling_type", "dataset_type",
        "category", "subcategory", "conference_name", "published_year"
    ]

    def wrapped_list(cell, wrapper_key):
        """Parse ``cell`` and return the list stored under ``wrapper_key`` (or [])."""
        parsed = parse_any(cell)
        inner = parsed.get(wrapper_key) if isinstance(parsed, dict) else None
        return ensure_list(inner)

    def index_by_name(entries):
        """Map normalized dataset name -> entry, ignoring entries that are not dicts."""
        return {
            norm_name(entry.get("dataset_name")): entry
            for entry in entries
            if isinstance(entry, dict)
        }

    out_rows = []
    with open(input_csv, "r", encoding="utf-8", errors="replace", newline="") as f:
        for row in csv.DictReader(f):
            title = get_title(row.get("title"))
            year = get_year(row.get("published_year"))
            conf = get_conference(row.get("conference_name"))

            datasets = wrapped_list(row.get("dataset_name"), "datasets")
            dac_by = index_by_name(
                wrapped_list(row.get("dataset_analysis_combined"), "dataset_analysis_combined")
            )
            cat_by = index_by_name(
                wrapped_list(row.get("dataset_categories"), "dataset_categories")
            )

            for d in datasets:
                if not isinstance(d, dict):
                    continue
                raw_name = d.get("dataset_name", "N/A")
                key = norm_name(raw_name)

                # An empty dict stands in for "no match", so every lookup
                # below falls back to "N/A".
                analysis = dac_by.get(key, {})
                categories = cat_by.get(key, {})

                out_rows.append({
                    "title": title,
                    "dataset_name": raw_name,
                    "contributors": d.get("contributors", "N/A"),
                    "doi": d.get("doi", "N/A"),
                    "url": d.get("url", "N/A"),
                    "availability": analysis.get("availability", "N/A"),
                    "labeling_type": analysis.get("labeling_type", "N/A"),
                    "dataset_type": analysis.get("dataset_type", "N/A"),
                    "category": categories.get("category", "N/A"),
                    "subcategory": categories.get("subcategory", "N/A"),
                    "conference_name": conf,
                    "published_year": year,
                })

    with open(output_csv, "w", encoding="utf-8", newline="") as out:
        summary_writer = csv.DictWriter(out, fieldnames=fieldnames)
        summary_writer.writeheader()
        summary_writer.writerows(out_rows)

    print(f"Processed data saved to {output_csv}")

# Run
# Builds the summary CSV from INPUT and writes it to OUTPUT when this code is
# executed as the main module.
if __name__ == "__main__":
    process_csv_to_summary(INPUT, OUTPUT)


Frequently Used Datasets List

In [None]:
import csv
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Function to normalize dataset names
def normalize_dataset_name(name):
    """Return a canonical, lower-case form of a dataset name for counting.

    Surrounding whitespace is removed, the name is lower-cased and ``-`` / ``_``
    are dropped. Names containing ``cifar10`` (which includes ``cifar100``) are
    merged into one label, as are names containing ``enron``.

    Returns:
        The canonical name, or ``None`` for an empty name or the literal
        ``"N/A"``.
    """
    if not name:
        return None
    if name == "N/A":
        return None

    canonical = name.strip().lower()
    for separator in ("-", "_"):
        canonical = canonical.replace(separator, "")

    # (substring to look for, label the name is merged into), checked in order
    merged_labels = (
        ("cifar10", "cifar10 and cifar100"),  # Group CIFAR-10 and CIFAR-100 as one
        ("enron", "enron email dataset"),
    )
    for needle, label in merged_labels:
        if needle in canonical:
            return label
    return canonical

# Function to extract public datasets from CSV
def extract_public_datasets(file_path, output_csv, top_n_saved=15, top_n_printed=30):
    """Count how often each public dataset appears in the summary CSV.

    Rows whose ``availability`` is exactly ``public`` (case-insensitive, after
    trimming) are counted under their normalized ``dataset_name``. The most
    frequent names are written to ``output_csv`` and printed.

    Args:
        file_path: Summary CSV with ``dataset_name`` and ``availability`` columns.
        output_csv: Destination CSV with ``dataset_name``/``frequency`` columns.
        top_n_saved: Number of most frequent datasets written to ``output_csv``.
        top_n_printed: Number of most frequent datasets printed.

    Returns:
        collections.Counter: normalized dataset name -> number of rows.
    """
    public_datasets = Counter()

    # Read the CSV file
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for row in csv.DictReader(file):
            # DictReader fills cells missing from a short row with None, so
            # fall back to "" before calling str methods.
            dataset_name = (row.get("dataset_name") or "").strip()
            availability = (row.get("availability") or "").strip().lower()
            if not dataset_name or availability != "public":
                continue
            normalized_name = normalize_dataset_name(dataset_name)
            if normalized_name:
                public_datasets[normalized_name] += 1

    # Write to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["dataset_name", "frequency"])
        writer.writerows(public_datasets.most_common(top_n_saved))

    print(f"Top public datasets saved to {output_csv}\n")

    # Print the top public datasets and their frequencies
    print("Top Public Datasets by Frequency:")
    for dataset_name, frequency in public_datasets.most_common(top_n_printed):
        print(f"{dataset_name}: {frequency}")

    return public_datasets

# Function to plot the datasets safely
def plot_top_datasets(dataset_counts, output_png, top_n=30):
    """Draw a horizontal bar chart of the most frequent datasets and save it.

    Args:
        dataset_counts: ``collections.Counter`` of dataset name -> frequency.
        output_png: Path the figure is saved to (300 dpi).
        top_n: Maximum number of datasets shown. The default of 30 matches the
            "Top 30" wording this chart has always used; the code previously
            plotted 35 bars under that title.

    Returns:
        None. Nothing is drawn or saved when ``dataset_counts`` is empty.
    """
    sorted_datasets = dataset_counts.most_common(top_n)

    # Prevent plotting empty data
    if not sorted_datasets:
        print("No public datasets found. Skipping plot generation.")
        return

    dataset_names = [item[0] for item in sorted_datasets]
    frequencies = [item[1] for item in sorted_datasets]

    # Set Seaborn style for better visuals
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(14, 8))

    # Create horizontal bar plot
    bars = sns.barplot(
        x=frequencies,
        y=dataset_names,
        palette="Blues_d",
        orient="h",
    )

    # Annotate frequencies on the bars; zip stops at the shorter sequence, so an
    # unexpected extra patch cannot index past the end of `frequencies`.
    for bar, frequency in zip(bars.patches, frequencies):
        plt.text(
            bar.get_width() + 1,  # Position to the right of the bar
            bar.get_y() + bar.get_height() / 2,  # Centered vertically
            f"{frequency}",
            fontsize=10,
            va="center",
            ha="left",
        )

    # Add titles and labels; the title states the number of bars actually drawn
    plt.xlabel("Frequency", fontsize=12)
    plt.ylabel("Dataset Name", fontsize=12)
    plt.title(f"Top {len(sorted_datasets)} Public Datasets by Frequency", fontsize=16, fontweight="bold")
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)

    # Adjust layout for better spacing
    plt.tight_layout()
    plt.savefig(output_png, dpi=300)
    plt.show()

# Main script
# input_csv is the summary file written by process_csv_to_summary (same file
# name as OUTPUT in that cell); the other two paths are this cell's outputs.
input_csv = "datasets_summary_updated_final_new.csv"
output_csv = "frequently_used_dataset_list.csv"
output_png = "top_public_datasets.png"

# Extract and plot datasets
# The Counter returned by the extraction step is the input of the plot.
public_datasets = extract_public_datasets(input_csv, output_csv)
plot_top_datasets(public_datasets, output_png)


Research Domain Trends - Descending Order (Top)

In [None]:
import pandas as pd
import json

# Load the CSV file; the code below reads its "domain" column.
file_path = "results_merged_full_final.csv"

# Try utf-8 first, fall back to latin1 if the bytes are not valid UTF-8.
# latin1 assigns a character to every byte value, so the fallback cannot fail
# on decoding, but non-ASCII text may come out garbled if the file actually
# uses some other encoding.
try:
    df = pd.read_csv(file_path, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding="latin1")

# Parse list of domain JSONs from each cell
def parse_domain_list(cell):
    """Parse one ``domain`` cell into a list of domain/subdomain records.

    The cell is expected to hold a JSON list of objects, optionally wrapped in
    a Markdown code fence (```json ... ```) or preceded by other text.

    Args:
        cell: Raw cell value. Anything that is not a string (NaN, None,
            numbers, ...) yields an empty list.

    Returns:
        list[dict]: One ``{"high_level_domain": ..., "subdomain": ...}`` dict
        per entry whose two fields are non-empty strings after stripping.
        Malformed entries are skipped one by one, so a single bad entry does
        not discard the rest of the list.
    """
    # NaN is a float, so the type check also covers missing values.
    if not isinstance(cell, str):
        return []

    # The payload is a JSON array: take everything from the first "[" to the
    # last "]". This drops code fences and any surrounding text.
    start, end = cell.find("["), cell.rfind("]")
    if start == -1 or end < start:
        return []

    try:
        domain_list = json.loads(cell[start:end + 1])
    except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
        return []

    if not isinstance(domain_list, list):
        return []

    parsed = []
    for entry in domain_list:
        if not isinstance(entry, dict):
            continue
        high = entry.get("high_level_domain")
        sub = entry.get("subdomain")
        if not (isinstance(high, str) and isinstance(sub, str)):
            continue
        high, sub = high.strip(), sub.strip()
        # Filter after stripping so whitespace-only names are not kept as "".
        if high and sub:
            parsed.append({"high_level_domain": high, "subdomain": sub})
    return parsed

# Apply to the domain column
df["domain_parsed"] = df["domain"].apply(parse_domain_list)

# Explode the parsed domain list into one row per (paper, domain entry).
# Rows whose list is empty explode to NaN and are dropped. The copy makes
# domain_df an independent frame before new columns are assigned to it.
domain_df = df.explode("domain_parsed").dropna(subset=["domain_parsed"]).copy()

# Extract high_level_domain and subdomain into separate columns
domain_df["high_level_domain"] = domain_df["domain_parsed"].apply(lambda x: x["high_level_domain"])
domain_df["sub_domain"] = domain_df["domain_parsed"].apply(lambda x: x["subdomain"])

# Drop the helper column
domain_df = domain_df.drop(columns=["domain_parsed"])

# Count total occurrences of each high-level domain (value_counts is already
# sorted from most to least frequent)
high_level_counts = domain_df["high_level_domain"].value_counts().reset_index()
high_level_counts.columns = ["high_level_domain", "total_count"]

# Calculate percentage of each high-level domain
total_entries = high_level_counts["total_count"].sum()
high_level_counts["percentage"] = (high_level_counts["total_count"] / total_entries) * 100

# Count subdomain frequencies under each high-level domain
subdomain_counts = domain_df.groupby(["high_level_domain", "sub_domain"]).size().reset_index(name="count")

# Merge subdomain count with high-level totals
subdomain_counts = subdomain_counts.merge(high_level_counts[["high_level_domain", "total_count"]], on="high_level_domain")

# Calculate subdomain percentage under each domain
subdomain_counts["percentage"] = (subdomain_counts["count"] / subdomain_counts["total_count"]) * 100

# groupby returns rows in alphabetical key order. Sort so the saved file and
# the "Top 20" preview really list the most frequent first: largest
# high-level domain first, its subdomains kept together, most frequent
# subdomain first within each domain.
subdomain_counts = subdomain_counts.sort_values(
    ["total_count", "high_level_domain", "count", "sub_domain"],
    ascending=[False, True, False, True],
).reset_index(drop=True)

# Save output
output_file = "research_domain_trend.csv"
subdomain_counts.to_csv(output_file, index=False)

# Display summary
print(f" Results saved to {output_file}\n")

print(" Summary of High-Level Domains:")
print(high_level_counts)

print("\n Top 20 Subdomain Breakdown:")
print(subdomain_counts.head(20))


Research Subdomain and Subcategory

In [None]:
import pandas as pd
from collections import defaultdict
import json

def load_csv(file_path):
    """Load data from a CSV file into a pandas DataFrame.

    The file is decoded as UTF-8 first. If its bytes are not valid UTF-8 the
    read is retried as latin1, which accepts every byte value. Decoding a
    UTF-8 file directly as latin1 would silently garble non-ASCII text.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    try:
        return pd.read_csv(file_path, encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_csv(file_path, encoding="latin1")

def parse_json_field(field):
    """Decode a CSV cell that holds JSON, possibly wrapped in a ```json fence.

    Returns:
        The decoded object, or ``None`` when the cell is missing, is not a
        string, or does not contain valid JSON.
    """
    missing = pd.isna(field)
    if missing or not isinstance(field, str):
        return None

    # str.strip(chars) treats its argument as a set of characters: backticks
    # and the letters of "json" are peeled off both ends, then whitespace.
    payload = field.strip("```json")
    payload = payload.strip("```")
    payload = payload.strip()

    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError:
        decoded = None
    return decoded

def extract_subdomain_subcategory_counts(df):
    """Tally dataset subcategories per research subdomain and publication year.

    For every row, the ``published_year``, ``domain`` and
    ``dataset_categories`` cells are decoded with ``parse_json_field``. Rows
    without an integer year are skipped. Each (subdomain, subcategory) pair
    found on a row adds one to ``result[subdomain][year][subcategory]``.

    Returns:
        Nested ``defaultdict``: subdomain -> year -> subcategory -> count.
        Subdomain and subcategory keys are trimmed and lower-cased.
    """
    tallies = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for _, record in df.iterrows():
        # Publication year: expects {"year": ...}; digit strings become ints.
        year_payload = parse_json_field(record.get("published_year", ""))
        year = None
        if isinstance(year_payload, dict):
            year = year_payload.get("year")
        if isinstance(year, str) and year.isdigit():
            year = int(year)
        if not isinstance(year, int):
            continue

        # Subdomains: the domain cell is a list of dicts with a "subdomain" key.
        domain_payload = parse_json_field(record.get("domain", ""))
        domain_items = domain_payload if isinstance(domain_payload, list) else []
        subdomains = []
        for item in domain_items:
            if not isinstance(item, dict):
                continue
            name = item.get("subdomain")
            if name and isinstance(name, str):
                subdomains.append(name.strip().lower())

        # Dataset entries: {"dataset_categories": [ {...}, ... ]}.
        dataset_payload = parse_json_field(record.get("dataset_categories", ""))
        entries = []
        if isinstance(dataset_payload, dict):
            entries = dataset_payload.get("dataset_categories", [])
        if not isinstance(entries, list):
            entries = []

        # Usable subcategory labels of this row, in entry order.
        labels = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            raw_label = entry.get("subcategory", "Unknown")
            if not raw_label or raw_label in ["null", "nan"]:
                continue
            labels.append(str(raw_label).strip().lower())

        # Every subdomain of the row is credited with every label of the row.
        for subdomain in subdomains:
            for label in labels:
                tallies[subdomain][year][label] += 1

    return tallies

def generate_dataframe(subdomain_subcategory_counts):
    """Flatten the nested counts into a DataFrame limited to the top 10 subdomains.

    Args:
        subdomain_subcategory_counts: Mapping of
            subdomain -> year -> subcategory -> count.

    Returns:
        pandas.DataFrame with columns ``Research_Subdomain``, ``Year``,
        ``Subcategory`` and ``Count``, restricted to the ten subdomains with
        the largest summed ``Count``. When the mapping holds no counts an
        empty frame with those four columns is returned.
    """
    columns = ["Research_Subdomain", "Year", "Subcategory", "Count"]
    rows = [
        (subdomain, year, subcat, count)
        for subdomain, years in subdomain_subcategory_counts.items()
        for year, subcategories in years.items()
        for subcat, count in subcategories.items()
    ]

    # Explicit columns keep the schema stable even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)

    if df.empty:
        # Nothing to rank; return the (empty) frame instead of failing in groupby.
        print("\n No subdomain/subcategory counts found; returning an empty table.")
        return df

    # Get top 10 subdomains by total dataset usage
    top_subdomains = df.groupby("Research_Subdomain")["Count"].sum().nlargest(10).index
    df_filtered = df[df["Research_Subdomain"].isin(top_subdomains)]

    print("\n Top 10 Research Subdomain Trends Preview:")
    print(df_filtered.head())

    return df_filtered

# Main execution: load the merged results, count subcategories per
# subdomain/year, and keep only the top 10 subdomains.
file_path = "results_merged_full_final.csv"  # input file
df = load_csv(file_path)

counts = extract_subdomain_subcategory_counts(df)
df_final = generate_dataframe(counts)

# Save results
# NOTE(review): the later analysis cell loads
# "top_10_subdomain_subcategory_trends.csv" (no "_new" suffix), so this file
# does not appear to be read by that cell - confirm it is still needed.
output_csv = "top_10_subdomain_subcategory_trends_new.csv"
df_final.to_csv(output_csv, index=False)
print(f"\n Results saved to: {output_csv}")


Most Used Dataset Subcategory

In [None]:
import pandas as pd
from collections import defaultdict
import json

def load_csv(file_path):
    """Load data from a CSV file into a pandas DataFrame.

    The file is decoded as UTF-8 first. If its bytes are not valid UTF-8 the
    read is retried as latin1, which accepts every byte value. Decoding a
    UTF-8 file directly as latin1 would silently garble non-ASCII text.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    try:
        return pd.read_csv(file_path, encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_csv(file_path, encoding="latin1")

def parse_json_field(field):
    """Decode a CSV cell that holds JSON, possibly wrapped in a ```json fence.

    Returns:
        The decoded object, or ``None`` when the cell is missing, is not a
        string, or does not contain valid JSON.
    """
    missing = pd.isna(field)
    if missing or not isinstance(field, str):
        return None

    # str.strip(chars) treats its argument as a set of characters: backticks
    # and the letters of "json" are peeled off both ends, then whitespace.
    payload = field.strip("```json")
    payload = payload.strip("```")
    payload = payload.strip()

    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError:
        decoded = None
    return decoded

def extract_subdomain_subcategory_counts(df):
    """Tally dataset subcategories per research subdomain and publication year.

    For every row, the ``published_year``, ``domain`` and
    ``dataset_categories`` cells are decoded with ``parse_json_field``. Rows
    without an integer year are skipped. Each (subdomain, subcategory) pair
    found on a row adds one to ``result[subdomain][year][subcategory]``.

    Returns:
        Nested ``defaultdict``: subdomain -> year -> subcategory -> count.
        Subdomain and subcategory keys are trimmed and lower-cased.
    """
    tallies = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for _, record in df.iterrows():
        # Publication year: expects {"year": ...}; digit strings become ints.
        year_payload = parse_json_field(record.get("published_year", ""))
        year = None
        if isinstance(year_payload, dict):
            year = year_payload.get("year")
        if isinstance(year, str) and year.isdigit():
            year = int(year)
        if not isinstance(year, int):
            continue

        # Subdomains: the domain cell is a list of dicts with a "subdomain" key.
        domain_payload = parse_json_field(record.get("domain", ""))
        domain_items = domain_payload if isinstance(domain_payload, list) else []
        subdomains = []
        for item in domain_items:
            if not isinstance(item, dict):
                continue
            name = item.get("subdomain")
            if name and isinstance(name, str):
                subdomains.append(name.strip().lower())

        # Dataset entries: {"dataset_categories": [ {...}, ... ]}.
        dataset_payload = parse_json_field(record.get("dataset_categories", ""))
        entries = []
        if isinstance(dataset_payload, dict):
            entries = dataset_payload.get("dataset_categories", [])
        if not isinstance(entries, list):
            entries = []

        # Usable subcategory labels of this row, in entry order.
        labels = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            raw_label = entry.get("subcategory", "Unknown")
            if not raw_label or raw_label in ["null", "nan"]:
                continue
            labels.append(str(raw_label).strip().lower())

        # Every subdomain of the row is credited with every label of the row.
        for subdomain in subdomains:
            for label in labels:
                tallies[subdomain][year][label] += 1

    return tallies

def generate_dataframe(subdomain_subcategory_counts):
    """Flatten the nested counts into a DataFrame limited to the top 10 subdomains.

    Args:
        subdomain_subcategory_counts: Mapping of
            subdomain -> year -> subcategory -> count.

    Returns:
        pandas.DataFrame with columns ``Research_Subdomain``, ``Year``,
        ``Subcategory`` and ``Count``, restricted to the ten subdomains with
        the largest summed ``Count``. When the mapping holds no counts an
        empty frame with those four columns is returned.
    """
    columns = ["Research_Subdomain", "Year", "Subcategory", "Count"]
    rows = [
        (subdomain, year, subcat, count)
        for subdomain, years in subdomain_subcategory_counts.items()
        for year, subcategories in years.items()
        for subcat, count in subcategories.items()
    ]

    # Explicit columns keep the schema stable even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)

    if df.empty:
        # Nothing to rank; return the (empty) frame instead of failing in groupby.
        print("\n No subdomain/subcategory counts found; returning an empty table.")
        return df

    # Get top 10 subdomains by total dataset usage
    top_subdomains = df.groupby("Research_Subdomain")["Count"].sum().nlargest(10).index
    df_filtered = df[df["Research_Subdomain"].isin(top_subdomains)]

    print("\n Top 10 Research Subdomain Trends Preview:")
    print(df_filtered.head())

    return df_filtered

# Main execution: load the merged results, count subcategories per
# subdomain/year, and keep only the top 10 subdomains.
file_path = "results_merged_full_final.csv"  # input file
df = load_csv(file_path)

counts = extract_subdomain_subcategory_counts(df)
df_final = generate_dataframe(counts)

# Save the final results; the analysis cell that follows loads a file with
# this same name.
output_csv = "top_10_subdomain_subcategory_trends.csv"  # Final output file
df_final.to_csv(output_csv, index=False)
print(f"\n Results saved to: {output_csv}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Normalize and map similar subcategory names
def normalize_subcategory(subcategory):
    """Map a subcategory label onto its canonical spelling.

    The label is trimmed and lower-cased. Known aliases are replaced by their
    canonical name; any other label is returned in its cleaned form.
    """
    key = subcategory.strip().lower()
    # NOTE(review): "audit_reports" -> "audio_datasets" and "logs" -> "alerts"
    # merge labels that look semantically different - confirm this is intended.
    aliases = {
        "user activities": "user_activities",
        "user_activities": "user_activities",
        "audit_reports": "audio_datasets",
        "audio_visual_datasets": "audio_datasets",
        "logs": "alerts",
        "audio-visual_datasets": "audio_datasets",
        "cybercrime_infrastructure": "cybercrime_infrastructures",
    }
    if key in aliases:
        return aliases[key]
    return key

# Normalize and map similar subdomain names
def normalize_subdomain(subdomain):
    """Replace a known research-subdomain name with its short code.

    The name is trimmed and lower-cased before lookup. Names without a code
    are returned in their cleaned (trimmed, lower-case) form.
    """
    key = subdomain.strip().lower()
    short_codes = {
        "software and application security": "SAS",
        "human and societal aspects of security and privacy": "HSASP",
        "intrusion/anomaly detection and malware mitigation": "IDMM",
        "network security": "NS",
        "machine learning": "ML",
        "formal methods and theory of security": "FMTOS",
        "systems security": "SS",
        "privacy-preserving protocols": "PPP",
        "data management systems": "DMS",
        "cryptography": "CR",
    }
    if key in short_codes:
        return short_codes[key]
    return key

# Load and prepare the dataset
def load_csv(file_path):
    """Load data from a CSV file into a pandas DataFrame.

    The file is decoded as UTF-8 first (``DataFrame.to_csv`` writes UTF-8 by
    default). If its bytes are not valid UTF-8 the read is retried as latin1,
    which accepts every byte value.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    try:
        return pd.read_csv(file_path, encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_csv(file_path, encoding="latin1")

def clean_and_normalize(df, output_csv="cleaned_top_10_research_subdomains.csv", excluded_year=2024):
    """Normalize labels, keep the top 10 subdomains, and drop one excluded year.

    Steps, in order:
      1. Normalize ``Subcategory`` and derive ``Research Subdomain`` from
         ``Research_Subdomain``.
      2. Coerce ``Year`` to int, dropping rows whose year is not numeric.
      3. Keep the ten subdomains with the largest summed ``Count``.
      4. Write that table to ``output_csv`` (it still contains
         ``excluded_year`` rows).
      5. Return the table without the ``excluded_year`` rows.

    Args:
        df: Frame with ``Research_Subdomain``, ``Year``, ``Subcategory`` and
            ``Count`` columns. It is not modified.
        output_csv: Where the cleaned top-10 table is saved.
        excluded_year: Year removed from the returned frame. Defaults to
            2024 - presumably because that year is incomplete; confirm.

    Returns:
        pandas.DataFrame: the cleaned rows, excluding ``excluded_year``.
    """
    # Work on a copy so the caller's frame is left untouched.
    cleaned = df.copy()
    cleaned["Subcategory"] = cleaned["Subcategory"].astype(str).apply(normalize_subcategory)
    cleaned["Research Subdomain"] = cleaned["Research_Subdomain"].astype(str).apply(normalize_subdomain)

    cleaned["Year"] = pd.to_numeric(cleaned["Year"], errors="coerce")
    # Copy after filtering so the assignment below targets an independent
    # frame rather than a slice (avoids SettingWithCopyWarning).
    cleaned = cleaned.dropna(subset=["Year"]).copy()
    cleaned["Year"] = cleaned["Year"].astype(int)

    # The ranking is computed before the excluded year is removed.
    top_subdomains = cleaned.groupby("Research Subdomain")["Count"].sum().nlargest(10).index
    cleaned = cleaned[cleaned["Research Subdomain"].isin(top_subdomains)]

    cleaned.to_csv(output_csv, index=False)
    print(f" Saved cleaned data: {output_csv}")

    # Year is already a clean int column here, so only the exclusion remains.
    return cleaned[cleaned["Year"] != excluded_year].copy()

# Top 10 Subcategories Per Year
def analyze_top_10_subcategories_per_year(df):
    """Save and chart the ten highest-count subcategories of every year.

    Sums ``Count`` per (Year, Subcategory), keeps the ten largest totals
    within each year, writes them to ``top_10_subcategories_per_year.csv`` and
    draws a grouped bar chart saved as ``top_10_subcategories_per_year.png``.
    """
    yearly_totals = df.groupby(["Year", "Subcategory"])["Count"].sum().reset_index()
    # Years ascending, largest totals first inside each year, so head(10) per
    # group picks that year's leaders.
    ranked = yearly_totals.sort_values(["Year", "Count"], ascending=[True, False])
    top_10_per_year = ranked.groupby("Year").head(10).reset_index(drop=True)

    top_10_per_year.to_csv("top_10_subcategories_per_year.csv", index=False)
    print(" Saved: top_10_subcategories_per_year.csv")

    plt.figure(figsize=(14, 8))
    sns.barplot(data=top_10_per_year, x="Year", y="Count", hue="Subcategory", dodge=True, palette="tab10")

    # Labels and decorations.
    plt.xlabel("Year")
    plt.ylabel("Dataset Count")
    plt.title("Top 10 Subcategories Used in Each Year")
    # Legend sits outside the axes, to the right.
    plt.legend(title="Subcategory", bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.xticks(rotation=45)
    plt.grid(axis="y", linestyle="--", alpha=0.6)

    plt.tight_layout()
    plt.savefig("top_10_subcategories_per_year.png", dpi=300, bbox_inches="tight")
    plt.show()

# Emerging & Declining Subcategories
def analyze_emerging_declining_subcategories(df):
    """Save and plot each subcategory's yearly total as a line chart.

    Sums ``Count`` per (Year, Subcategory), writes the totals to
    ``subcategory_trends_over_time.csv`` and, when there is data, plots one
    line per subcategory to ``emerging_declining_subcategories.png``.
    """
    subcategory_trends = df.groupby(["Year", "Subcategory"])["Count"].sum().reset_index()
    subcategory_trends.to_csv("subcategory_trends_over_time.csv", index=False)
    print(" Saved: subcategory_trends_over_time.csv")

    # Years as rows, subcategories as columns; missing combinations count as 0.
    pivot_table = subcategory_trends.pivot(index="Year", columns="Subcategory", values="Count").fillna(0)

    # Prevent plotting empty data
    if pivot_table.empty:
        print("No subcategory trends found. Skipping plot generation.")
        return

    # Draw on an explicit Axes. DataFrame.plot opens a figure of its own when
    # no `ax` is passed, which would leave a separately created figure blank.
    fig, ax = plt.subplots(figsize=(14, 8))
    pivot_table.plot(kind="line", ax=ax, marker='o')
    ax.set_title("Emerging & Declining Dataset Subcategories Over Time")
    ax.set_xlabel("Year")
    ax.set_ylabel("Dataset Count")
    ax.legend(title="Subcategory", bbox_to_anchor=(1.05, 1), loc="upper left")
    ax.grid()
    fig.tight_layout()
    fig.savefig("emerging_declining_subcategories.png", dpi=300, bbox_inches="tight")
    plt.show()

# Subdomain vs Subcategory Correlation
def analyze_subcategory_subdomain_correlation(df):
    """Save and draw a subdomain-by-subcategory heatmap of summed counts.

    The cell values are total ``Count`` per (Research Subdomain, Subcategory)
    pair, not a statistical correlation coefficient. The totals are written
    to ``subcategory_subdomain_correlation.csv`` and the heatmap to
    ``subcategory_subdomain_correlation.png``.
    """
    pair_totals = df.groupby(["Research Subdomain", "Subcategory"])["Count"].sum()
    correlation_df = pair_totals.reset_index()
    correlation_df.to_csv("subcategory_subdomain_correlation.csv", index=False)
    print(" Saved: subcategory_subdomain_correlation.csv")

    # Subdomains as rows, subcategories as columns; absent pairs become 0.
    matrix = correlation_df.pivot(index="Research Subdomain", columns="Subcategory", values="Count")
    matrix = matrix.fillna(0)

    plt.figure(figsize=(14, 10))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", linewidths=0.5, linecolor="gray")

    plt.xlabel("Subcategory")
    plt.ylabel("Research Subdomain")
    plt.title("Subdomain vs Dataset Subcategory Correlation")
    plt.yticks(rotation=0)
    plt.xticks(rotation=45, ha="right")

    plt.tight_layout()
    plt.savefig("subcategory_subdomain_correlation.png", dpi=300, bbox_inches="tight")
    plt.show()

# Main Execution: read the top-10 subdomain table and normalize it.
file_path = "top_10_subdomain_subcategory_trends.csv"
df = load_csv(file_path)
# clean_and_normalize also writes cleaned_top_10_research_subdomains.csv and
# returns the rows without the year 2024.
df = clean_and_normalize(df)

# Each analysis writes its own CSV and PNG into the working directory.
analyze_top_10_subcategories_per_year(df)
analyze_emerging_declining_subcategories(df)
analyze_subcategory_subdomain_correlation(df)


Unique Datasets – Removing Repetition

In [None]:
import pandas as pd
import json


def normalize_dataset_name(name):
    """Build a comparison key for a dataset name.

    The name is converted to ``str``, trimmed and lower-cased, then hyphens,
    underscores and spaces are removed (other punctuation is kept). A small
    alias table finally collapses known variants onto one key.
    """
    key = str(name).strip().lower()
    for separator in ("-", "_", " "):
        key = key.replace(separator, "")

    # Common dataset name variations
    aliases = {
        "imdbdataset": "imdb",
        "imdbmoviereviews": "imdb",
    }
    if key in aliases:
        return aliases[key]
    return key


def extract_unique_datasets(file_path, output_csv, output_json):
    """Extract unique datasets, keeping one row per name/category/subcategory.

    Dataset names are compared through ``normalize_dataset_name``. The same
    dataset may appear several times when its category or subcategory differs.

    Args:
        file_path: Input CSV with ``dataset_name``, ``category``,
            ``subcategory``, ``availability``, ``labeling_type`` and
            ``dataset_type`` columns.
        output_csv: Path of the CSV file to write.
        output_json: Path of the JSON file to write (list of records).

    Returns:
        pandas.DataFrame with the de-duplicated rows.
    """
    # Load CSV file: UTF-8 first, latin1 when the bytes are not valid UTF-8.
    try:
        df = pd.read_csv(file_path, encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding="latin1")

    # Normalize dataset names (the helper converts its argument to str itself)
    df["dataset_name_normalized"] = df["dataset_name"].apply(normalize_dataset_name)

    # Group by normalized name while keeping category/subcategory distinct.
    # dropna=False keeps datasets whose category or subcategory is missing;
    # by default groupby drops every row with a NaN key.
    unique_datasets = (
        df.groupby(["dataset_name_normalized", "category", "subcategory"], dropna=False)
        .agg({
            "dataset_name": "first",  # Keep the first original dataset name
            "availability": "first",
            "labeling_type": "first",
            "dataset_type": "first",
        })
        .reset_index()
        .drop(columns=["dataset_name_normalized"])  # helper key, not output
    )

    # Save to CSV
    unique_datasets.to_csv(output_csv, index=False, encoding="utf-8")

    # Save to JSON
    unique_datasets.to_json(output_json, orient="records", indent=4)

    print(f" Unique dataset list saved to: {output_csv} and {output_json}")
    return unique_datasets


# Run the extraction: input table and the two output files (same stem, CSV and JSON).
file_path = "datasets_summary_updated_final_new.csv"  # Your updated CSV input file
output_csv = "unique_datasets_updated_new_last_final.csv"  # Output file for unique datasets (CSV)
output_json = "unique_datasets_updated_new_last_final.json"  # Output file for unique datasets (JSON)

# Extract unique datasets (also writes both output files)
unique_datasets_df = extract_unique_datasets(file_path, output_csv, output_json)

# Display a preview of the extracted datasets
print(unique_datasets_df.head())


In [None]:
import pandas as pd

# Define category name variations to standardize them.
# Series.replace with a dict swaps whole cell values that equal a key exactly,
# so spellings that differ in case or surrounding whitespace are not covered.
category_mapping = {
    "attacker_related": "attacker_related",
    "user_and_organizational_characteristics": "user_and_organizational_characteristics",
    "macro_level_internet_characteristics": "macro_level_internet_characteristics",
    "visual_and_multimedia_data": "visual_and_multimedia_data",
    "defender_artifacts": "defender_artifacts",
    "user_organizational_characteristics": "user_and_organizational_characteristics",
    "macro_internet_characteristics": "macro_level_internet_characteristics",
    "user & organizational characteristics": "user_and_organizational_characteristics",
    "user_&_organizational_characteristics": "user_and_organizational_characteristics",
    "macro-level internet characteristics": "macro_level_internet_characteristics",
}

# Load the dataset
file_path = "datasets_summary_updated_final_new.csv"
df = pd.read_csv(file_path, encoding="latin1")

# Standardize the category column
df['category'] = df['category'].replace(category_mapping)

# Keep only the three custom-created availability classes: "not shared",
# "but public" and "but restricted". The strings must match the cell text exactly.
filtered_df = df[df['availability'].isin(["Custom-created datasets, not shared", "Custom-created datasets but public", "Custom-created datasets, but restricted"])]

# Select only relevant columns
filtered_df = filtered_df[['title', 'dataset_name', 'category', 'subcategory', 'availability']]

# Save to CSV
# NOTE(review): a later cell writes a wider table (with labeling_type and
# dataset_type) to this same file name, replacing this output.
output_csv = "custom_created_datasets_with_categories.csv"
filtered_df.to_csv(output_csv, index=False)

# Print output confirmation
print(f"\nExtracted {len(filtered_df)} rows of custom-created datasets with their categories and saved to {output_csv}.")


In [None]:
import pandas as pd
from collections import Counter

# Define category name variations to standardize them.
# NOTE(review): "Alerts" -> "alerts" looks like a subcategory label, yet this
# mapping is only applied to the category column below - confirm.
category_mapping = {
    "attacker_related": "attacker_related",
    "user_and_organizational_characteristics": "user_and_organizational_characteristics",
    "macro_level_internet_characteristics": "macro_level_internet_characteristics",
    "visual_and_multimedia_data": "visual_and_multimedia_data",
    "defender_artifacts": "defender_artifacts",
    "user_organizational_characteristics": "user_and_organizational_characteristics",
    "macro_internet_characteristics": "macro_level_internet_characteristics",
    "user & organizational characteristics": "user_and_organizational_characteristics",
    "user_&_organizational_characteristics": "user_and_organizational_characteristics",
    "macro-level internet characteristics": "macro_level_internet_characteristics",
    "Alerts":"alerts",
}

# Load the dataset (the custom-created subset exported by an earlier cell)
file_path = "custom_created_datasets_with_categories.csv"
df = pd.read_csv(file_path, encoding="latin1")

# Standardize the category column
df['category'] = df['category'].replace(category_mapping)

# Keep only the three custom-created availability classes: "not shared",
# "but public" and "but restricted".
filtered_df = df[df['availability'].isin(["Custom-created datasets, not shared", "Custom-created datasets but public", "Custom-created datasets, but restricted"])]

# Count occurrences of subcategories (rows without a subcategory are ignored)
subcategory_counts = Counter(filtered_df['subcategory'].dropna())

# Convert to DataFrame for saving
subcategory_df = pd.DataFrame(subcategory_counts.items(), columns=['Subcategory', 'Count'])

# Sort by count in descending order
subcategory_df = subcategory_df.sort_values(by="Count", ascending=False)

# Save results to CSV
output_csv = "most_common_subcategories_custom_datasets.csv"
subcategory_df.to_csv(output_csv, index=False)

# Display top subcategories
print("\nMost Common Subcategories for Custom-Created Datasets:")
print(subcategory_df.head(20))  # Show the 20 most common subcategories
print(f"\nResults saved to {output_csv}")


In [None]:
import pandas as pd

# Define category name variations to standardize them.
# Matching is exact, which is why the title-cased
# "Macro-Level Internet Characteristics" needs its own entry.
category_mapping = {
    "attacker_related": "attacker_related",
    "user_and_organizational_characteristics": "user_and_organizational_characteristics",
    "macro_level_internet_characteristics": "macro_level_internet_characteristics",
    "visual_and_multimedia_data": "visual_and_multimedia_data",
    "defender_artifacts": "defender_artifacts",
    "user_organizational_characteristics": "user_and_organizational_characteristics",
    "macro_internet_characteristics": "macro_level_internet_characteristics",
    "Macro-Level Internet Characteristics": "macro_level_internet_characteristics",
    "user & organizational characteristics": "user_and_organizational_characteristics",
    "user_&_organizational_characteristics": "user_and_organizational_characteristics",
    "macro-level internet characteristics": "macro_level_internet_characteristics",
}

# Load the dataset
file_path = "datasets_summary_updated_final_new.csv"
df = pd.read_csv(file_path, encoding="latin1")

# Standardize the category column
df['category'] = df['category'].replace(category_mapping)

# Standardize the labeling_type column (treat the listed capitalizations of
# 'Not mentioned' as one value)
df['labeling_type'] = df['labeling_type'].replace({
    "Not mentioned": "not mentioned",
    "Not Mentioned": "not mentioned"
})

# Keep only the three custom-created availability classes: "not shared",
# "but public" and "but restricted".
filtered_df = df[df['availability'].isin(["Custom-created datasets, not shared", "Custom-created datasets but public","Custom-created datasets, but restricted"])]

# Select relevant columns
filtered_df = filtered_df[['title', 'dataset_name', 'category', 'subcategory', 'availability', 'labeling_type', 'dataset_type']]

# Save to CSV
# NOTE(review): an earlier cell writes a narrower table to this same file
# name; this write replaces it.
output_csv = "custom_created_datasets_with_categories.csv"
filtered_df.to_csv(output_csv, index=False)

# Count breakdown: number of rows per availability class
availability_counts = filtered_df['availability'].value_counts()

# Labeling type breakdown, per availability class
labeling_counts = filtered_df.groupby('availability')['labeling_type'].value_counts()

# Dataset type breakdown, per availability class
dataset_type_counts = filtered_df.groupby('availability')['dataset_type'].value_counts()

# Category and subcategory breakdown, per availability class
category_counts = filtered_df.groupby('availability')['category'].value_counts()
subcategory_counts = filtered_df.groupby('availability')['subcategory'].value_counts()

# Print results
print(f"\nExtracted {len(filtered_df)} rows of custom-created datasets with their categories and saved to {output_csv}.\n")

# Print availability counts
print("Custom-created dataset counts:")
for key, value in availability_counts.items():
    print(f"  {key}: {value}")

# Print labeling type breakdown
print("\nLabeling type breakdown for custom-created datasets (Standardized):")
print(labeling_counts)

# Print dataset type breakdown
print("\nDataset type breakdown for custom-created datasets:")
print(dataset_type_counts)

# Print category breakdown
print("\nCategory breakdown for custom-created datasets:")
print(category_counts)

# Print subcategory breakdown
print("\nSubcategory breakdown for custom-created datasets:")
print(subcategory_counts)


CDFs for All Papers and HLDs (High-Level Domains)

In [None]:
import pandas as pd
import json
import ast
import re
import matplotlib.pyplot as plt
from collections import Counter
import chardet

# File paths: input table, cumulative-count table, and the CDF figure.
INPUT_CSV = "results_merged_full_final.csv"
OUT_CSV = "cdf_and_raw_counts.csv"
OUT_PDF = "fractional_cdf_single_panel.pdf"

# Helper utilities
# Matches an opening fence (``` with an optional "json" tag and trailing
# whitespace) or a closing fence with leading whitespace.
# NOTE(review): nothing else in this cell references _fence_re; strip_fences
# works line by line instead - confirm whether this pattern is still needed.
_fence_re = re.compile(r"```(?:json)?\s*|\s*```")

def strip_fences(s: str) -> str:
    """Remove Markdown code-fence markers (``` / ```json) from a text cell.

    Fence markers are removed wherever they open or close a line. Any payload
    that shares a line with a fence is kept, so a single-line cell such as
    '```json {"a": 1} ```' yields '{"a": 1}' rather than an empty string.

    Args:
        s: Raw cell text. Non-string values yield "".

    Returns:
        The text without fence markers, stripped of surrounding whitespace.
    """
    if not isinstance(s, str):
        return ""

    fence = "```"
    kept = []
    for line in s.splitlines():
        text = line.strip()
        if text.startswith(fence):
            # Opening (or lone) fence: drop the marker and an optional "json"
            # tag, then a closing fence on the same line, and keep what is left.
            text = text[len(fence):]
            if text[:4].lower() == "json":
                text = text[4:]
            if text.endswith(fence):
                text = text[:-len(fence)]
            text = text.strip()
            if text:
                kept.append(text)
        elif text.endswith(fence):
            # Closing fence glued to the end of a payload line.
            kept.append(line.rstrip()[:-len(fence)])
        else:
            kept.append(line)
    return "\n".join(kept).strip()

def parse_published_year(cell: str):
    """Extract a publication year from a raw cell.

    Tries, in order: a JSON object with a ``year`` key, a Python dict literal
    with a ``year`` key, and finally the first run of four digits in the text.

    Returns:
        The year as ``int``, or ``None`` when the cell is missing or no year
        can be found.
    """
    is_nan = isinstance(cell, float) and pd.isna(cell)
    if cell is None or is_nan:
        return None

    text = strip_fences(str(cell).strip())

    # Structured forms first; any failure (bad syntax, non-numeric year) moves
    # on to the next strategy.
    for decode in (json.loads, ast.literal_eval):
        try:
            candidate = decode(text)
            if isinstance(candidate, dict) and "year" in candidate:
                return int(candidate["year"])
        except Exception:
            continue

    # fallback: first 4-digit sequence anywhere in the text
    match = re.search(r"\d{4}", text)
    return int(match.group(0)) if match else None

def parse_domain_list(cell: str):
    """Parse the domain cell (JSON list of dicts) into high-level domain names.

    Args:
        cell: Raw cell value; ``None`` and NaN yield an empty list.

    Returns:
        list[str]: The stripped ``high_level_domain`` of every dict entry that
        has a non-empty one. "Software and application security" is reported
        as "Security and privacy". Entries that are not dicts are skipped, and
        a missing or non-string domain only affects its own entry.
    """
    if cell is None or (isinstance(cell, float) and pd.isna(cell)):
        return []
    s = strip_fences(str(cell).strip())

    try:
        arr = json.loads(s)
    except Exception:
        return []
    if not isinstance(arr, list):
        return []

    out = []
    for d in arr:
        if not isinstance(d, dict):
            continue
        high = d.get("high_level_domain", "")
        # Coerce instead of calling .strip() on None or a number, which would
        # raise and lose the whole list.
        if not isinstance(high, str):
            high = str(high) if high is not None else ""
        high = high.strip()
        # normalize known mislabel
        if high == "Software and application security":
            high = "Security and privacy"
        if high:
            out.append(high)
    return out

# 1) Read CSV (detect encoding from the first 100000 bytes of the file)
with open(INPUT_CSV, "rb") as f:
    enc = chardet.detect(f.read(100000))["encoding"]
print(f"Detected encoding: {enc}")

# Any failure with the detected encoding triggers one retry with latin1.
try:
    df = pd.read_csv(INPUT_CSV, encoding=enc)
except Exception:
    print(f"Failed to read with {enc}, retrying latin1")
    df = pd.read_csv(INPUT_CSV, encoding="latin1")

print(f"Loaded {len(df)} rows; columns: {list(df.columns)}")

# 2) Parse year and domains
# The year column is the first column whose name contains "year" (any case).
possible_year_cols = [c for c in df.columns if "year" in c.lower()]
if not possible_year_cols:
    raise KeyError("No column containing 'year' found in CSV.")
year_col = possible_year_cols[0]
print(f"Using '{year_col}' as year column")

# Rows whose year cannot be parsed are reported, then dropped.
df["year"] = df[year_col].apply(parse_published_year)
missing = df["year"].isna().sum()
if missing:
    print(f"Warning: {missing} rows have no parsed year and will be dropped")
df = df.dropna(subset=["year"])
df["year"] = df["year"].astype(int)

# One list of high-level domain names per row.
df["HLDs"] = df["domain"].apply(parse_domain_list)

# 3) Top-5 HLDs overall (counted over every domain mention on every row)
all_hlds = [h for sub in df["HLDs"] for h in sub]
hld_counts = Counter(all_hlds)
top5_hlds = [h for h, _ in hld_counts.most_common(5)]
print("Top-5 HLDs:", top5_hlds)

# 4) Cumulative raw counts per year
years = sorted(df["year"].unique())
cum_total_raw = []
cum_counts_hlds_raw = {h: [] for h in top5_hlds}

# For each year, count the rows published up to and including it: overall,
# and per top-5 HLD (a row counts once for an HLD if that HLD is in its list).
for yr in years:
    subset = df[df["year"] <= yr]
    cum_total_raw.append(subset.shape[0])
    for h in top5_hlds:
        cnt = subset[subset["HLDs"].apply(lambda L: h in L)].shape[0]
        cum_counts_hlds_raw[h].append(cnt)

# 5) Percentages relative to final value
def to_percentages(raw_list):
    """Express each value as a percentage of the list's last value.

    Returns a list of floats of the same length. When the list is empty or
    its last value is 0, every entry is 0.0.
    """
    if not raw_list or raw_list[-1] == 0:
        return [0.0] * len(raw_list)
    denominator = raw_list[-1]
    return [(value / denominator) * 100.0 for value in raw_list]

cum_total_pct = to_percentages(cum_total_raw)
cum_counts_hlds_pct = {h: to_percentages(cum_counts_hlds_raw[h]) for h in top5_hlds}

# 6) Save CSV: one row per year, with raw and percentage columns for the
# total and for each top-5 HLD (spaces in HLD names become underscores).
data = {"Year": years, "Total_Raw": cum_total_raw, "Total_Pct": cum_total_pct}
for h in top5_hlds:
    data[h.replace(" ", "_") + "_Raw"] = cum_counts_hlds_raw[h]
    data[h.replace(" ", "_") + "_Pct"] = cum_counts_hlds_pct[h]

cdf_df = pd.DataFrame(data)
cdf_df.to_csv(OUT_CSV, index=False)
print(f"Saved CSV -> {OUT_CSV}")

# 7) Fractional CDF plot

# Each series is divided by its own final value, so every curve ends at 1.
# A final value of 0 (or an empty list) falls back to a divisor of 1.
total_final = cum_total_raw[-1] if cum_total_raw else 1
frac_total = [x / total_final for x in cum_total_raw]

frac_counts_hlds = {}
for h in top5_hlds:
    raw = cum_counts_hlds_raw[h]
    final = raw[-1] if raw and raw[-1] > 0 else 1
    frac_counts_hlds[h] = [x / final for x in raw]

plt.figure(figsize=(8, 5))
plt.plot(years, frac_total, marker="s", linestyle="-", label="All Papers", linewidth=2, color="orange")

# Five colors and line styles, one per top-5 HLD.
colors = ["black", "forestgreen", "royalblue", "crimson", "teal"]
line_styles = ["-", ":", "-.", "--", (0, (3,1,1,1))]
for idx, h in enumerate(top5_hlds):
    plt.plot(years, frac_counts_hlds[h], marker="o", linestyle=line_styles[idx],
             color=colors[idx], label=h, linewidth=1.6)

plt.xlabel("Year", fontsize=11)
plt.ylabel("Cumulative Fraction (0–1)", fontsize=11)
# NOTE(review): the year range in the title is hard-coded rather than taken
# from `years` - confirm it matches the data.
plt.title("Fractional CDF (2015–2024*)")
plt.ylim(0, 1.02)
# NOTE(review): min()/max() raise ValueError when `years` is empty.
plt.xlim(min(years)-0.5, max(years)+0.5)
plt.xticks(years, rotation=45)
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(loc="upper left", fontsize="small", framealpha=0.85)
# Footnote placed below the axes, in axes coordinates.
plt.text(0.5, -0.12, "* 2024 is partial (through Sept. 2024)", transform=plt.gca().transAxes, ha="center", fontsize=8)
plt.tight_layout()
plt.savefig(OUT_PDF, bbox_inches="tight")
# The figure is closed after saving, so it is not shown inline.
plt.close()
print(f"Saved PDF -> {OUT_PDF}")


Domain Coverage

In [None]:
import pandas as pd
import numpy as np
import json
import ast

# Config / mapping
# Maps a raw conference label to the conference group it is reported under.
# Lookups are exact (case-sensitive); unmapped labels are used as they are.
# NOTE(review): "TPDS" and "TASLP" are mapped to "SP" - confirm this grouping
# is intended.
conference_mapping = {
    "USS": "USENIX",       "USENIX ATC": "USENIX",   "ATC": "USENIX",
    "FAST": "USENIX",      "NSDI": "USENIX",         "OSDI": "USENIX",
    "NDSS": "NDSS",        "MADWeb": "NDSS",
    "ACSAC": "ACSAC",      "AISCC": "ACSAC",
    "SP": "SP",            "TPDS": "SP",             "TASLP": "SP",
    "CCS": "CCS",          "ASIA CCS": "CCS",        "CCSNONE": "CCS",
}


# Input table and the two result files of this cell.
input_file = "results_merged_full_final.csv"   # adjust path if needed
output_file1 = "domain_coverage_by_conference_percentiles_final.csv"
output_file2 = "non_sp_counts_per_conference.csv"

# Helpers
def strip_fences(s: str) -> str:
    """Remove Markdown code-fence markers (``` / ```json) from a text cell.

    Fence markers are removed wherever they open or close a line. Any payload
    that shares a line with a fence is kept, so a single-line cell such as
    '```json {"a": 1} ```' yields '{"a": 1}' rather than an empty string.

    Args:
        s: Raw cell text. Non-string values yield "".

    Returns:
        The text without fence markers, stripped of surrounding whitespace.
    """
    if not isinstance(s, str):
        return ""

    fence = "```"
    kept = []
    for line in s.splitlines():
        text = line.strip()
        if text.startswith(fence):
            # Opening (or lone) fence: drop the marker and an optional "json"
            # tag, then a closing fence on the same line, and keep what is left.
            text = text[len(fence):]
            if text[:4].lower() == "json":
                text = text[4:]
            if text.endswith(fence):
                text = text[:-len(fence)]
            text = text.strip()
            if text:
                kept.append(text)
        elif text.endswith(fence):
            # Closing fence glued to the end of a payload line.
            kept.append(line.rstrip()[:-len(fence)])
        else:
            kept.append(line)
    return "\n".join(kept).strip()

def parse_conference_field(raw):
    """Turn a raw conference cell into a (mapped) conference name.

    The cell may hold a JSON object or a Python dict literal with a
    ``conference`` key, optionally inside a code fence, or just the bare name.
    Names are passed through ``conference_mapping``; unmapped names are
    returned unchanged.

    Returns:
        The conference name, or ``None`` for missing or empty input.
    """
    if raw is None or (isinstance(raw, float) and pd.isna(raw)):
        return None
    text = str(raw).strip()
    if not text:
        return None
    text = strip_fences(text)

    # Structured forms: JSON first, then a Python literal. A decoder that
    # fails, or yields something without a "conference" key, is skipped.
    for decode in (json.loads, ast.literal_eval):
        try:
            parsed = decode(text)
            if isinstance(parsed, dict) and "conference" in parsed:
                name = str(parsed["conference"]).strip()
                if not name:
                    return None
                return conference_mapping.get(name, name)
        except Exception:
            continue

    # Bare value: drop surrounding quotes and spaces.
    bare = text.strip("\"' ")
    if not bare:
        return None
    return conference_mapping.get(bare, bare)

def parse_domain_list(cell):
    """Return the high-level domains named in one raw ``domain`` cell.

    The cell is cleaned with ``strip_fences`` and parsed as JSON, falling back
    to Python-literal syntax. Both parse routes go through the same
    normalisation, so the result does not depend on the quoting style used in
    the cell.

    Normalisation per list entry (non-dict entries are ignored):
      * ``high_level_domain`` and ``subdomain`` are coerced to stripped
        strings (``None`` becomes ``""``); ``subdomain`` is lower-cased;
      * the known mislabel "Software and application security" (in either
        field) is rewritten to "Security and privacy";
      * entries whose high-level domain ends up empty are skipped.

    Args:
        cell: Cell value; may be ``None``, a float NaN, or any object whose
            ``str()`` form holds the serialized list.

    Returns:
        A list of high-level domain names in input order (duplicates kept).
        Empty when the cell is missing, unparseable, or not a list.
    """
    if cell is None or (isinstance(cell, float) and pd.isna(cell)):
        return []
    s = str(cell).strip()
    if not s:
        return []
    s = strip_fences(s)

    arr = None
    for loader in (json.loads, ast.literal_eval):
        try:
            arr = loader(s)
            break  # first successful parse wins
        except Exception:
            continue
    if not isinstance(arr, list):
        return []

    out = []
    for d in arr:
        if not isinstance(d, dict):
            continue
        high = d.get("high_level_domain", "")
        sub = d.get("subdomain", "")
        high = "" if high is None else str(high).strip()
        sub = "" if sub is None else str(sub).strip().lower()
        # fix known mislabel
        if high == "Software and application security" or sub == "software and application security":
            high = "Security and privacy"
        if high:
            out.append(high)
    return out

# Main
# 1) load (retry as latin1 when the file is not valid UTF-8)
try:
    df = pd.read_csv(input_file, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(input_file, encoding="latin1")

# Row count right after loading; the sanity messages below compare against this
# baseline so that an accidental row drop is actually visible.
n_rows_loaded = len(df)
print(f"Initially loaded {n_rows_loaded} rows from {input_file}")

# 2) parse conference (conservative, do not drop rows)
df["ConferenceRaw"] = df["conference_name"].apply(parse_conference_field)
print(f"After parsing Conference, # non-null = {df['ConferenceRaw'].notna().sum()} (should be {n_rows_loaded})")

# 3) parse domains: one list of high-level domains (HLDs) per paper
df["All_HLDs"] = df["domain"].apply(parse_domain_list)

# 4) deduplicate per paper; dict.fromkeys keeps the first occurrence of each HLD in order
df["Unique_HLDs"] = df["All_HLDs"].apply(lambda L: list(dict.fromkeys(L)) if isinstance(L, list) else [])

# 5) num distinct HLDs per paper
df["num_hlds"] = df["Unique_HLDs"].apply(len)
print(f"After parsing domains, total rows = {len(df)} (should be {n_rows_loaded})")

# 6) explode to (paper, one HLD) rows
exploded = df.explode("Unique_HLDs")
exploded = exploded.rename(columns={"Unique_HLDs": "HLD"})

# Rows that actually carry an HLD; shared by the per-HLD pivot and the tag totals.
tagged_rows = exploded.dropna(subset=["HLD"])

# 7) pivot: # papers with that HLD (unique HLD counts), conferences x HLDs
domain_counts = tagged_rows.groupby(["ConferenceRaw", "HLD"]).size().unstack(fill_value=0)

# Conferences that have papers but no tagged HLD would be absent from the pivot;
# give each of them an all-zero row, then put the index in sorted order.
paper_counts_series = df["ConferenceRaw"].value_counts().sort_index()
unseen_confs = [c for c in paper_counts_series.index if c not in domain_counts.index]
for conf in unseen_confs:
    domain_counts.loc[conf] = 0
domain_counts = domain_counts.reindex(sorted(domain_counts.index))

# 8) total number of (paper->HLD) tag assignments per conference
hld_tag_counts = tagged_rows.groupby("ConferenceRaw").size()

# 9) Papers column (exact paper counts from parsed ConferenceRaw)
paper_counts_aligned = (
    paper_counts_series
    .reindex(domain_counts.index)
    .fillna(0)
    .astype(int)
)
domain_counts["Papers"] = paper_counts_aligned

# AvgDom column (Avg #distinct HLDs per paper). A zero paper count is turned
# into NaN before dividing, and the resulting NaN is reported as 0.0.
tags_aligned = hld_tag_counts.reindex(domain_counts.index).fillna(0)
papers_or_nan = paper_counts_aligned.replace(0, np.nan)
avg_domains = (tags_aligned / papers_or_nan).round(2).fillna(0.0)
domain_counts["AvgDom"] = avg_domains

# 10) compute percentiles P50 and P90 per conference (on num_hlds distribution)
percentiles = []
for conf in sorted(paper_counts_series.index.tolist()):
    hld_counts = df.loc[df["ConferenceRaw"] == conf, "num_hlds"].to_numpy()
    # Start from NaN so a conference without rows still gets an entry.
    entry = {"Conference": conf, "P50": np.nan, "P90": np.nan}
    if hld_counts.size != 0:
        for label, q in (("P50", 50), ("P90", 90)):
            entry[label] = round(float(np.percentile(hld_counts, q)), 2)
    percentiles.append(entry)

# Index by conference and line the rows up with the count table.
pct_df = (
    pd.DataFrame(percentiles)
    .set_index("Conference")
    .reindex(domain_counts.index)
)

# 11) merge percentiles into domain_counts (on a copy; domain_counts is left as is)
out = domain_counts.copy()
for label in ("P50", "P90"):
    out[label] = pct_df[label]
out["Papers"] = out["Papers"].astype(int)
out["Papers"] = out["Papers"].astype(int)

# 12) Prepare final ordering: Conference, Papers, HLD columns (alphabetical), then the summary stats
out_reset = out.reset_index().rename(columns={"ConferenceRaw": "Conference", "index": "Conference"})
extras = {"Conference", "Papers", "AvgDom", "P50", "P90"}
hld_cols = sorted(c for c in out_reset.columns if c not in extras)
final_cols = ["Conference", "Papers"] + hld_cols + ["AvgDom", "P50", "P90"]
final_cols = [c for c in final_cols if c in out_reset.columns]
out1 = out_reset.loc[:, final_cols]


# 14) Append only the Total row (no "Avg per domain" row)
# Count columns (Papers and the per-HLD counts) are additive and are summed.
# AvgDom / P50 / P90 are NOT additive: adding per-conference averages or
# percentiles gives a number with no meaning, so the Total row reports them
# pooled over every paper that has a parsed conference.
summary_cols = ("AvgDom", "P50", "P90")
numeric_cols = [c for c in out1.columns if c != "Conference" and pd.api.types.is_numeric_dtype(out1[c])]
additive_cols = [c for c in numeric_cols if c not in summary_cols]
total_vals = out1[additive_cols].sum(numeric_only=True)
total_row = {
    c: (int(total_vals[c]) if pd.api.types.is_integer_dtype(out1[c]) else float(round(total_vals[c], 2)))
    for c in additive_cols
}
total_row["Conference"] = "Total"

# num_hlds of the same papers the per-conference rows are built from
pooled_hlds = df.loc[df["ConferenceRaw"].notna(), "num_hlds"].to_numpy()
if pooled_hlds.size:
    pooled_stats = {
        "AvgDom": round(float(pooled_hlds.mean()), 2),
        "P50": round(float(np.percentile(pooled_hlds, 50)), 2),
        "P90": round(float(np.percentile(pooled_hlds, 90)), 2),
    }
else:
    # Same conventions as the per-conference rows: 0.0 average, NaN percentiles.
    pooled_stats = {"AvgDom": 0.0, "P50": np.nan, "P90": np.nan}
for col in summary_cols:
    if col in out1.columns:
        total_row[col] = pooled_stats[col]

# ensure any missing final columns exist in total_row
for col in out1.columns:
    total_row.setdefault(col, "--")

append_df = pd.DataFrame([total_row], columns=out1.columns)
out_final = pd.concat([out1, append_df], ignore_index=True, sort=False)

# 15) save
out_final.to_csv(output_file1, index=False, float_format="%.2f")
written_files = [output_file1]

# NOTE(review): `nonsp_df` is not built anywhere in this cell (there is no step
# 13), so on a fresh kernel the unconditional export raised NameError after the
# first file had already been written. Export it only when it exists.
# TODO confirm where `nonsp_df` is supposed to come from and build it here.
if "nonsp_df" in globals():
    nonsp_df.to_csv(output_file2, index=False)
    written_files.append(output_file2)
else:
    print(f"Skipped {output_file2}: `nonsp_df` is not defined in this session")

print("Wrote files:\n" + "\n".join(f" - {path}" for path in written_files))

print("\n=== Preview (last 6 rows) ===")
print(out_final.tail(6).to_string(index=False))

