In [None]:
import os
import json
import pandas as pd
import re
import csv
from openai import OpenAI
from dotenv import load_dotenv

# Load OpenAI API Key
load_dotenv("api_key.env")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# File paths
CUSTOM_DATASETS_FILE = "custom_created_datasets_results.csv"
EXTRACTED_PAPERS_FILE = "extracted_test_papers_new_NEWEST.jsonl"
OUTPUT_CSV = "final_results_custom_created.csv"
OUTPUT_JSONL = "final_results_custom_created.jsonl"

# Load papers
def load_papers_from_jsonl(file_path):
    papers = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                papers.append(json.loads(line.strip()))
            except json.JSONDecodeError:
                continue
    return papers

# Normalize
def normalize_title(title):
    return re.sub(r'\s+', ' ', title.lower().strip())

def extract_title(json_str):
    try:
        data = json.loads(json_str)
        return data.get("title", "").strip()
    except:
        return json_str.strip()

# Load and normalize
papers = load_papers_from_jsonl(EXTRACTED_PAPERS_FILE)
custom_df = pd.read_csv(CUSTOM_DATASETS_FILE)
custom_df["title"] = custom_df["title"].apply(extract_title)
custom_titles = set(custom_df["title"].apply(normalize_title))

# Match papers
matched_papers = []
for paper in papers:
    norm_title = normalize_title(paper['title'])
    if norm_title in custom_titles:
        matched_papers.append(paper)

print(f"Matched {len(matched_papers)} papers with custom datasets.")

# Task list
tasks = [
    "dataset_creation_reason",
    "data_collection_method",
    "scale_of_data",
    "data_sources",
    "data_collection_purpose"
]


def generate_all_dataset_details_prompt(paper, dataset_row, task):
    title = paper['title']
    content = paper['content']
    dataset_name = dataset_row['dataset_name']
    category = dataset_row['category']
    subcategory = dataset_row['subcategory']
    availability = dataset_row['availability']
    labeling_type = dataset_row['labeling_type']
    dataset_type = dataset_row['dataset_type']

    # Dataset Creation Reason Extraction
    if task == "dataset_creation_reason":
        return f"""
        You are tasked with identifying the **reason for creating the dataset** in the cybersecurity paper titled "{title}".
        
        Dataset details:
        - Dataset Name: {dataset_name}
        - Category: {category}
        - Subcategory: {subcategory}
        - Availability: {availability}
        - Labeling Type: {labeling_type}
        - Dataset Type: {dataset_type}
        
        Guidelines:
        
        Important: If the paper has used both publicly available datasets and also created a new dataset (i.e., a custom-created dataset), you should focus 
        **only on the custom-created dataset** when answering this task.

        \t1. Identify why the dataset was created in the paper. Look for statements such as:
        \t   - No publicly available dataset
        \t   - Outdated datasets
        \t   - Limited coverage
        \t   - Need for labeled data
        \t   - Domain-specific constraints
        \t   - Need for large-scale data
        \t   - To detect a new type of fraud or attack
        \t   - Need for synthetic data
        \t   - Other reasons not mentioned in the list.
        
        \t2. Select the most applicable reason(s) for dataset creation from the options:
        \t   **(A) No publicly available dataset**  
        \t   **(B) Outdated datasets**  
        \t   **(C) Limited coverage**  
        \t   **(D) Need for labeled data**  
        \t   **(E) Domain-specific constraints**  
        \t   **(F) Need for large-scale data**  
        \t   **(G) To detect a new type of fraud or attack
        \t   **(H) Need for synthetic data**  
        \t   **(I) Other reason(s)**  

        \t3. These example are from our dataset, please check clearly and return the same output for these papers. Be vigilant, while processing these papers below: 
        
        - Example 1: In the paper "POISED: Spotting Twitter Spam Off the Beaten Paths", the authors created a custom dataset using the Twitter API and manually labeled tweet clusters to support their spam detection system. 
        The dataset was large-scale (1.3M tweets) and required labeling, but it was not shared publicly.
        "POISED: Spotting Twitter Spam Off the Beaten Paths" (dataset_creation_reason): {{ "dataset_creation_reason": ["D", "F"]}}

        - Example 2: In the paper "Dissecting Click Fraud Autonomy in the Wild", they created dataset to detect humanoid attacks, which are novel and not captured in prior datasets.
        "Dissecting Click Fraud Autonomy in the Wild" (dataset_creation_reason): {{ "dataset_creation_reason": ["F", "G"]}}

        -Example 3: In the paper "Deterrence of Intelligent DDoS via Multi-Hop Traffic Divergence". They collected a 49.8 TB dataset over 24 hours to evaluate EID's effectiveness at large scale and the dataset supports
        detecting intelligent, strategic DDoS attacks that evolve via adversarial machine learning,  a novel threat type the authors aim to deter.
        "Deterrence of Intelligent DDoS via Multi-Hop Traffic Divergence" (dataset_creation_reason): {{"dataset_creation_reason": ["A", "F", "G"]}}

        - Example 4: In the paper "Experimenting with Zero-Knowledge Proofs of Training", the authors generated a synthetic dataset of 262,144 records with 1,024 features (4GB total) to benchmark the cryptographic cost and performance of their zero-knowledge proof of training (zkPoT) protocol, since it's synthetically created so it will lie in **(H)** Category from above given list.
        "Experimenting with Zero-Knowledge Proofs of Training" (dataset_creation_reason): {{ "dataset_creation_reason": ["H", "F", "I"]}}

        - Example 5: In the paper "Scamdog Millionaire: Detecting E-commerce Scams in the Wild", the authors created a custom ground-truth dataset of 8,944 e-shop domains, consisting of 3,765 confirmed scam domains and 5,179 benign ones. The dataset was assembled using crowdsourced reviews, manual analyst verification, and telemetry data, addressing the lack of large-scale labeled datasets for scam detection. 
        "Scamdog Millionaire: Detecting E-commerce Scams in the Wild" (dataset_creation_reason): {{ "dataset_creation_reason": ["A", "D", "F"] }}

        - Example 6: In the paper "Detecting Weak Keys in Manufacturing Certificates: A Case Study", the authors custom created dataset, which consist of 226 million manufacturing certificates. It's a large scale dataset so it will lie in **(F)** Category from above given list.
        "Detecting Weak Keys in Manufacturing Certificates" (dataset_creation_reason): {{ "dataset_creation_reason": ["A", "F", "I"] }}

        - Example 7: In the paper "Five Years of the Right to be Forgotten", the authors created a large-scale dataset of over 3.2 million URL delisting requests submitted to Google between 2014 and 2019. The dataset was built to address the lack of publicly available data,
        enable large-scale transparency analysis, and support legal and policy research. The authors also manually labeled the data, categorizing requesters, content types, and decisions.
        "Five Years of the Right to be Forgotten" (dataset_creation_reason): {{"dataset_creation_reason": ["A", "D", "E", "F", "I"] }}


        \t4. Do NOT hallucinate any reasons. Select a reason ONLY IF there is **clear or strongly implied evidence** in the paper text.

        \t5. If no applicable reason is found from (A)–(H), and the paper still clearly mentions another rationale, return:
        
        {{
            "dataset_creation_reason": ["I"]
        }}

        \t6. If no reason is mentioned or implied at all, return:
        
        {{
            "dataset_creation_reason": "not mentioned"
        }}


        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your response must be returned in the following JSON format:
        {{
            "dataset_creation_reason": ["A", "B"]
        }}

        Your response:
        """

    # Task 2: Dataset Collection Method Extraction

    elif task == "data_collection_method":
        return f"""
        You are tasked with identifying the **method used to collect the dataset** in the cybersecurity paper titled "{title}".

        Dataset details:
        - Dataset Name: {dataset_name}
        - Category: {category}
        - Subcategory: {subcategory}
        - Availability: {availability}
        - Labeling Type: {labeling_type}
        - Dataset Type: {dataset_type}

        Guidelines:

        Important: If the paper has used both publicly available datasets and also created a new dataset (i.e., a custom-created dataset), you should focus 
        **only on the custom-created dataset** when answering this task.

        \t1. Identify the method used to collect the custom-created dataset, and select ONLY IF there's clear or strongly implied evidence in the paper. Choose from:
        \t   **DC1:** Web Scraping  
        \t   **DC2:** Network Traffic Logs  
        \t   **DC3:** Malware Repositories  
        \t   **DC4:** User-Contributed Data  
        \t   **DC5:** IoT / Embedded System Logs  
        \t   **DC6:** Manual Labeling  
        \t   **DC7:** Darknet / Underground Markets  
        \t   **DC8:** Government / Institutional Sources  
        \t   **DC9:** Simulated / Synthetic Data  
        \t   **DC10:** Enterprise Logs  
        \t   **DC11:** Mobile / App Data  
        \t   **DC12:** Cloud / Hosting Logs  
        \t   **DC13:** API-based Data Collection
        \t   **DC14:** Other
        \t   **DC15:** Binary Static & Dynamic Analysis Logs

         \t\t. Clarification:
         
         - IMPORTANT: If the paper says "crawled" or "crawling", **check if it's used with an API**. 
         - If it says “used Twitter API to crawl timelines” or “crawled using Play API,” assign **DC13 (API-based Collection)**.
         - Only assign **DC1 (Web Scraping)** if crawling is done directly from websites or app store HTML **without** using APIs.

        \t2. Examples for clarity:

        - Example 1: In the paper "POISED: Spotting Twitter Spam Off the Beaten Paths", the authors used the Twitter API to crawl user timelines. This is API-based collection.
        "POISED: Spotting Twitter Spam Off the Beaten Paths" (data_collection_method): {{ "data_collection_method": ["DC13"] }}

        - Example 2: In the paper "Dissecting Click Fraud Autonomy in the Wild", the authors crawled 10K apps from Google Play (without using an API). This is web scraping.
        "Dissecting Click Fraud Autonomy in the Wild" (data_collection_method): {{ "data_collection_method": ["DC1", "DC11"] }}

        - Example 3: In "The Effectiveness of Security Interventions on GitHub", the dataset was collected through GitHub’s REST API.
        "The Effectiveness of Security Interventions on GitHub" (data_collection_method): {{"data_collection_method": ["DC13"]}}

        - Example 4: In the paper "AFLGuard: Byzantine-robust Asynchronous Federated Learning", the authors constructed a synthetic dataset using a multivariate Gaussian distribution to
        simulate federated learning scenarios.
        "AFLGuard: Byzantine-robust Asynchronous Federated Learning" (data_collection_method): {{"data_collection_method": ["DC9"]}}

        
        \t2. Only choose DC8 or DC9 if the paper explicitly mentions **government/institutional datasets** or **synthetic/simulated generation**.

        \t3. Do not hallucinate a method. Only select from the list if the evidence in the paper clearly supports it. If no valid collection method is identifiable, return:

        {{
            "data_collection_method": ["DC14"]
        }}


        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your response must be returned in the following JSON format:
        {{
            "data_collection_method": ["DC1"]
        }}

        Your response:
        """

     # Task 3: Dataset Creation Purpose Extraction
    
    elif task == "data_collection_purpose":
         return f"""
         
         You are tasked with identifying the **purpose for which the dataset was used in the study** in the cybersecurity paper titled "{title}".

         Guidelines:

         Important: If the paper has used both publicly available datasets and also created a new dataset (i.e., a custom-created dataset), you should focus **only on the custom-created dataset** when answering this task.

         \t1. Identify the intended research use or experimental goal for the dataset. Look for statements such as:
         \t   - Needed a real-world dataset for intrusion detection
         \t   - Created to study real-world attack/fraud data
         \t   - Needed a labeled dataset for supervised learning
         \t   - Created to test adversarial security models
         \t   - Needed synthetic data for analysis
         \t   - To study message propagation in social networks
         \t   - To evaluate model robustness under adversarial machine learning attacks
         \t   - Training ML models for binary recovery
         \t   - Created to study real-world vulnerabilities in web applications.
         \t   - Created to study real-world vulnerabilities in Android applications.
         \t   - Created to study large-scale trends, user behaviors, or operational patterns in real-world systems
         \t   - Created to study real-world website fingerprinting attacks
         \t   - Created to study real-world vulnerabilities in manufacturing certificates
         \t   - Created to study real-world vulnerabilities in mobile authentication through shoulder surfing.
         \t   - Created to study zero-knowledge proof of training for logistic regression.
         \t   - Created to study real-world vulnerabilities in online collaboration services.
         \t   - Created to study real-world audio transmission behavior of IoT devices
         \t   - Other purposes specified in the paper
         
         \t2. Json Output Example:
         
         **Note: These examples are taken from our output before reading the content, chech below examples, since we already checked them manually so they are correct.**
         
         - Example 1: In the paper "POISED: Spotting Twitter Spam Off the Beaten Paths", the authors created a custom dataset using the Twitter API and manually labeled tweet clusters to support a spam detection model. The dataset was used to study message propagation
         across Twitter communities and evaluate adversarial robustness.
         'POISED: Spotting Twitter Spam Off the Beaten Paths' (data_collection_purpose): {{ "data_collection_purpose": "Created to study real-world attack data." }}
         
         - Example 2: In the paper "Debin: Predicting Debug Information in Stripped Binaries", the authors collected non-stripped ELF binaries to train machine learning models for binary recovery tasks.
         'Debin: Predicting Debug Information in Stripped Binaries' (data_collection_purpose): {{ "data_collection_purpose": "Training ML models for binary recovery." }}
         
         - Example 3: In the paper "Experimenting with Zero-Knowledge Proofs of Training", the authors have created a 'Synthetic dataset', which was generated for benchmarking/training purposes to test zero-knowledge proof of training protocol.
         'Experimenting with Zero-Knowledge Proofs of Training' (data_collection_purpose): {{ "data_collection_purpose": "Needed synthetic data for analysis" }}
         
         \t3. Do not hallucinate a purpose. Only select a purpose if it is **clearly stated or strongly implied** in the paper.
         If no valid purpose is identifiable, return:
         {{
            "data_collection_purpose": "not mentioned"
         }}
         If the purpose does not match any of the listed options, return:
        {{
            "data_collection_purpose": "Other purposes specified in the paper"
        }}
        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        
        Your response must be returned in the following JSON format:
        {{
            "data_collection_purpose": "Created to study real-world attack data."
        }}

        Your response:
        """


    # Task 4: Scale of Data Extraction
    elif task == "scale_of_data":
        return f"""
        You are tasked with identifying the **scale of the dataset** in the cybersecurity paper titled "{title}".
        
        Guidelines:
        \t1. Identify the scale of the dataset. Look for numbers, such as:
        \t   - Number of images, logs, or samples collected
        \t   - For example: "50,000 logs collected", "10,000 images captured"

        \t2. Below are some examples from our dataset, make sure to give same output of below mentioned papers, as they were manually verified. Be very careful

        
        Json Output Examples:
        
        - Example 1: In the paper "Latent Typing Biometrics in Online Collaboration Services", the authors collected approximately 10 typing instances per user, each containing 1,000–1,500 characters, across a total of 74 participants.

        "Latent Typing Biometrics in Online Collaboration Services" (scale_of_data): {{ "scale_of_data": "74 users × 10 samples each (1,000–1,500 characters per instance)" }}

        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your response must be returned in the following JSON format:
        {{
            "scale_of_data": "50,000 logs collected"
        }}

        Your response:
        """

    # Task 5: Data Sources Extraction
    elif task == "data_sources":
        return f"""
        You are tasked with identifying the **sources from which the data was collected** in the cybersecurity paper titled "{title}".
        
        Guidelines:
        
        Important: If the paper has used both publicly available datasets and also created a new dataset (i.e., a custom-created dataset), you should focus 
        **only on the custom-created dataset** when answering this task.
        
        \t1. Identify the sources from which the data was collected. Common sources include:
        \t   - Network logs
        \t   - GooglePlay
        \t   - Social media
        \t   - Malware repositories
        \t   - IoT devices
        \t   - University datasets
        \t   - Public datasets
        \t   - Alexa Top Sites
        \t   - Reddit
        \t   - Wikipedia
        \t   - Simulated statistical model
        \t   - Other sources mentioned in the paper
        

        \t2. Below are some examples from our dataset, make sure to give same output of below mentioned papers, as they were manually verified. Be very careful
        
        Json Output Examples: 
        
        **Note: These examples are taken from our output before reading the content, chech below examples, since we already checked them manually so they are correct.**
        
        - Example 1: In the paper "Dissecting Click Fraud Autonomy in the Wild", the data source was Google Play and Huawei AppGallery.

        "Dissecting Click Fraud Autonomy in the Wild" (data_sources): {{ "data_sources": ["Google Play", "Huawei AppGallery"] }}

        - Example 2: In the paper "AFLGuard: Byzantine-robust Asynchronous Federated Learning", the synthetic dataset was entirely generated by the authors using simulated statistical models (i.e., multivariate Gaussians with random parameters).

        "FLGuard: Byzantine-robust Asynchronous Federated Learning" (data_sources): {{ "data_sources": ["Simulated statistical model"] }}

        - Example 3: In the paper "Debin: Predicting Debug Information in Stripped Binaries", the dataset was collected from Linux debug symbol packages (e.g., coreutils, dpkg, gcc) available in public Linux distributions.

        "Debin: Predicting Debug Information in Stripped Binaries" (data_sources): {{ "data_sources": ["Linux debug symbol packages"] }}



         \t3. Do not hallucinate or guess the data sources. Select a source **only if it is explicitly stated or strongly implied** in the paper.

         \t4. If no valid source is found, return:
         
         {{
            "data_sources": "not mentioned"
         }}

         \t5. If a data source is mentioned but it doesn’t match any of the above categories, return:
         {{
            "data_sources": ["Other sources mentioned in the paper"]
         }}


        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your response must be returned in the following JSON format:
        {{
            "data_sources": ["Network logs", "University dataset"]
        }}

        Your response:
        """


    else:
        raise ValueError("Invalid task")



test

In [None]:
def process_custom_dataset_tasks(papers, custom_df, tasks):
    results = []

    for i, paper in enumerate(papers):
        norm_title = normalize_title(paper['title'])
        matched_datasets = custom_df[custom_df['title'].apply(normalize_title) == norm_title]

        for _, dataset_row in matched_datasets.iterrows():
            result_entry = {
                "title": paper['title'],
                "dataset_name": dataset_row['dataset_name']
            }

            for task in tasks:
                prompt = generate_all_dataset_details_prompt(paper, dataset_row, task)
                print(f" Processing {paper['title']} / {dataset_row['dataset_name']} [{task}]")

                try:
                    response = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=0.2,
                        max_tokens=1500
                    )
                    content = response.choices[0].message.content.strip()
                    print(" Response:", content)
                    try:
                        parsed_json = json.loads(content)
                        result_entry[task] = parsed_json.get(task, content)
                    except:
                        result_entry[task] = content
                except Exception as e:
                    print(f" Error with {paper['title']} [{task}]:", e)
                    result_entry[task] = f"error: {str(e)}"

            results.append(result_entry)

    return results

# Save

def save_results(results, csv_path, jsonl_path):
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "dataset_name"] + tasks)
        writer.writeheader()
        writer.writerows(results)

    with open(jsonl_path, "w", encoding="utf-8") as f:
        for row in results:
            f.write(json.dumps(row) + "\n")

# Run full pipeline
results = process_custom_dataset_tasks(matched_papers[:20], custom_df, tasks)
save_results(results, OUTPUT_CSV, OUTPUT_JSONL)

print(f" Saved {len(results)} records to {OUTPUT_CSV} and {OUTPUT_JSONL}")


In [None]:
def process_custom_dataset_tasks(papers, custom_df, tasks):
    results = []

    for i, paper in enumerate(papers):
        norm_title = normalize_title(paper['title'])
        matched_datasets = custom_df[custom_df['title'].apply(normalize_title) == norm_title]

        for _, dataset_row in matched_datasets.iterrows():
            result_entry = {
                "title": paper['title'],
                "dataset_name": dataset_row['dataset_name']
            }

            for task in tasks:
                prompt = generate_all_dataset_details_prompt(paper, dataset_row, task)
                print(f" Processing {paper['title']} / {dataset_row['dataset_name']} [{task}]")

                try:
                    response = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[{"role": "user", "content": prompt}],
                        temperature=0.2,
                        max_tokens=1500
                    )
                    content = response.choices[0].message.content.strip()
                    print(" Response:", content)
                    try:
                        parsed_json = json.loads(content)
                        result_entry[task] = parsed_json.get(task, content)
                    except:
                        result_entry[task] = content
                except Exception as e:
                    print(f" Error with {paper['title']} [{task}]:", e)
                    result_entry[task] = f"error: {str(e)}"

            results.append(result_entry)

    return results

# Save

def save_results(results, csv_path, jsonl_path):
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "dataset_name"] + tasks)
        writer.writeheader()
        writer.writerows(results)

    with open(jsonl_path, "w", encoding="utf-8") as f:
        for row in results:
            f.write(json.dumps(row) + "\n")

# Run full pipeline
results = process_custom_dataset_tasks(matched_papers, custom_df, tasks)
save_results(results, OUTPUT_CSV, OUTPUT_JSONL)

print(f" Saved {len(results)} records to {OUTPUT_CSV} and {OUTPUT_JSONL}")


In [30]:

import pandas as pd
import json
import ast
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

IN_FILE = "final_results_custom_created.csv"
OUT_DIR = Path("raw_counts")
OUT_DIR.mkdir(exist_ok=True)

# Load the CSV
df = pd.read_csv(IN_FILE, encoding="cp1252", on_bad_lines="skip")

# Fields you want to analyze
fields = [
    "dataset_creation_reason",
    "data_collection_method",
    "data_collection_purpose",
    "data_sources",
]

codefence_start = re.compile(r"^\s*```(?:json)?\s*$", re.IGNORECASE)
codefence_end = re.compile(r"^\s*```\s*$")

def strip_code_fences(s: str) -> str:
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return ""
    lines = str(s).splitlines()
    # remove leading ```json or ```
    if lines and codefence_start.match(lines[0]):
        lines = lines[1:]
    # remove trailing ```
    if lines and codefence_end.match(lines[-1]):
        lines = lines[:-1]
    s2 = "\n".join(lines).strip()
    s2 = s2.replace('""', '"')  # collapse doubled quotes from CSV
    return s2

# Normalize and safely parse messy entries
def safe_parse(value, key):
    if pd.isna(value):
        return []
    value = strip_code_fences(value).strip().strip("`").strip('"').strip("'")

    if value == "":
        return []

    try:
        parsed = json.loads(value)
        if isinstance(parsed, dict):
            if key in parsed:
                val = parsed.get(key)
            else:
                # fall back to first value if present
                val = next(iter(parsed.values())) if parsed else []
            if isinstance(val, list):
                return val
            if isinstance(val, str):
                return [val]
            return [val]
        elif isinstance(parsed, list):
            return parsed
        elif isinstance(parsed, str):
            return [parsed]
    except Exception:
        pass

    # Try as Python literal (e.g., ['A', 'B'])
    try:
        parsed = ast.literal_eval(value)
        if isinstance(parsed, list):
            return parsed
        elif isinstance(parsed, str):
            return [parsed]
    except Exception:
        pass

    # Fallback: comma-separated string
    if "," in value:
        return [v.strip() for v in value.split(",") if v.strip()]
    return [value]

field_counts = {}
for field in fields:
    counter = Counter()
    # If the column is missing, skip gracefully
    if field not in df.columns:
        field_counts[field] = counter
        continue
    for val in df[field]:
        items = safe_parse(val, field)
        for item in items:
            norm = str(item).strip()
            if norm:
                counter[norm] += 1
    field_counts[field] = counter

# Plot + Save results
def plot_and_save_counts(field, counter, top_n=20):
    items = counter.most_common(top_n)
    if not items:
        print(f"[{field}] No data found.")
        return

    labels, counts = zip(*items)
    # Save counts to CSV
    out_csv = OUT_DIR / f"{field}_counts.csv"
    pd.DataFrame({"Label": labels, "Frequency": counts}).to_csv(out_csv, index=False)
    print(f"Saved counts → {out_csv}")

    # Plot to PNG
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(counts), y=list(labels), palette="Blues_d")
    plt.title(f"Top {top_n} Labels in {field}", fontsize=14, fontweight="bold")
    plt.xlabel("Frequency")
    plt.ylabel("Label")
    plt.tight_layout()
    out_png = OUT_DIR / f"{field}_top_labels.png"
    plt.savefig(out_png, dpi=300)
    plt.close()
    print(f"Saved plot   → {out_png}")

# Process each field
for field in fields:
    print(f"\n{field}")
    top10 = field_counts[field].most_common(10)
    for label, count in top10:
        print(f"{label}: {count}")
    plot_and_save_counts(field, field_counts[field], top_n=20)

# Also save a combined summary CSV (optional)
summary_rows = []
for field, counter in field_counts.items():
    for label, freq in counter.items():
        summary_rows.append({"field": field, "label": label, "frequency": freq})

if summary_rows:
    combined = pd.DataFrame(summary_rows).sort_values(["field", "frequency"], ascending=[True, False])
    combined_path = OUT_DIR / "all_fields_counts.csv"
    combined.to_csv(combined_path, index=False)
    print(f"\nSaved combined counts → {combined_path}")



dataset_creation_reason
A: 47
F: 32
D: 26
G: 21
E: 19
C: 12
I: 6
H: 2
Saved counts → raw_counts\dataset_creation_reason_counts.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(counts), y=list(labels), palette="Blues_d")


Saved plot   → raw_counts\dataset_creation_reason_top_labels.png

data_collection_method
DC13: 8
DC1: 7
DC15: 7
DC8: 7
DC5: 6
DC2: 5
DC10: 5
DC6: 5
DC4: 5
DC3: 4
Saved counts → raw_counts\data_collection_method_counts.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(counts), y=list(labels), palette="Blues_d")


Saved plot   → raw_counts\data_collection_method_top_labels.png

data_collection_purpose
Created to study real-world attack data.: 22
Created to study real-world audio transmission behavior of IoT devices.: 5
Training ML models for binary recovery.: 3
Created to study real-world vulnerabilities in web applications: 2
Created to study real-world vulnerabilities in e-commerce scams.: 2
Created to study real-world vulnerabilities in Android applications.: 2
Created to study real-world attack/fraud data: 2
Created to study real-world website fingerprinting attacks.: 2
Other purposes specified in the paper: 2
Created to study real-world vulnerabilities in access control policies.: 2
Saved counts → raw_counts\data_collection_purpose_counts.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(counts), y=list(labels), palette="Blues_d")


Saved plot   → raw_counts\data_collection_purpose_top_labels.png

data_sources
Network logs: 6
Iot Devices: 6
Access logs from real-world systems: 4
Ransomware binaries: 4
Victim telemetry: 4
Synthetic victims: 4
University dataset: 3
not mentioned: 3
Google Play: 2
Simulated statistical model: 2
Saved counts → raw_counts\data_sources_counts.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(counts), y=list(labels), palette="Blues_d")


Saved plot   → raw_counts\data_sources_top_labels.png

Saved combined counts → raw_counts\all_fields_counts.csv
