In [None]:
import arxiv
import random
import time
import csv
from datetime import datetime

In [None]:
#Physics

def _shared_arxiv_client():
    """Return the one arxiv.Client instance reused by every search.

    The client is created lazily on first use and cached on this function,
    so successive searches go through the same client (and therefore the
    same request-pacing state) instead of a brand-new one per call.
    """
    client = getattr(_shared_arxiv_client, "_client", None)
    if client is None:
        client = arxiv.Client()
        _shared_arxiv_client._client = client
    return client


def search_arxiv_in_category(category, total_results):
    """Query arXiv for papers filed under a single category.

    Args:
        category: arXiv category identifier; interpolated into a ``cat:`` query.
        total_results: Upper bound on results, passed as ``max_results``.

    Returns:
        The iterator returned by ``arxiv.Client.results`` for the search.
        Callers that need a list must materialise it themselves.
    """
    search = arxiv.Search(query=f"cat:{category}", max_results=total_results)
    # Reuse one client: a fresh Client per call starts with no request
    # history, so its built-in delay between API requests never kicks in.
    return _shared_arxiv_client().results(search)

def write_paper_info_to_csv(file_path, paper, category, processed_papers):
    """Append *paper* as one CSV row unless it is too short or already recorded.

    Args:
        file_path: CSV file opened in append mode; a header row is emitted
            first when the file is empty.
        paper: Object exposing ``summary``, ``title``, ``pdf_url`` and
            ``authors`` (each author exposing ``name``).
        category: Value stored in the ``Category`` column.
        processed_papers: Set of PDF URLs already written; updated in place.

    Papers whose abstract has fewer than 50 whitespace-separated words are
    reported on stdout and never touch the file.
    """
    columns = ['Category', 'Title', 'Authors', 'Abstract', 'URL']

    # Guard clause: short abstracts are rejected before the file is opened.
    if len(paper.summary.split()) < 50:
        print(f"Skipped paper with insufficient abstract length: {paper.title}")
        return

    with open(file_path, 'a', newline='', encoding='utf-8') as out_file:
        row_writer = csv.DictWriter(out_file, fieldnames=columns)

        # Position 0 right after opening in append mode means an empty file.
        if not out_file.tell():
            row_writer.writeheader()

        author_names = ', '.join(author.name for author in paper.authors)
        row = dict(zip(
            columns,
            (category, paper.title, author_names, paper.summary, paper.pdf_url),
        ))

        # The PDF URL doubles as the de-duplication key.
        url = row['URL']
        print(f"Processing paper with URL: {url}")

        if url in processed_papers:
            return
        processed_papers.add(url)
        row_writer.writerow(row)

def print_paper_info(paper, category):
    """Print a paper's category, title, authors, abstract and PDF URL.

    Each field goes on its own labelled line, followed by a separator line
    of 50 dashes. ``paper.authors`` is expected to hold objects with a
    ``name`` attribute; the names are shown comma-separated.
    """
    author_names = ', '.join(author.name for author in paper.authors)
    report_lines = (
        f"Category: {category}",
        f"Title: {paper.title}",
        f"Authors: {author_names}",
        f"Abstract: {paper.summary}",
        f"URL: {paper.pdf_url}",
        "-" * 50,
    )
    for line in report_lines:
        print(line)

# List of Physics-related arXiv category identifiers.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which would produce a bogus category.
# Entries are unique so that a uniform random pick weights each category equally.
physics_categories = [
    'physics.gen-ph',   # General Physics
    'astro-ph',         # Astrophysics
    'astro-ph.CO',      # Cosmology and Non-Galactic Astrophysics
    'astro-ph.EP',      # Earth and Planetary Astrophysics
    'astro-ph.GA',      # Astrophysics of Galaxies
    'astro-ph.HE',      # High Energy Astrophysical Phenomena
    'astro-ph.IM',      # Instrumentation and Methods for Astrophysics
    'astro-ph.SR',      # Solar and Stellar Astrophysics
    'cond-mat',         # Condensed Matter
    'cond-mat.dis-nn',  # Disordered Systems and Neural Networks
    'cond-mat.mtrl-sci',# Materials Science
    'cond-mat.mes-hall',# Mesoscale and Nanoscale Physics
    'cond-mat.other',   # Other Condensed Matter
    'cond-mat.quant-gas',# Quantum Gases
    'cond-mat.soft',    # Soft Condensed Matter
    'cond-mat.stat-mech',# Statistical Mechanics
    'cond-mat.str-el',  # Strongly Correlated Electrons
    'cond-mat.supr-con',# Superconductivity
    'gr-qc',            # General Relativity and Quantum Cosmology
    'hep-ex',           # High Energy Physics - Experiment
    'hep-lat',          # High Energy Physics - Lattice
    'hep-ph',           # High Energy Physics - Phenomenology
    'hep-th',           # High Energy Physics - Theory
    'math-ph',          # Mathematical Physics
    'nlin',             # Nonlinear Sciences
    'nlin.AO',          # Adaptation and Self-Organizing Systems
    'nlin.CG',          # Cellular Automata and Lattice Gases
    'nlin.CD',          # Chaotic Dynamics
    'nlin.SI',          # Exactly Solvable and Integrable Systems
    'nucl-ex',          # Nuclear Experiment
    'nucl-th',          # Nuclear Theory
    'physics.ed-ph',    # Physics Education
    'physics.soc-ph',   # Physics and Society
    'quant-ph',         # Quantum Physics
    'physics.app-ph',    # Applied Physics
    'physics.ao-ph',     # Atmospheric and Oceanic Physics
    'physics.atom-ph',   # Atomic Physics
    'physics.atm-clus',  # Atomic and Molecular Clusters
    'physics.bio-ph',    # Biological Physics
    'physics.chem-ph',   # Chemical Physics
    'physics.class-ph',  # Classical Physics
    'physics.comp-ph',   # Computational Physics
    'physics.data-an',   # Data Analysis, Statistics and Probability
    'physics.flu-dyn',   # Fluid Dynamics
    'physics.geo-ph',    # Geophysics
    'physics.hist-ph',   # History and Philosophy of Physics
    'physics.ins-det',   # Instrumentation and Detectors
    'physics.med-ph',    # Medical Physics
    'physics.optics',    # Optics
    'physics.plasm-ph',  # Plasma Physics
    'physics.pop-ph',    # Popular Physics
    'physics.space-ph',  # Space Physics
    'math.MP',           # Mathematical Physics (within Mathematics)
    'cs.CE',             # Computational Engineering (within Computer Science)
    'cs.SY',             # Systems and Control (within Computer Science)
    'eess.SP',           # Signal Processing (within Electrical Engineering and Systems Science)
    'eess.IV',           # Image and Video Processing (within Electrical Engineering and Systems Science)
    'eess.AS',           # Audio and Speech Processing (within Electrical Engineering and Systems Science)
]


# Set to store processed paper information (URL)
processed_papers = set()

# Create a single CSV file for all papers
output_csv_file = 'physics_papers.csv'

# Set the target number of unique papers
target_unique_papers = 395

# Initialize the count of unique papers written
print(len(processed_papers))

# Categories that can still yield an untried paper. Deduplicated (order kept)
# so every category is equally likely to be drawn.
remaining_categories = list(dict.fromkeys(physics_categories))

# Papers fetched per category and not yet tried. Each category is downloaded
# once instead of re-requesting the same 100 results for every single draw.
untried_papers_by_category = {}

while len(processed_papers) < target_unique_papers and remaining_categories:
    category = random.choice(remaining_categories)
    print(f"\nSearching in category: {category}\n")

    # Hit the API only the first time a category is drawn
    if category not in untried_papers_by_category:
        untried_papers_by_category[category] = list(
            search_arxiv_in_category(category, total_results=100)
        )
    papers_list = untried_papers_by_category[category]

    # Nothing (left) to draw from this category: retire it so the loop
    # cannot spin forever on empty or exhausted categories
    if not papers_list:
        print(f"No papers found in category: {category}")
        remaining_categories.remove(category)
        continue

    # Draw one random paper without replacement and hand it to the CSV writer,
    # which skips short abstracts and URLs that were already recorded
    random_paper = papers_list.pop(random.randrange(len(papers_list)))
    write_paper_info_to_csv(output_csv_file, random_paper, category, processed_papers)

    # Update the count of unique papers written
    print(len(processed_papers))

if len(processed_papers) < target_unique_papers:
    print(f"\nAll categories exhausted before reaching the target of {target_unique_papers} papers")

print(f"\nTotal unique papers written to {output_csv_file}: {len(processed_papers)}")

In [None]:
#Medicine

def _shared_arxiv_client():
    """Return the one arxiv.Client instance reused by every search.

    The client is created lazily on first use and cached on this function,
    so successive searches go through the same client (and therefore the
    same request-pacing state) instead of a brand-new one per call.
    """
    client = getattr(_shared_arxiv_client, "_client", None)
    if client is None:
        client = arxiv.Client()
        _shared_arxiv_client._client = client
    return client


def search_arxiv_by_keywords(keywords, total_results):
    """Query arXiv for papers matching any of the given keywords.

    Args:
        keywords: Iterable of keyword strings; they are OR-ed together.
        total_results: Upper bound on results, passed as ``max_results``.

    Returns:
        The iterator returned by ``arxiv.Client.results`` for the search.
        Callers that need a list must materialise it themselves.
    """
    # Wrap multi-word keywords in double quotes so each one is sent as a
    # single phrase; unquoted, "infectious disease OR public health" would
    # be parsed as loose words around the OR instead of two phrases.
    terms = [
        f'"{keyword}"' if ' ' in keyword and '"' not in keyword else keyword
        for keyword in keywords
    ]
    query = ' OR '.join(terms)
    search = arxiv.Search(query=query, max_results=total_results)
    # Reuse one client: a fresh Client per call starts with no request
    # history, so its built-in delay between API requests never kicks in.
    return _shared_arxiv_client().results(search)

def write_paper_info_to_csv(file_path, paper, category, processed_papers):
    """Append *paper* as one CSV row unless it is too short or already recorded.

    Args:
        file_path: CSV file opened in append mode; a header row is emitted
            first when the file is empty.
        paper: Object exposing ``summary``, ``title``, ``pdf_url`` and
            ``authors`` (each author exposing ``name``).
        category: Value stored in the ``Category`` column.
        processed_papers: Set of PDF URLs already written; updated in place.

    Papers whose abstract has fewer than 50 whitespace-separated words are
    reported on stdout and never touch the file.
    """
    columns = ['Category', 'Title', 'Authors', 'Abstract', 'URL']

    # Guard clause: short abstracts are rejected before the file is opened.
    if len(paper.summary.split()) < 50:
        print(f"Skipped paper with insufficient abstract length: {paper.title}")
        return

    with open(file_path, 'a', newline='', encoding='utf-8') as out_file:
        row_writer = csv.DictWriter(out_file, fieldnames=columns)

        # Position 0 right after opening in append mode means an empty file.
        if not out_file.tell():
            row_writer.writeheader()

        author_names = ', '.join(author.name for author in paper.authors)
        row = dict(zip(
            columns,
            (category, paper.title, author_names, paper.summary, paper.pdf_url),
        ))

        # The PDF URL doubles as the de-duplication key.
        url = row['URL']
        print(f"Processing paper with URL: {url}")

        if url in processed_papers:
            return
        processed_papers.add(url)
        row_writer.writerow(row)

def print_paper_info(paper, category):
    """Print a paper's category, title, authors, abstract and PDF URL.

    Each field goes on its own labelled line, followed by a separator line
    of 50 dashes. ``paper.authors`` is expected to hold objects with a
    ``name`` attribute; the names are shown comma-separated.
    """
    author_names = ', '.join(author.name for author in paper.authors)
    report_lines = (
        f"Category: {category}",
        f"Title: {paper.title}",
        f"Authors: {author_names}",
        f"Abstract: {paper.summary}",
        f"URL: {paper.pdf_url}",
        "-" * 50,
    )
    for line in report_lines:
        print(line)
    
# List of Medicine-related keywords.
# Entries are unique: a repeated keyword would be over-represented when
# sampling and could be drawn twice into the same query.
medicine_keywords = [
    'medicine',
    'medical',
    'health',
    'clinical',
    'pharmacology',
    'disease',
    'surgery',
    'neurology',
    'oncology',
    'pathology',
    'cardiology',
    'dermatology',
    'endocrinology',
    'gastroenterology',
    'hematology',
    'hepatology',
    'immunology',
    'infectious disease',
    'nephrology',
    'obstetrics',
    'gynecology',
    'ophthalmology',
    'orthopedics',
    'otolaryngology',
    'pediatrics',
    'psychiatry',
    'pulmonology',
    'radiology',
    'rheumatology',
    'urology',
    'anesthesiology',
    'epidemiology',
    'geriatrics',
    'palliative care',
    'rehabilitation',
    'preventive medicine',
    'public health',
    'nutrition',
    'diabetes',
    'cancer',
    'hypertension',
    'stroke',
    'alzheimer',
    'parkinson',
    'asthma',
    'HIV',
    'tuberculosis',
    'malaria',
    'vaccination',
    'genetics',
    'genome',
    'biomedical',
    'bioinformatics',
    'nanomedicine',
    'robotic surgery',
    'telemedicine',
    'e-health',
    'mental health',
    'psychology',
    'neurosurgery',
    'cardiac surgery',
    'transplantation',
    'critical care',
    'emergency medicine',
    'sports medicine',
    'pain management',
    'allergology',
    'plastic surgery',
    'fertility',
    'genomics',
    'proteomics',
    'cell therapy',
    'stem cells',
    'bioengineering',
    'biotechnology',
    'medical imaging',
    'MRI',
    'CT scan',
    'ultrasound',
    'radiation therapy',
    'chemotherapy',
    'immunotherapy',
    'clinical trials',
    'patient care',
    'healthcare policy',
    'medical ethics',
    'virology',
    'bacteriology',
    'parasitology',
    'mycology',
]

# URLs of every paper written so far; its size is the progress counter
processed_papers = set()

# All medicine papers are appended to this one CSV file
output_csv_file = 'medicine_papers.csv'

# Keep sampling until this many distinct papers have been recorded
target_unique_papers = 395

# Show the starting count
print(len(processed_papers))

while len(processed_papers) < target_unique_papers:
    # Every round queries arXiv with a fresh random pair of keywords
    keywords = random.sample(medicine_keywords, k=2)
    print(f"\nSearching with keywords: {keywords}\n")

    # Run the search (capped at 100 results) and materialise the iterator
    papers_by_keywords = search_arxiv_by_keywords(keywords, total_results=100)
    papers_list = list(papers_by_keywords)

    if papers_list:
        # Hand one randomly chosen hit to the CSV writer, labelled with the
        # keyword pair that produced it
        random_paper = random.choice(papers_list)
        write_paper_info_to_csv(output_csv_file, random_paper, ', '.join(keywords), processed_papers)

        # Report progress after each attempt
        print(len(processed_papers))
    else:
        print(f"No papers found for keywords: {keywords}")

print(f"\nTotal unique papers written to {output_csv_file}: {len(processed_papers)}")


In [None]:
#Cyber-Security

def _shared_arxiv_client():
    """Return the one arxiv.Client instance reused by every search.

    The client is created lazily on first use and cached on this function,
    so successive searches go through the same client (and therefore the
    same request-pacing state) instead of a brand-new one per call.
    """
    client = getattr(_shared_arxiv_client, "_client", None)
    if client is None:
        client = arxiv.Client()
        _shared_arxiv_client._client = client
    return client


def search_arxiv_by_keywords(keywords, total_results):
    """Query arXiv for papers matching any of the given keywords.

    Args:
        keywords: Iterable of keyword strings; they are OR-ed together.
        total_results: Upper bound on results, passed as ``max_results``.

    Returns:
        The iterator returned by ``arxiv.Client.results`` for the search.
        Callers that need a list must materialise it themselves.
    """
    # Wrap multi-word keywords in double quotes so each one is sent as a
    # single phrase; unquoted, "data breach OR cyber law" would be parsed
    # as loose words around the OR instead of two phrases.
    terms = [
        f'"{keyword}"' if ' ' in keyword and '"' not in keyword else keyword
        for keyword in keywords
    ]
    query = ' OR '.join(terms)
    search = arxiv.Search(query=query, max_results=total_results)
    # Reuse one client: a fresh Client per call starts with no request
    # history, so its built-in delay between API requests never kicks in.
    return _shared_arxiv_client().results(search)

def write_paper_info_to_csv(file_path, paper, category, processed_papers):
    """Append *paper* as one CSV row unless it is too short or already recorded.

    Args:
        file_path: CSV file opened in append mode; a header row is emitted
            first when the file is empty.
        paper: Object exposing ``summary``, ``title``, ``pdf_url`` and
            ``authors`` (each author exposing ``name``).
        category: Value stored in the ``Category`` column.
        processed_papers: Set of PDF URLs already written; updated in place.

    Papers whose abstract has fewer than 50 whitespace-separated words are
    reported on stdout and never touch the file.
    """
    columns = ['Category', 'Title', 'Authors', 'Abstract', 'URL']

    # Guard clause: short abstracts are rejected before the file is opened.
    if len(paper.summary.split()) < 50:
        print(f"Skipped paper with insufficient abstract length: {paper.title}")
        return

    with open(file_path, 'a', newline='', encoding='utf-8') as out_file:
        row_writer = csv.DictWriter(out_file, fieldnames=columns)

        # Position 0 right after opening in append mode means an empty file.
        if not out_file.tell():
            row_writer.writeheader()

        author_names = ', '.join(author.name for author in paper.authors)
        row = dict(zip(
            columns,
            (category, paper.title, author_names, paper.summary, paper.pdf_url),
        ))

        # The PDF URL doubles as the de-duplication key.
        url = row['URL']
        print(f"Processing paper with URL: {url}")

        if url in processed_papers:
            return
        processed_papers.add(url)
        row_writer.writerow(row)

def print_paper_info(paper, category):
    """Print a paper's category, title, authors, abstract and PDF URL.

    Each field goes on its own labelled line, followed by a separator line
    of 50 dashes. ``paper.authors`` is expected to hold objects with a
    ``name`` attribute; the names are shown comma-separated.
    """
    author_names = ', '.join(author.name for author in paper.authors)
    report_lines = (
        f"Category: {category}",
        f"Title: {paper.title}",
        f"Authors: {author_names}",
        f"Abstract: {paper.summary}",
        f"URL: {paper.pdf_url}",
        "-" * 50,
    )
    for line in report_lines:
        print(line)
    
# List of cybersecurity-related keywords.
# The driver loop below draws random subsets of these to build arXiv queries;
# many entries are multi-word phrases.
cybersecurity_keywords = [
    'cybersecurity',
    'information security',
    'network security',
    'computer security',
    'cyber attack',
    'malware',
    'ransomware',
    'phishing',
    'spear phishing',
    'DDoS',
    'denial of service',
    'data breach',
    'encryption',
    'cryptography',
    'firewall',
    'intrusion detection',
    'intrusion prevention',
    'security policy',
    'risk management',
    'vulnerability analysis',
    'penetration testing',
    'ethical hacking',
    'zero-day',
    'zero-day exploit',
    'cyber espionage',
    'cyber warfare',
    'social engineering',
    'identity theft',
    'security audit',
    'compliance',
    'forensics',
    'cyber forensics',
    'incident response',
    'threat intelligence',
    'endpoint security',
    'antivirus',
    'malware analysis',
    'cybercrime',
    'botnet',
    'VPN',
    'virtual private network',
    'blockchain',
    'IoT security',
    'internet of things',
    'cloud security',
    'mobile security',
    'biometric security',
    'two-factor authentication',
    'multi-factor authentication',
    'cyber law',
    'privacy',
    'data protection',
    'GDPR',
    'CCPA',
    'ISO 27001',
    'NIST framework',
    'cybersecurity awareness',
    'SIEM',
    'security information and event management',
    'artificial intelligence in security',
    'machine learning in security',
    'cybersecurity training',
    'cyber insurance',
    'APT',
    'advanced persistent threat',
    'security operations center',
    'SOC',
    'security architecture',
    'cyber resilience',
    'critical infrastructure security',
    'SCADA security',
    'industrial control systems security',
    'quantum cryptography',
    'quantum computing and security',
    'cybersecurity policy',
    'cybersecurity regulation',
    'ethical issues in cybersecurity',
    'cybersecurity and ethics',
    'cyber deterrence',
    'cybersecurity standards',
    'cybersecurity best practices',
    'cybersecurity governance',
    'cybersecurity strategy',
    'cyber threat landscape',
    'cybersecurity metrics',
    'cybersecurity frameworks',
    'cyber risk assessment',
    'cybersecurity culture',
    'cybersecurity in healthcare',
    'cybersecurity in finance',
    'cybersecurity in government',
    'cybersecurity in education',
    'cybersecurity in business',
    'cybersecurity in critical sectors',
    'cybersecurity challenges',
    'emerging cybersecurity technologies',
    'cybersecurity innovation',
    '5G security',
    'wireless security',
    'network defense',
    'cybersecurity and COVID-19',
    'remote work security',
    'cybersecurity and remote work',
    'cybersecurity in the post-COVID era',
    'cybersecurity and globalization',
    'cybersecurity in developing countries',
    'cybersecurity and international relations',
    'cybersecurity and geopolitics',
    'cybersecurity in elections',
    'cybersecurity and democracy',
    'cybersecurity legislation',
    'cybersecurity law and policy',
    'cybersecurity and human rights',
    'cybersecurity ethics and law',
]

# URLs of every paper written so far; its size is the progress counter
processed_papers = set()

# All cybersecurity papers are appended to this one CSV file
output_csv_file = 'cybersecurity_papers.csv'

# Keep sampling until this many distinct papers have been recorded
target_unique_papers = 395

# Show the starting count
print(len(processed_papers))

while len(processed_papers) < target_unique_papers:
    # Every round queries arXiv with a fresh random set of four keywords
    keywords = random.sample(cybersecurity_keywords, k=4)
    print(f"\nSearching with keywords: {keywords}\n")

    # Run the search (capped at 100 results) and materialise the iterator
    papers_by_keywords = search_arxiv_by_keywords(keywords, total_results=100)
    papers_list = list(papers_by_keywords)

    if papers_list:
        # Hand one randomly chosen hit to the CSV writer, labelled with the
        # keywords that produced it
        random_paper = random.choice(papers_list)
        write_paper_info_to_csv(output_csv_file, random_paper, ', '.join(keywords), processed_papers)

        # Report progress after each attempt
        print(len(processed_papers))
    else:
        print(f"No papers found for keywords: {keywords}")

print(f"\nTotal unique papers written to {output_csv_file}: {len(processed_papers)}")
