### Step-1: Install required packages
This command installs the feedparser (a library for parsing RSS and Atom feeds) and beautifulsoup4 (a library for parsing HTML and XML documents) packages.

In [None]:
pip install feedparser
pip install beautifulsoup4


### Step 2: Import Required Libraries
These libraries are essential for making HTTP requests, parsing HTML content, and handling file operations.

In [5]:
import requests
import time
import json
import os
from bs4 import BeautifulSoup

### Step 3: Define Function to Fetch Articles from Crossref API
This function fetches articles from the Crossref API based on the specified query and total number of articles. It extracts relevant information such as title, authors, published date, citation count, and abstract from the fetched articles, stores the data in a list, and saves it to a JSON file. After fetching articles for each query, a manifest file is generated that maps each query to its corresponding JSON filename. This manifest provides a clear and easily accessible reference for later evaluation purposes

In [4]:
# Initialize a global counter for unique IDs
global_article_id = 0

def fetch_crossref_articles1(query, total=300, data_folder_path='data', max_retries=5):
    global global_article_id  # Refer to the global variable for article IDs

    base_url = "https://api.crossref.org/works"
    rows_per_request = 300
    num_requests = total // rows_per_request + (1 if total % rows_per_request > 0 else 0)
    
    articles_data = []
    article_ids = []  # Store IDs of fetched articles
    retries = 0

    while not articles_data and retries < max_retries:
        for i in range(num_requests):
            params = {
                "query": query,
                "rows": rows_per_request,
                "offset": i * rows_per_request,
                # Filter to include both journal articles and conference papers
                "filter": "type:journal-article,type:proceedings-article,from-pub-date:2023-01-01",
            }
            response = requests.get(base_url, params=params)
            if response.status_code == 200:
                data = response.json()
                articles = data['message']['items']
                for article in articles:
                    global_article_id += 1

                    title = article.get('title', ['No Title'])[0]
                    authors = ', '.join([f"{author.get('given', '')} {author.get('family', '')}" for author in article.get('author', [])])
                    link = article.get('URL', 'No URL')
                    published = article.get('published-print') or article.get('published-online')
                    published_date = 'No Date'
                    if published:
                        date_parts = published.get('date-parts', [[0]])[0]
                        published_date = '-'.join(str(part) for part in date_parts)
                    citation_count = article.get('is-referenced-by-count', 0)
                    abstract_html = article.get('abstract', '')
                    abstract = 'No Abstract or Keywords available'
                    if abstract_html:
                        soup = BeautifulSoup(abstract_html, 'html.parser')
                        abstract = soup.get_text()
                    keywords = article.get('keywords', [])
                    keywords_str = ', '.join(keywords)
                    articles_data.append({
                        "id": global_article_id,
                        "title": title,
                        "authors": authors,
                        "published": published_date,
                        "citations": citation_count,
                        "abstract": abstract,
                        "keywords": keywords_str,
                        "link": link
                    })
                    article_ids.append(global_article_id)
            else:
                print(f"Failed to fetch data for query: {query}, retrying... ({retries+1}/{max_retries})")
            time.sleep(1)  # Respectful delay between retries

        retries += 1  # Increment retries count after each attempt

    # Save the articles data and return article IDs as well
    if articles_data:
        # Create the directory if it doesn't exist
        os.makedirs(data_folder_path, exist_ok=True)
        # Define the file path for storing the data
        file_name = f'articles_data_{query.replace(" ", "_")}.json'
        file_path = os.path.join(data_folder_path, file_name)

        # Save the articles data to the JSON file
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(articles_data, file, ensure_ascii=False, indent=4)

        print(f"Total number of articles fetched and stored for '{query}': {len(articles_data)}")
        print("=" * 80)

        return article_ids
    else:
        return None



    global global_article_id  # Refer to the global variable for article IDs

    base_url = "https://api.crossref.org/works"
    rows_per_request = 50
    num_requests = total // rows_per_request + (1 if total % rows_per_request > 0 else 0)
    
    articles_data = []
    article_ids = []  # Store IDs of fetched articles
    retries = 0
    articles_fetched = 0  # Track the total number of articles fetched

    while articles_fetched < total and retries < max_retries:
        for i in range(num_requests):
            params = {
                "query": query,
                "rows": min(rows_per_request, total - articles_fetched),  # Adjust rows for the last request
                "offset": i * rows_per_request,
                # Filter to include both journal articles and conference papers
                "filter": "type:journal-article,type:proceedings-article,from-pub-date:2023-01-01",
            }
            response = requests.get(base_url, params=params)
            if response.status_code == 200:
                data = response.json()
                articles = data['message']['items']
                for article in articles:
                    global_article_id += 1
                    articles_fetched += 1  # Increment the count of fetched articles

                    title = article.get('title', ['No Title'])[0]
                    authors = ', '.join([f"{author.get('given', '')} {author.get('family', '')}" for author in article.get('author', [])])
                    link = article.get('URL', 'No URL')
                    published = article.get('published-print') or article.get('published-online')
                    published_date = 'No Date'
                    if published:
                        date_parts = published.get('date-parts', [[0]])[0]
                        published_date = '-'.join(str(part) for part in date_parts)
                    citation_count = article.get('is-referenced-by-count', 0)
                    abstract_html = article.get('abstract', '')
                    abstract = 'No Abstract or Keywords available'
                    if abstract_html:
                        soup = BeautifulSoup(abstract_html, 'html.parser')
                        abstract = soup.get_text()
                    keywords = article.get('keywords', [])
                    keywords_str = ', '.join(keywords)
                    articles_data.append({
                        "id": global_article_id,
                        "title": title,
                        "authors": authors,
                        "published": published_date,
                        "citations": citation_count,
                        "abstract": abstract,
                        "keywords": keywords_str,
                        "link": link
                    })
                    article_ids.append(global_article_id)
            else:
                print(f"Failed to fetch data for query: {query}, retrying... ({retries+1}/{max_retries})")
            time.sleep(1)  # Respectful delay between retries

        retries += 1  # Increment retries count after each attempt

    # Save the articles data and return article IDs as well
    if articles_data:
        # Create the directory if it doesn't exist
        os.makedirs(data_folder_path, exist_ok=True)
        # Define the file path for storing the data
        file_name = f'articles_data_{query.replace(" ", "_")}.json'
        file_path = os.path.join(data_folder_path, file_name)

        # Save the articles data to the JSON file
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(articles_data, file, ensure_ascii=False, indent=4)

        print(f"Total number of articles fetched and stored for '{query}': {len(articles_data)}")
        print("=" * 80)

        return article_ids
    else:
        return None


def generate_manifest_and_ground_truth(queries, data_folder_path='data'):
    manifest = {}
    ground_truth = {}
    for query in queries:
        article_ids = fetch_crossref_articles(query=query, total=10, data_folder_path=data_folder_path)
        if article_ids:
            manifest[query] = f'articles_data_{query.replace(" ", "_")}.json'
            ground_truth[query] = article_ids

    # Save the manifest
    manifest_path = os.path.join(data_folder_path, 'query_file_manifest.json')
    with open(manifest_path, 'w', encoding='utf-8') as file:
        json.dump(manifest, file, ensure_ascii=False, indent=4)

    # Save the ground truth
    ground_truth_path = os.path.join(data_folder_path, 'ground_truth.json')
    with open(ground_truth_path, 'w', encoding='utf-8') as file:
        json.dump(ground_truth, file, ensure_ascii=False, indent=4)

    print("Manifest and ground truth generated successfully.")


### Step 4: Define Specific Queries and Fetch Articles for Each Query
These are the specific queries related to core machine learning (ML) and large language models (LLMs) that we want to fetch articles for. This loop iterates through each query in the queries list and calls the fetch_crossref_articles function to fetch articles for that query.

In [6]:
# Specific queries related to core ML and LLMs
queries = [
    "PageRank",
    "Expectation-Maximization",
    "Principal Component Analysis",
    "Convolutional Neural Networks",
    "Recurrent Neural Networks",
    "Long Short-Term Memory",
    "Generative Adversarial Networks",
    "Few-shot Learning",
    "Self-supervised Learning",
    "BERT model applications",
    "GPT-3 and its implications",
    "AI for Climate Change",
    "Quantum Machine Learning",
    "Deep Learning in Edge Devices",
    "AI in Digital Health",
    "Ethical AI Practices",
    "Federated Learning applications",
    "Large language models",
    "Computer Vision essentials",
    "Statistical and Probabilistic Inference",
    "Convolutional Neural Networks",
    "Recurrent Neural Networks",
    "Natural Language Processing Fundamentals",
    "Support Vector Machines",
    "Attention Mechanisms in Neural Networks",
    "Generative Adversarial Networks",
    "Techniques in Reinforcement Learning",
    "Healthcare Diagnostics using Machine Learning",
    "Autonomous Vehicle Navigation Systems",
    "Predictive Analytics in Retail",
    "Applications of Graph Neural Networks",
    "Transformers in NLP"
    "Foundations of machine learning",
    "Applications of deep learning in healthcare",
    "Training large language models for efficiency",
    "Advancements in neural network architectures",
    "Transformer models for natural language understanding",
    "Reinforcement learning strategies in gaming",
    "Techniques in supervised learning for regression",
    "Clustering algorithms in unsupervised learning",
    "Trends in natural language processing for 2023",
    "Predictive modeling with linear regression",
    "Binary classification using logistic regression",
    "Decision tree complexity and pruning techniques",
    "Ensemble methods: Beyond random forests",
    "Optimization techniques for support vector machines",
    "k-Nearest Neighbors algorithm and its efficiency",
    "Naive Bayes for text classification",
    "K-Means clustering for image segmentation",
    "Applications of hierarchical clustering in genomics",
    "Association rules mining with the Apriori algorithm",
    "Google PageRank algorithm explained",
    "Expectation-Maximization for latent variable models",
    "Principal Component Analysis in dimension reduction",
    "Deep convolutional neural networks for object recognition",
    "Temporal sequence processing with recurrent neural networks",
    "Using LSTM networks for time series forecasting",
    "Generative adversarial networks for creative AI",
    "Semantic analysis with word embeddings",
    "Modeling with Gaussian mixture models",
    "Applications of hidden Markov models in bioinformatics",
    "Markov Chain Monte Carlo methods in Bayesian analysis",
    "Strategy optimization with Monte Carlo Tree Search",
    "Game tree pruning using Alpha-Beta technique",
    "Real-time Q-Learning applications",
    "Deep Q-Networks and their implementation",
    "Policy gradient methods for robotics",
    "Applying value iteration in dynamic environments",
    "Structure learning in Bayesian networks",
    "Markov decision processes in decision analysis",
    "Gaussian processes for non-linear regression",
    "Computer vision techniques for autonomous driving",
    "Deep reinforcement learning in artificial intelligence",
    "Semantic segmentation with fully convolutional networks",
    "Robotics: Combining AI and engineering",
    "Batch normalization in deep learning",
    "Dropout regularization for preventing overfitting",
    "Activation functions in neural network training",
    "Initialization techniques for neural networks",
    "Optimization algorithms for training deep models",
    "Loss functions for machine learning models",
    "Dealing with overfitting and underfitting in machine learning",
    "Cross-validation methods for model assessment",
    "Hyperparameter tuning in neural networks",
    "Feature engineering for improved model performance",
    "Techniques in dimensionality reduction",
    "Ensemble learning strategies for better predictions",
    "Bagging and boosting in ensemble methods",
    "Evaluating machine learning models using ROC curves",
    "Bias-variance tradeoff in model training",
    "Data augmentation techniques in deep learning",
    "Early stopping as a regularization technique",
    "Explainable AI for transparent decision-making",
    "Federated learning for privacy-preserving AI",
    "Self-supervised learning from unlabelled data",
    "Transfer learning for adapting pre-trained models",
    "Meta-learning: Learning to learn effectively",
    "Multi-task learning and its challenges",
    "Singular Value Decomposition in machine learning",
    "Topic modeling with Latent Dirichlet Allocation",
    "Energy-based models for structured predictions",
    "Evolutionary algorithms in neural architecture search",
    "Transformer architecture for language models",
    "BERT model for improving text understanding",
    "Generative pre-trained transformer applications",
    "Use of ALBERT model in resource-constrained environments",
    "XLNet: Rethinking pretraining in NLP",
    "RoBERTa: A robustly optimized BERT pretraining approach",
    "T5 model and its impact on NLP tasks",
    "DistilBERT: Distilling knowledge in transformers",
    "Global vectors for word representation",
    "FastText for efficient text classification",
    "Doc2Vec for document embedding",
    "Attention mechanisms in deep learning",
    "Innovations in transformer encoder designs",
    "Decoding strategies in transformer networks",
    "Self-attention mechanisms and their benefits",
    "Multi-head attention for better representation learning",
    "Bidirectional attention for context awareness",
    "Understanding and application of bertology",
    "Fine-tuning techniques for pre-trained models",
    "Tokenization strategies in text processing",
    "Combining pre-training and fine-tuning for NLP",
    "Sequence classification with neural networks",
    "Labeling sequences in NLP",
    "Generating text with neural networks",
    "Developing question answering systems with AI",
    "Automated text summarization techniques",
    "Machine translation improvements in 2023",
    "Semantic similarity measures in texts",
    "Named entity recognition with deep learning",
    "Sentiment analysis in social media monitoring",
    "Classifying texts in multiple categories",
    "Techniques in part-of-speech tagging",
    "Dependency parsing in natural language processing",
    "Parsing constituents in language processing",
    "Challenges in real-time machine translation",
    "Image classification at scale",
    "Object detection methods in crowded scenes",
    "Advanced techniques in semantic segmentation",
    "Instance segmentation in medical imaging",
    "Human pose estimation with deep learning",
    "Technologies in face recognition systems",
    "Creative applications of generative models",
    "Key tasks in computer vision",
    "AI-driven image generation techniques",
    "Restoring old photographs with AI",
    "Artistic style transfer with deep neural networks",
    "Super-resolution via deep learning",
    "Content-based image retrieval systems",
    "Visual question answering capabilities",
    "AI applications in robotics for healthcare",
    "Path planning algorithms for autonomous vehicles",
    "Localization techniques in mobile robotics",
    "Simultaneous Localization and Mapping (SLAM) technology",
    "Perception systems in robotic applications",
    "Deep learning for advanced robotics control",
    "Model-based reinforcement learning in robotics",
    "Inverse reinforcement learning for behavior prediction",
    "Strategies for off-policy reinforcement learning",
    "On-policy vs off-policy learning in RL",
    "Exploration vs exploitation in reinforcement learning",
    "Temporal difference learning and its applications",
    "Actor-critic methods for efficient policy learning",
    "Proximal Policy Optimization in complex environments",
    "Deep deterministic policy gradient techniques",
    "Twin delayed deep deterministic policy gradients",
    "Asynchronous methods in actor-critic learning",
    "Soft Actor-Critic for high-dimensional control tasks",
    "Meta-reinforcement learning for adaptive agents",
    "Evolution strategies for optimization in AI",
    "Genetic algorithms for feature selection",
    "Ant colony optimization in path-finding",
    "Particle swarm optimization for network training",
    "AI-driven risk assessment methods",
    "AI techniques in financial forecasting"
]




print(f"Total number of articles loaded: {len(queries)}")
generate_manifest_and_ground_truth(queries, data_folder_path='data')
print(global_article_id)

Total number of articles loaded: 175
Total number of articles fetched and stored for 'PageRank': 10
Total number of articles fetched and stored for 'Expectation-Maximization': 10
Total number of articles fetched and stored for 'Principal Component Analysis': 10


KeyboardInterrupt: 

### Step 5: Compiling a centralized record of all article titles and their IDs 
Here, I consolidate titles and IDs of articles from multiple JSON files into a single, sorted JSON file named Master_record.json, excluding entries from a manifest file and any pre-existing master record file

In [15]:
def extract_titles_and_ids(data_folder_path):
    # Initialize an empty dictionary to store titles and IDs
    article_data = {}

    # Iterate through all files in the data folder
    for file_name in os.listdir(data_folder_path):
        # Skip the manifest file and Master_record.json
        if file_name in ['query_file_manifest.json','ground_truth.json', 'Master_record.json']:
            continue

        # Construct the file path
        file_path = os.path.join(data_folder_path, file_name)

        # Load the JSON data from the file
        with open(file_path, 'r', encoding='utf-8') as file:
            articles_data = json.load(file)

        # Extract titles and IDs from each article
        for article in articles_data:
            article_id = article['id']
            article_title = article['title']
            # Store the title with corresponding ID
            article_data[article_id] = article_title

    return article_data

# Specify the data folder path
data_folder_path = 'data'

# Extract titles and IDs from all JSON files in the data folder
article_data = extract_titles_and_ids(data_folder_path)

# Sort the article data by ID
sorted_article_data = dict(sorted(article_data.items()))

# Write the sorted article data to a JSON file
output_file_path = os.path.join(data_folder_path, 'Master_record.json')
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(sorted_article_data, file, ensure_ascii=False, indent=4)

print("Article titles with corresponding IDs have been saved.")


Article titles with corresponding IDs have been saved.
