### Step-1: Install required packages
This command installs the feedparser (a library for parsing RSS and Atom feeds) and beautifulsoup4 (a library for parsing HTML and XML documents) packages.

In [1]:
pip install feedparser
pip install beautifulsoup4


SyntaxError: invalid syntax (1786067004.py, line 1)

### Step 2: Import Required Libraries
These libraries are essential for making HTTP requests, parsing HTML content, and handling file operations.

In [2]:
import requests
import time
import json
import os
from bs4 import BeautifulSoup

### Step 3: Define Function to Fetch Articles from Crossref API
This function fetches articles from the Crossref API based on the specified query and total number of articles. It extracts relevant information such as title, authors, published date, citation count, and abstract from the fetched articles, stores the data in a list, and saves it to a JSON file. After fetching articles for each query, a manifest file is generated that maps each query to its corresponding JSON filename. This manifest provides a clear and easily accessible reference for later evaluation purposes

In [1]:
import requests
import time
import json
import os
from bs4 import BeautifulSoup

# Initialize a global counter for unique IDs
global_article_id = 0

def fetch_crossref_articles(query="machine learning", total=10, data_folder_path='data', max_retries=5):
    global global_article_id  # Refer to the global variable for article IDs

    base_url = "https://api.crossref.org/works"
    rows_per_request = 100
    num_requests = total // rows_per_request + (1 if total % rows_per_request > 0 else 0)
    
    articles_data = []
    retries = 0

    while not articles_data and retries < max_retries:
        for i in range(num_requests):
            params = {
                "query": query,
                "rows": rows_per_request,
                "offset": i * rows_per_request,
                "filter": "from-pub-date:2017-01-01",
            }
            response = requests.get(base_url, params=params)
            if response.status_code == 200:
                data = response.json()
                articles = data['message']['items']
                for article in articles:
                    global_article_id += 1

                    title = article.get('title', ['No Title'])[0]
                    authors = ', '.join([f"{author.get('given', '')} {author.get('family', '')}" for author in article.get('author', [])])
                    link = article.get('URL', 'No URL')
                    published = article.get('published-print') or article.get('published-online')
                    published_date = 'No Date'
                    if published:
                        date_parts = published.get('date-parts', [[0]])[0]
                        published_date = '-'.join(str(part) for part in date_parts)
                    citation_count = article.get('is-referenced-by-count', 0)
                    abstract_html = article.get('abstract', '')
                    abstract = 'No Abstract or Keywords available'
                    if abstract_html:
                        soup = BeautifulSoup(abstract_html, 'html.parser')
                        abstract = soup.get_text()

                    articles_data.append({
                        "id": global_article_id,
                        "title": title,
                        "authors": authors,
                        "published": published_date,
                        "citations": citation_count,
                        "abstract": abstract,
                        "link": link
                    })
            else:
                print(f"Failed to fetch data for query: {query}, retrying... ({retries+1}/{max_retries})")
            time.sleep(1)  # Respectful delay between retries

        retries += 1  # Increment retries count after each attempt

    if not articles_data:
        print(f"Unable to fetch articles after {max_retries} retries.")

    # Proceed with saving the articles if any are fetched
    if articles_data:
        # Create the directory if it doesn't exist
        os.makedirs(data_folder_path, exist_ok=True)
        # Define the file path for storing the data
        file_name = f'articles_data_{query.replace(" ", "_")}.json'
        file_path = os.path.join(data_folder_path, file_name)

        # Save the articles data to the JSON file
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(articles_data, file, ensure_ascii=False, indent=4)

        print(f"Total number of articles fetched and stored for '{query}': {len(articles_data)}")
        print("=" * 80)

        return file_name
    else:
        return None

# Generate a manifest mapping queries to filenames
def generate_manifest(queries, data_folder_path='data'):
    manifest = {}
    for query in queries:
        file_name = fetch_crossref_articles(query=query, total=10, data_folder_path=data_folder_path)
        manifest[query] = file_name

    manifest_path = os.path.join(data_folder_path, 'query_file_manifest.json')
    with open(manifest_path, 'w', encoding='utf-8') as file:
        json.dump(manifest, file, ensure_ascii=False, indent=4)

    print("Manifest generated successfully.")


### Step 4: Define Specific Queries and Fetch Articles for Each Query
These are the specific queries related to core machine learning (ML) and large language models (LLMs) that we want to fetch articles for. This loop iterates through each query in the queries list and calls the fetch_crossref_articles function to fetch articles for that query.

In [3]:
# Specific queries related to core ML and LLMs


queries = [
    "machine learning",
    "deep learning",
    "large language models",
    "neural networks",
    "transformer models",
    "reinforcement learning",
    "supervised learning",
    "unsupervised learning",
    "natural language processing"

    
    "Linear Regression",
    "Logistic Regression",
    "Decision Trees",
    "Random Forest",
    "Support Vector Machines (SVM)",
    "k-Nearest Neighbors (k-NN)",
    "Naive Bayes Classifier",
    "K-Means Clustering",
    "Hierarchical Clustering",
    "Apriori Algorithm",
    "PageRank",
    "Expectation-Maximization (EM)",
    "Principal Component Analysis (PCA)",
    "Convolutional Neural Networks (CNN)",
    "Recurrent Neural Networks (RNN)",
    "Long Short-Term Memory (LSTM)",
    "Generative Adversarial Networks (GANs)",
    "Word Embeddings",
    "Gaussian Mixture Models (GMM)",
    "Hidden Markov Models (HMM)",
    "Transformer Models",
    "Markov Chain Monte Carlo (MCMC)",
    "Monte Carlo Tree Search (MCTS)",
    "Alpha-Beta Pruning",
    "Q-Learning",
    "Deep Q-Networks (DQN)",
    "Policy Gradient Methods",
    "Value Iteration",
    "Bayesian Networks",
    "Markov Decision Processes (MDP)",
    "Gaussian Processes",
    "Reinforcement Learning",
    "Natural Language Processing (NLP)",
    "Computer Vision (CV)",
    "Robotics",
    "Deep Learning",
    "Machine Learning",
    "Neural Networks",
    "Artificial Intelligence",
    "Supervised Learning",
    "Unsupervised Learning",
    "Semi-Supervised Learning",
    "Transfer Learning",
    "Adversarial Learning",
    "Gradient Descent",
    "Backpropagation",
    "Stochastic Gradient Descent (SGD)",
    "Batch Normalization",
    "Dropout Regularization",
    "Activation Functions",
    "Weight Initialization",
    "Optimization Algorithms",
    "Loss Functions",
    "Overfitting and Underfitting",
    "Cross-Validation",
    "Hyperparameter Tuning",
    "Feature Engineering",
    "Dimensionality Reduction",
    "Ensemble Learning",
    "Bagging and Boosting",
    "Model Evaluation Metrics",
    "Confusion Matrix",
    "Precision and Recall",
    "F1 Score",
    "Receiver Operating Characteristic (ROC) Curve",
    "Area Under the Curve (AUC)",
    "Bias-Variance Tradeoff",
    "Data Augmentation",
    "Early Stopping",
    "Model Interpretability",
    "Gradient Boosting Machines (GBM)",
    "XGBoost",
    "LightGBM",
    "CatBoost",
    "AutoML",
    "Explainable AI (XAI)",
    "Federated Learning",
    "Self-Supervised Learning",
    "One-Shot Learning",
    "Zero-Shot Learning",
    "Transfer Learning",
    "Meta-Learning",
    "Multi-Task Learning",
    "Singular Value Decomposition (SVD)",
    "Latent Dirichlet Allocation (LDA)",
    "Boltzmann Machines",
    "Neuroevolution",
    "Neuro-Linguistic Programming (NLP)",
    "Transformer Architecture",
    "BERT (Bidirectional Encoder Representations from Transformers)",
    "GPT (Generative Pre-trained Transformer)",
    "BERTweet",
    "ALBERT (A Lite BERT)",
    "XLNet",
    "RoBERTa",
    "T5 (Text-To-Text Transfer Transformer)",
    "DistilBERT",
    "Word2Vec",
    "GloVe (Global Vectors for Word Representation)",
    "FastText",
    "Doc2Vec",
    "Attention Mechanism",
    "Transformer Encoder",
    "Transformer Decoder",
    "Attention Is All You Need",
    "Self-Attention",
    "Multi-Head Attention",
    "Bidirectional Attention",
    "Bertology",
    "Fine-Tuning",
    "Tokenization",
    "Pre-training and Fine-tuning",
    "Sequence Classification",
    "Sequence Labeling",
    "Text Generation",
    "Question Answering",
    "Summarization",
    "Translation",
    "Semantic Similarity",
    "Named Entity Recognition (NER)",
    "Sentiment Analysis",
    "Text Classification",
    "Part-of-Speech Tagging (POS Tagging)",
    "Dependency Parsing",
    "Constituency Parsing",
    "Machine Translation",
    "Image Classification",
    "Object Detection",
    "Semantic Segmentation",
    "Instance Segmentation",
    "Pose Estimation",
    "Face Recognition",
    "Generative Models",
    "Computer Vision Tasks",
    "Image Generation",
    "Image Restoration",
    "Style Transfer",
    "Super-Resolution",
    "Content-Based Image Retrieval (CBIR)",
    "Visual Question Answering (VQA)",
    "Robotics",
    "Path Planning",
    "Localization",
    "Simultaneous Localization and Mapping (SLAM)",
    "Robot Perception",
    "Reinforcement Learning",
    "Deep Reinforcement Learning",
    "Model-Free Reinforcement Learning",
    "Model-Based Reinforcement Learning",
    "Value Iteration",
    "Policy Iteration",
    "Exploration-Exploitation Tradeoff",
    "Temporal Difference Learning",
    "Deep Q-Learning",
    "Policy Gradient Methods",
    "Actor-Critic Methods",
    "Proximal Policy Optimization (PPO)",
    "Deep Deterministic Policy Gradient (DDPG)",
    "Twin Delayed DDPG (TD3)",
    "Asynchronous Advantage Actor-Critic (A3C)",
    "Soft Actor-Critic (SAC)",
    "Monte Carlo Methods",
    "Temporal Difference Learning (TD)",
    "SARSA (State-Action-Reward-State-Action)",
    "Q-Learning",
    "Deep Q-Networks (DQN)",
    "Policy Gradient Methods",
    "Proximal Policy Optimization (PPO)",
    "Trust Region Policy Optimization (TRPO)",
    "Deep Deterministic Policy Gradient (DDPG)",
    "Twin Delayed DDPG (TD3)",
    "Asynchronous Advantage Actor-Critic (A3C)",
    "Soft Actor-Critic (SAC)",
    "Meta-Learning",
    "Model-Agnostic Meta-Learning (MAML)",
    "Learning to Learn",
    "Few-Shot Learning",
    "Zero-Shot Learning",
    "One-Shot Learning",
    "Multi-Task Learning",
    "Reinforcement Learning",
    "Model-Based Reinforcement Learning",
    "Model-Free Reinforcement Learning",
    "Deep Reinforcement Learning",
    "Imitation Learning",
    "Inverse Reinforcement Learning",
    "Off-Policy Learning",
    "On-Policy Learning",
    "Exploration-Exploitation Tradeoff",
    "Temporal Difference Learning",
    "Actor-Critic Methods",
    "Policy Gradient Methods",
    "Evolution Strategies",
    "Genetic Algorithms",
    "Coevolution",
    "NEAT (NeuroEvolution of Augmenting Topologies)",
    "NSGA-II (Non-dominated Sorting Genetic Algorithm II)",
    "CMA-ES (Covariance Matrix Adaptation Evolution Strategy)",
    "Particle Swarm Optimization (PSO)",
    "Ant Colony Optimization (ACO)",
    "CV object recognition systems",
    "Robotics path planning techniques",
    "Deep learning in facial recognition",
    "Learning Algorithms for regression modeling",
    "Optimization algorithms for deep learning models",
    "LSTMs for sequence prediction",
    "Transformer models for sentiment analysis",
    "GANs for video generation",
    "RNNs for text generation",
    "Probabilistic Models for risk assessment",
    "Gaussian Models for data clustering",
    "Bayesian Models for decision making",
    "Deep Learning in autonomous systems",
    "Neural Networks for image classification",
    "Machine Learning algorithms for fraud detection",
    "NLP sentiment analysis frameworks",
    "CV object detection algorithms",
    "Robotics motion planning algorithms",
    "Deep learning in object detection",
    "Learning Algorithms for pattern recognition",
    "Optimization algorithms for neural network training",
    "LSTMs for natural language understanding",
    "Transformer models for machine translation",
    "GANs for style transfer",
    "RNNs for language modeling",
    "Probabilistic Models for statistical inference",
    "Gaussian Models for image compression",
    "Bayesian Models for spam detection",
    "Deep Learning in financial forecasting",
    "Neural Networks for fraud detection",
    "Machine Learning algorithms for recommendation systems",
    "NLP sentiment analysis frameworks",
    "CV object recognition systems",
    "Robotics motion control algorithms",
    "Deep learning in medical imaging",
    "Learning Algorithms for classification tasks",
    "Optimization algorithms for machine learning optimization",
    "LSTMs for sentiment classification",
    "Transformer models for language understanding",
    "GANs for image synthesis",
    "RNNs for sentiment classification",
    "Probabilistic Models for uncertainty estimation",
    "Gaussian Models for regression analysis",
    "Bayesian Models for decision support",
    "Deep Learning in healthcare",
    "Neural Networks in finance",
    "Machine Learning for fraud detection",
    "NLP sentiment analysis",
    "Robotics motion planning",
    "Deep learning for image recognition"]



print(f"Total number of articles loaded: {len(queries)}")

# generate_manifest(queries, data_folder_path='data')
print(global_article_id)

Total number of articles loaded: 250
24770
