### Step 1: Install Required Packages
These commands install the `feedparser` package (a library for parsing RSS and Atom feeds) and the `beautifulsoup4` package (a library for parsing HTML and XML documents).

In [None]:
pip install feedparser
pip install beautifulsoup4


### Step 2: Import Required Libraries
These libraries are essential for making HTTP requests, parsing HTML content, and handling file operations.

In [3]:
import requests
import time
import json
import os
from bs4 import BeautifulSoup

### Step 3: Define Function to Fetch Articles from Crossref API
This function fetches articles from the Crossref API based on the specified query and total number of articles. It extracts relevant information such as title, authors, published date, citation count, and abstract from the fetched articles, stores the data in a list, and saves it to a JSON file.

In [6]:
# Initialize a global counter for unique IDs
global_article_id = 0


def _format_crossref_authors(article):
    """Join the contributor names of one Crossref record into a comma-separated string."""
    names = []
    for author in article.get('author') or []:
        # Skip empty parts so a missing given/family name leaves no stray spaces.
        full_name = ' '.join(
            part for part in (author.get('given'), author.get('family')) if part
        )
        # NOTE(review): contributors without personal-name parts (e.g. organisations)
        # presumably carry a plain 'name' key instead — confirm against the Crossref docs.
        full_name = full_name or author.get('name') or ''
        if full_name:
            names.append(full_name)
    return ', '.join(names)


def _format_crossref_date(article):
    """Return the print (or else online) publication date as 'Y-M-D', or 'No Date'."""
    published = article.get('published-print') or article.get('published-online')
    if not published:
        return 'No Date'
    # 'date-parts' is a list of lists; tolerate it being missing, empty or holding None.
    date_parts = (published.get('date-parts') or [[]])[0] or []
    parts = [str(part) for part in date_parts if part is not None]
    return '-'.join(parts) if parts else 'No Date'


def _build_article_record(article, article_id):
    """Convert one raw Crossref item into the flat dict that is written to the JSON file."""
    # `or` also covers records whose 'title' key is present but holds an empty list.
    titles = article.get('title') or ['No Title']

    abstract = 'No Abstract or Keywords available'
    abstract_html = article.get('abstract', '')
    if abstract_html:
        # The abstract is delivered as markup; keep only its text content.
        abstract = BeautifulSoup(abstract_html, 'html.parser').get_text()

    return {
        "id": article_id,
        "title": titles[0],
        "authors": _format_crossref_authors(article),
        "published": _format_crossref_date(article),
        "citations": article.get('is-referenced-by-count', 0),
        "abstract": abstract,
        "link": article.get('URL', 'No URL'),
    }


def fetch_crossref_articles(query="machine learning", total=10):
    """Fetch up to `total` works matching `query` from the Crossref API and save them as JSON.

    Results are requested in pages of at most 100 rows, restricted to works
    published from 2017-01-01 onwards. Every stored article receives a unique
    id taken from the module-level counter `global_article_id`. The collected
    records are written to ``data/articles_data_<query>.json`` under the
    current working directory (spaces in the query become underscores).

    Args:
        query: Free-text search string sent to Crossref.
        total: Maximum number of articles to fetch and store.

    Returns:
        The list of article dicts that was written to the JSON file.
    """
    global global_article_id  # Refer to the global variable for article IDs

    base_url = "https://api.crossref.org/works"
    rows_per_request = 100

    articles_data = []

    for offset in range(0, total, rows_per_request):
        params = {
            "query": query,
            # Ask only for what is still missing so no more than `total` items are stored.
            "rows": min(rows_per_request, total - offset),
            "offset": offset,
            "filter": "from-pub-date:2017-01-01",
        }
        try:
            # A timeout keeps a stalled connection from hanging the whole run.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException:
            # Treat network errors like a failed page so earlier pages are still saved.
            response = None

        if response is not None and response.status_code == 200:
            articles = response.json().get('message', {}).get('items', [])
            if not articles:
                # An empty page means the result set is exhausted; stop paging.
                break
            for article in articles:
                # Increment the global ID counter for each new article
                global_article_id += 1
                articles_data.append(_build_article_record(article, global_article_id))
        else:
            print("Failed to fetch data for query:", query)
        # Pause between requests to stay polite towards the public API.
        time.sleep(1)

    # Define the directory and file path for storing the data
    data_folder_path = os.path.join(os.getcwd(), 'data')
    os.makedirs(data_folder_path, exist_ok=True)  # Create the directory if it doesn't exist
    file_path = os.path.join(data_folder_path, f'articles_data_{query.replace(" ", "_")}.json')

    # Save the articles data to the JSON file
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(articles_data, file, ensure_ascii=False, indent=4)

    print(f"Total number of articles fetched and stored for '{query}': {len(articles_data)}")
    print("=" * 80 )

    return articles_data



### Step 4: Define Specific Queries and Fetch Articles for Each Query
These are the specific queries related to core machine learning (ML) and large language models (LLMs) that we want to fetch articles for. This loop iterates through each query in the queries list and calls the fetch_crossref_articles function to fetch articles for that query.

In [7]:
# Topics to harvest: core ML areas plus LLM-related subjects
queries = [
    "machine learning",
    "deep learning",
    "large language models",
    "neural networks",
    "transformer models",
    "reinforcement learning",
    "supervised learning",
    "unsupervised learning",
    "natural language processing",
]

# Number of articles requested for each topic
ARTICLES_PER_QUERY = 5000

# Fetch and store the articles for every topic, one query at a time
for query in queries:
    print(f"Fetching articles for: {query}")
    fetch_crossref_articles(query=query, total=ARTICLES_PER_QUERY)

Fetching articles for: machine learning
Total number of articles fetched and stored for 'machine learning': 5000
Fetching articles for: deep learning
Total number of articles fetched and stored for 'deep learning': 5000
Fetching articles for: large language models
Total number of articles fetched and stored for 'large language models': 5000
Fetching articles for: neural networks
Total number of articles fetched and stored for 'neural networks': 5000
Fetching articles for: transformer models
Total number of articles fetched and stored for 'transformer models': 5000
Fetching articles for: reinforcement learning
Total number of articles fetched and stored for 'reinforcement learning': 5000
Fetching articles for: supervised learning
Total number of articles fetched and stored for 'supervised learning': 5000
Fetching articles for: unsupervised learning
Total number of articles fetched and stored for 'unsupervised learning': 5000
Fetching articles for: natural language processing
Total numbe