# Data Fetching and Cleaning

In [None]:
import requests
from xml.etree import ElementTree as ET # Import the ElementTree module from the xml.etree module to parse the XML data
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
import getpass # Import the getpass module to hide the password input

In [None]:
directory = r"" # Define the output directory for saved articles
email = getpass.getpass("Enter your e-mail address: ") # Email for tracking purposes - required by PubMed guidelines.
api_key = getpass.getpass("Enter your API key: ") # YOUR PubMed API KEY

### Fetch articles metadata from PubMED

In [None]:
# Define your Search Terms, and date range 
search_terms = ["medical genetics", "radiology", "cardiology", "dermatology", "oncology", "neurology", "artificial intelligence", "machine learning"]

# Set the date range
# Recommended format: YYYY/MM/DD, otherwise there is a problem with data cleaning
# Articles are often published on the 1st day of the month, which leads to the concentration of the articles in the beginning of the month.
start_date = "2024/01/01"  # Start of time range
end_date = "2024/12/01"   # End of time range
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"  # PubMed API base URL (https://www.ncbi.nlm.nih.gov/books/NBK25500/)
# The E-Utilities are a suite of eight server-side programs that provide a stable interface into the Entrez query and database system at the National Center for Biotechnology Information (NCBI)

# List to store article data
articles_data = []

# Search terms to lowercase. Using set() to remove duplicates, it only stores unique elements.
search_terms = list(set([term.lower() for term in search_terms]))

# Fetch articles for each search term by creating a loop which iterates over each search term and outputs a dictionary with the article data.
for search_term in search_terms:
    print(f"\nSearching for articles with term: {search_term}")

    search_params = { # Parameters for the PubMed search query. These parameters are used to construct the URL for the HTTP GET request to the PubMed API.
        "db": "pubmed",  # Database to search in (PubMed)
        "term": f"{search_term}[MeSH Terms] AND ({start_date}:{end_date}[PDAT])",  # "term" is the search query. [MeSH Terms] limits search to only MeSH terms. Otherwise words will be queried from Titles, Abstracts, MeSH terms, and Other indexed metadata. PDAT = Publication Date field in PubMed.
        "retmax": 100000,  # Maximum results to retrieve per request for esearch.
        "retmode": "xml",  # Get results in XML format
        "usehistory": "y",  # Stores the search results on the PubMed server for later (and faster) retrieval (when usehistory=y)
        "api_key": api_key,  # Include API key. Allows for 10 requests per second instead of 3
        "tool": "Pubmed Scraper",  # Tool name for tracking purposes - required by PubMed guidelines
        "email": email  # Email for tracking purposes - required by PubMed guidelines
    }

    # Uses the requests library to make an HTTP GET request to the PubMed API (esearch.fcgi) to search for articles. Fcgi = Fast Common Gateway Interface. It is a standard for web servers to execute programs that generate web pages.
    search_response = requests.get(f"{base_url}/esearch.fcgi", params=search_params) #"esearch.fcgi" endpoint retrieves a list of article's UIDs matching a query from the Entrez databases. UID = Unique Identifier. 
    if search_response.status_code != 200: # Check if the request was successful. If the status code is not 200, an error occurred. 200 means the request was successful.
        print(f"Error in PubMed search: {search_response.status_code}")
        continue # Skip to the next search term if an error occurs. Using continue allows the script to skip the current search term and proceed to the next one, instead of stopping the entire execution like when using "break"

    # Parse the search results. The code extracts the WebEnv, QueryKey, and Count values from the XML response
    search_tree = ET.fromstring(search_response.content)  # Parses an XML string into an ElementTree object. Converts the XML response from PubMed into a tree structure (hierarchical data structure) that can be navigated programmatically. ET.fromstring() returns the root element of the tree. Root element is the top-level element in the tree.
    web_env = search_tree.find("WebEnv").text  # Locates the <WebEnv> tag within the tree and extracts the text content inside the tag. WebEnv is a session key that stores the search results on the PubMed server. "Find" searches for the first occurrence of a specified element in the tree
    query_key = search_tree.find("QueryKey").text  # Identifies the specific query in the session. QueryKey is a unique identifier for the search query
    total_results = int(search_tree.find("Count").text)  # Retrieves the total number of results found. Find the <Count> tag and extract the text content inside the tag. Convert the text content to an integer

    print(f"Total results found: {total_results}")

    # Step 2: Fetch article details using pagination (handling more than 100000 results).
    batch_size = 10000  # Maximum allowed results per batch for efetch.fcgi.

    for start in range(0, total_results, batch_size): # Loop over the total number of results in batches of 100000. The range() function generates a sequence of numbers from 0 to total_results with a step size of batch_size.
        fetch_params = {
            "db": "pubmed",  # Database to search in (PubMed)
            "query_key": query_key,  # Use the query key from the search results
            "WebEnv": web_env,  # Use the WebEnv from the search results
            "retmode": "xml",
            "retstart": start,  # Start index for pagination. The first batch starts at 0, the second batch at 100000, etc.
            "retmax": batch_size,  # Max number of results to retrieve per batch
            "api_key": api_key,  
            "tool": "Pubmed Scraper",  
            "email": email  #
        }

        # Uses the requests library to make an HTTP GET request to the PubMed API (efetch.fcgi) to retrieve article details. "efetch.fcgi" endpoint retrieves records in the requested format from the Entrez databases.
        fetch_response = requests.get(f"{base_url}/efetch.fcgi", params=fetch_params)  # Fetch the articles.
        if fetch_response.status_code != 200:
            print(f"Error in fetching article details from {start}: {fetch_response.status_code}")
            break  # Exit the batch loop if an error occurs. If one batch fails, all subsequent batches for the same search term will likely fail too, so code exits that batch loop to move on to the next search term and save time and resources

        # Parse article details. The code extracts the article metadata from the XML response.
        articles_tree = ET.fromstring(fetch_response.content)  # Converts the XML data into an element tree.

        for article in articles_tree.findall("PubmedArticle"): # Loop over each article in the response. Findall() returns a list of all matching elements in the tree.
            try:
                # Extract article metadata. If a field is not found, it will be set to "Unknown".
                pmid = article.find(".//PMID").text if article.find(".//PMID") is not None else "Unknown"
                title = article.find(".//ArticleTitle").text if article.find(".//ArticleTitle") is not None else "Unknown"
                journal = article.find(".//Journal/Title").text if article.find(".//Journal/Title") is not None else "Unknown"
                pub_date_tag = article.find(".//PubDate")  # Extract the publication date tag.
                pub_date = " ".join(pub_date_tag.itertext()).strip() if pub_date_tag is not None else "Unknown"

                # This is a list comprehension that loops over each author in the article and extracts the ForeName and LastName tags. If the ForeName or LastName tags are not found, it sets ForeName and/or LastName author name to "Unknown".
                authors = [
                    f"{a.find('ForeName').text if a.find('ForeName') is not None else 'Unknown'} " +
                    f"{a.find('LastName').text if a.find('LastName') is not None else 'Unknown'}"
                    for a in article.findall(".//Author")
                ] if article.findall(".//Author") else ["Unknown"] # If there are no authors at all, set to "Unknown".

                # Append a row for each author separately (long format). This is useful for analyzing authorship patterns.
                for author in authors:
                    articles_data.append({
                        "PMID": pmid,
                        "Search Term": search_term,
                        "Author": author,
                        "Publication Date": pub_date,
                        "Title": title,
                        "Journal": journal
                    })

            except Exception as e:
                print(f"Error parsing article: {e}")
                continue

        print(f"Fetched {min(start + batch_size, total_results)} of {total_results} articles for '{search_term}'...") # Print the number of articles fetched for the current batch.
        time.sleep(1)  # Prevent overwhelming the API with requests by pausing for 1 second between batches.

# Convert to Pandas DataFrame
output = pd.DataFrame(articles_data)

print("Done")



Searching for articles with term: neurology
Total results found: 86
Fetched 86 of 86 articles for 'neurology'...

Searching for articles with term: medical genetics
Total results found: 72
Fetched 72 of 72 articles for 'medical genetics'...

Searching for articles with term: cardiology
Total results found: 81
Fetched 81 of 81 articles for 'cardiology'...

Searching for articles with term: radiology
Total results found: 4445
Fetched 4445 of 4445 articles for 'radiology'...

Searching for articles with term: oncology
Total results found: 20343
Fetched 20343 of 20343 articles for 'oncology'...

Searching for articles with term: dermatology
Total results found: 76
Fetched 76 of 76 articles for 'dermatology'...

Searching for articles with term: machine learning
Total results found: 3426
Fetched 3426 of 3426 articles for 'machine learning'...

Searching for articles with term: artificial intelligence
Total results found: 5330
Fetched 5330 of 5330 articles for 'artificial intelligence'...
D

In [9]:
output

Unnamed: 0,PMID,Search Term,Author,Publication Date,Title,Journal
0,39676738,neurology,Robert Achram,2024 Dec,Enhancing Apheresis Knowledge: An Educational ...,Journal of clinical apheresis
1,39676738,neurology,E Alexander Dent,2024 Dec,Enhancing Apheresis Knowledge: An Educational ...,Journal of clinical apheresis
2,39676738,neurology,Marianne Yee,2024 Dec,Enhancing Apheresis Knowledge: An Educational ...,Journal of clinical apheresis
3,39676738,neurology,John D Roback,2024 Dec,Enhancing Apheresis Knowledge: An Educational ...,Journal of clinical apheresis
4,39676738,neurology,Jeannette Guarner,2024 Dec,Enhancing Apheresis Knowledge: An Educational ...,Journal of clinical apheresis
...,...,...,...,...,...,...
182715,35143339,artificial intelligence,Rajalakshmi Gurusamy,2024 Dec,DGSLSTM: Deep Gated Stacked Long Short-Term Me...,Big data
182716,35143339,artificial intelligence,Siva Ranjani Seenivasan,2024 Dec,DGSLSTM: Deep Gated Stacked Long Short-Term Me...,Big data
182717,35045801,artificial intelligence,Joshua Autton Carter,2024 Nov,A support vector machine algorithm can success...,Sports biomechanics
182718,35045801,artificial intelligence,Adrian Rodriguez Rivadulla,2024 Nov,A support vector machine algorithm can success...,Sports biomechanics


### Display a couple of basic information about the DataFrame

In [None]:
# Make a working copy of the output
df_output = output.copy()

# drop any rows where PMID is NaN or None
df_output= df_output.dropna(subset=['PMID'])
df_output.head()

In [None]:
# Display the number of fetched articles
print("Articles fetched:", len(df_output["PMID"].unique()))

Articles fetched: 36570


In [None]:
# sums the occurance of each search term
# Group by "Search Term", count occurrences
df_output.groupby("Search Term").count()

Unnamed: 0_level_0,PMID,Author,Publication Date,Title,Journal
Search Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
artificial intelligence,65756,65756,65756,65713,65756
cardiology,12812,12812,12812,12808,12812
dermatology,7633,7633,7633,7633,7633
machine learning,69121,69121,69121,69065,69121
medical genetics,6007,6007,6007,6006,6007
neurology,5314,5314,5314,5298,5314
oncology,82275,82275,82275,82000,82275
radiology,70103,70103,70103,69885,70103


In [None]:
# Check what columns still contain NaN values and how many
nan_counts = df_output.isna().sum()

# Filter columns with NaN values
nan_columns = nan_counts[nan_counts > 0]
print(nan_columns)

Title    613
dtype: int64


In [None]:
#Checking the data types of the columns
df_output.dtypes

PMID                object
Search Term         object
Author              object
Publication Date    object
Title               object
Journal             object
dtype: object

In [None]:
# Checking the unique dates in the "Publication Date" column
print(df_output["Publication Date"].unique())

### Data cleaning

In [None]:
# Creating copy of the dataframe fro data cleaning
df_output_clean = df_output.copy()

In [None]:
# The code below is used to clean the "Publication Date" column. It was completely generated by Copilot. It works correctyl but slowly, so it should be optimized in the future for bigger data sets. 

# Define the set of valid month abbreviations (capitalized)
valid_months = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", 
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}

def split_pub_date(date_str):
    
    #Given a publication date string, split it into Year, Month, and Day.
    #If the second token (month) is not in valid_months (even after splitting on '-'),
    #return only the year (i.e. Month and Day as NaN).

    # First, make sure date_str is a string
    if pd.isnull(date_str):
        return pd.Series([np.nan, np.nan, np.nan], index=["Year", "Month", "Day"])
    
    # Split by whitespace
    parts = date_str.split()
    # At minimum, we expect the year
    year = parts[0]
    
    if len(parts) >= 2:
        # The "month" part may be something like "Nov", "Jul-Aug", "Jan-Feb", or even "Summer"
        month_token = parts[1]
        # If there is a hyphen, take only the first piece
        if '-' in month_token:
            candidate = month_token.split('-')[0]
        else:
            candidate = month_token
        # Normalize the candidate (capitalize first letter, rest lower case)
        candidate = candidate.capitalize()
        if candidate in valid_months:
            month = candidate
            # If there is a third part, take it as the day; otherwise, set day to NaN
            day = parts[2] if len(parts) >= 3 else np.nan
            return pd.Series([year, month, day], index=["Year", "Month", "Day"])
        else:
            # The second token is not a standard month abbreviation;
            # so we just return the year, with Month and Day as NaN.
            return pd.Series([year, np.nan, np.nan], index=["Year", "Month", "Day"])
    else:
        # Only the year is present
        return pd.Series([year, np.nan, np.nan], index=["Year", "Month", "Day"])

# Now apply this function to the "Publication Date" column:
df_output_clean[['Year', 'Month', 'Day']] = df_output_clean['Publication Date'].apply(split_pub_date)

# Convert Year and Day to numeric 
df_output_clean['Year'] = pd.to_numeric(df_output_clean['Year'], errors='coerce')
df_output_clean['Day'] = pd.to_numeric(df_output_clean['Day'], errors='coerce')

# Display the cleaned DataFrame (or at least check unique Month values)
print(df_output_clean['Month'].unique())

In [234]:
#Check if any rows were lost during the operation in comparison to the original DataFrame
df_output["PMID"].count() - df_output_clean["PMID"].count()

0

In [235]:
# Count rows where the 'Day' column is NaN
df_output_clean['Day'].isna().sum()

181760

In [236]:
# Checking how many NaN values does the separate date columns contain. Confirming, that the day column contains most NaN values.
df_output_clean[['Year', "Month", "Day"]].isna().sum()

Year          0
Month     29549
Day      181760
dtype: int64

In [237]:
# Checking if there is only one proper, unique value in the "Year" column
df_output_clean["Year"].unique()

array([2024, 2025, 2023, 2018, 2022], dtype=int64)

In [238]:
# Extract the year from the start_date and end_date
start_year = int(start_date.split('/')[0])
end_year = int(end_date.split('/')[0])

# Define the searched year values based on the input range
searched_years = list(range(start_year, end_year + 1))

# Filter the DataFrame to keep only rows where the "Year" column matches the searched values
df_output_clean = df_output_clean[df_output_clean['Year'].isin(searched_years)]

# Display the updated DataFrame
df_output_clean

Unnamed: 0,PMID,Search Term,Author,Publication Date,Title,Journal,Year,Month,Day
0,39926598,artificial intelligence,Zuhui Pu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,,
1,39926598,artificial intelligence,Tony Bowei Wang,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,,
2,39926598,artificial intelligence,Ying Lu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,,
3,39926598,artificial intelligence,Zijing Wu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,,
4,39926598,artificial intelligence,Yuxian Chen,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,,
...,...,...,...,...,...,...,...,...,...
319016,38964961,radiology,Samer Sayyed,2024 Sep-Oct,Comments on CTCA-guided selective invasive gra...,Journal of cardiovascular computed tomography,2024,Sep,
319017,38964877,radiology,Ayodipupo S Oguntade,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0
319018,38964877,radiology,Hannah Taylor,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0
319019,38964877,radiology,Ben Lacey,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0


In [239]:
# Making sure, that there is only one proper, unique value in the "Year" column
df_output_clean["Year"].unique()

array([2024, 2023], dtype=int64)

In [240]:
# Checking the unique values in the "Month" column
df_output_clean["Month"].unique()

array([nan, 'Nov', 'Dec', 'Sep', 'Oct', 'Jan', 'Aug', 'Mar', 'Jul', 'Feb',
       'Jun', 'May', 'Apr'], dtype=object)

In [None]:
# Modify the "Month" column to contain only the first month in case of values like "Jan-Feb", because the publication date is usually in the first month of the range.

df_output_clean.loc[:,'Month'] = df_output_clean['Month'].str.split('-').str[0]

# Display the updated DataFrame to check the changes
df_output_clean['Month'].unique()

array([nan, 'Nov', 'Dec', 'Sep', 'Oct', 'Jan', 'Aug', 'Mar', 'Jul', 'Feb',
       'Jun', 'May', 'Apr'], dtype=object)

In [242]:
# Not knowing the exact Month of publication, we can assume that the article was published on the first Month of the Year.
df_output_clean = df_output_clean.fillna({'Month': "Jan"})
df_output_clean

Unnamed: 0,PMID,Search Term,Author,Publication Date,Title,Journal,Year,Month,Day
0,39926598,artificial intelligence,Zuhui Pu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,
1,39926598,artificial intelligence,Tony Bowei Wang,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,
2,39926598,artificial intelligence,Ying Lu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,
3,39926598,artificial intelligence,Zijing Wu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,
4,39926598,artificial intelligence,Yuxian Chen,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,
...,...,...,...,...,...,...,...,...,...
319016,38964961,radiology,Samer Sayyed,2024 Sep-Oct,Comments on CTCA-guided selective invasive gra...,Journal of cardiovascular computed tomography,2024,Sep,
319017,38964877,radiology,Ayodipupo S Oguntade,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0
319018,38964877,radiology,Hannah Taylor,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0
319019,38964877,radiology,Ben Lacey,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0


In [243]:
# Not knowing the exact day of publication, we can assume that the article was published on the first day of the month.
df_output_clean = df_output_clean.fillna({'Day': 1})
df_output_clean

Unnamed: 0,PMID,Search Term,Author,Publication Date,Title,Journal,Year,Month,Day
0,39926598,artificial intelligence,Zuhui Pu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1.0
1,39926598,artificial intelligence,Tony Bowei Wang,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1.0
2,39926598,artificial intelligence,Ying Lu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1.0
3,39926598,artificial intelligence,Zijing Wu,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1.0
4,39926598,artificial intelligence,Yuxian Chen,2024,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1.0
...,...,...,...,...,...,...,...,...,...
319016,38964961,radiology,Samer Sayyed,2024 Sep-Oct,Comments on CTCA-guided selective invasive gra...,Journal of cardiovascular computed tomography,2024,Sep,1.0
319017,38964877,radiology,Ayodipupo S Oguntade,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0
319018,38964877,radiology,Hannah Taylor,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0
319019,38964877,radiology,Ben Lacey,2024 Jul 04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4.0


In [244]:
# Convert the 'Day' column to integer, setting errors='coerce' to handle non-numeric values
df_output_clean['Day'] = pd.to_numeric(df_output_clean['Day'], errors='coerce').astype('Int64')

# Display the updated DataFrame to check the changes
df_output_clean.dtypes

PMID                object
Search Term         object
Author              object
Publication Date    object
Title               object
Journal             object
Year                 int64
Month               object
Day                  Int64
dtype: object

In [None]:
df_output_clean["Month"].unique() # Check the unique values in the "Month" column

array(['Jan', 'Nov', 'Dec', 'Sep', 'Oct', 'Aug', 'Mar', 'Jul', 'Feb',
       'Jun', 'May', 'Apr'], dtype=object)

In [None]:
df_output_clean["Day"].unique() # Check the unique values in the "Day" column

<IntegerArray>
[ 1, 22, 26, 24, 27, 12, 28, 29, 25, 21, 30,  2, 18, 23, 15,  3, 10,  9, 17,
  7, 31, 20, 14, 19, 13, 16, 11,  8,  6,  5,  4]
Length: 31, dtype: Int64

In [None]:
# Check the distributions of articles by day of the month

import plotly.express as px
# Count the NaN values in the 'Day' column
nan_count = df_output_clean["Day"].isna().sum()

# Calculate the percentage of NaN values
total_count = len(df_output_clean["Day"])
nan_percentage = (nan_count / total_count) * 100

# Print the results
print(f"Count of NaN values in 'Day' column: {nan_count}")
print(f"Percentage of NaN values in 'Day' column: {nan_percentage:.2f}%")

px.pie(df_output_clean, names='Day', title='Distribution of Articles by Day')

# Count the values in the 'Day' column
day_counts = df_output_clean["Day"].value_counts()

pd.DataFrame(day_counts)
px.histogram(df_output_clean, x='Day', title='Distribution of Articles by Day')

In [248]:
# Checking types of data in columns
df_output_clean.dtypes

PMID                object
Search Term         object
Author              object
Publication Date    object
Title               object
Journal             object
Year                 int64
Month               object
Day                  Int64
dtype: object

In [249]:
# Make sure, that 'PMID' column is a  string type to avoid errors by creating a database
df_output_clean['PMID'] = df_output_clean['PMID'].astype(str)

# Verify the data type
print(df_output_clean.dtypes)

PMID                object
Search Term         object
Author              object
Publication Date    object
Title               object
Journal             object
Year                 int64
Month               object
Day                  Int64
dtype: object


In [250]:
# Create a new "Publication Date" column from "Year", "Month", and "Day" columns
df_output_clean['Publication Date'] = pd.to_datetime(df_output_clean[['Year', 'Month', 'Day']].astype(str).agg('-'.join, axis=1), errors='coerce')

# Display the updated DataFrame to check the changes
df_output_clean

Unnamed: 0,PMID,Search Term,Author,Publication Date,Title,Journal,Year,Month,Day
0,39926598,artificial intelligence,Zuhui Pu,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
1,39926598,artificial intelligence,Tony Bowei Wang,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
2,39926598,artificial intelligence,Ying Lu,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
3,39926598,artificial intelligence,Zijing Wu,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
4,39926598,artificial intelligence,Yuxian Chen,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
...,...,...,...,...,...,...,...,...,...
319016,38964961,radiology,Samer Sayyed,2024-09-01,Comments on CTCA-guided selective invasive gra...,Journal of cardiovascular computed tomography,2024,Sep,1
319017,38964877,radiology,Ayodipupo S Oguntade,2024-07-04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4
319018,38964877,radiology,Hannah Taylor,2024-07-04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4
319019,38964877,radiology,Ben Lacey,2024-07-04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4


In [None]:
df_output_clean['Month'].unique() # Check the unique values in the "Month" column

array(['Jan', 'Nov', 'Dec', 'Sep', 'Oct', 'Aug', 'Mar', 'Jul', 'Feb',
       'Jun', 'May', 'Apr'], dtype=object)

In [252]:
# Filter out articles with an incorrect publication date

from datetime import datetime

# Get the current date
current_date = datetime.now()

# Convert start_date and end_date to datetime
start_date_dt = pd.to_datetime(start_date, format='%Y/%m/%d', errors='coerce')
end_date_dt = pd.to_datetime(end_date, format='%Y/%m/%d', errors='coerce')

# Filter the DataFrame to keep only rows where the "Publication Date" is in the defined range and is less than or equal to the current date
df_output_clean = df_output_clean[(df_output_clean['Publication Date'] >= start_date_dt) & (df_output_clean['Publication Date'] <= end_date_dt) & (df_output_clean['Publication Date'] <= current_date)]

print("Removed rows:", len(df_output) - len(df_output_clean))

Removed rows: 55053


In [None]:
df_output_clean['Month'].unique() # Check the unique values in the "Month" column

array(['Jan', 'Nov', 'Dec', 'Sep', 'Oct', 'Aug', 'Mar', 'Jul', 'Feb',
       'Jun', 'May', 'Apr'], dtype=object)

In [None]:
df_output_clean['Publication Date'].unique() #  Check the unique values in the "Publication Date" column

<DatetimeArray>
['2024-01-01 00:00:00', '2024-11-01 00:00:00', '2024-11-22 00:00:00',
 '2024-12-01 00:00:00', '2024-11-26 00:00:00', '2024-09-01 00:00:00',
 '2024-10-01 00:00:00', '2024-01-24 00:00:00', '2024-08-27 00:00:00',
 '2024-10-12 00:00:00',
 ...
 '2023-01-09 00:00:00', '2023-05-14 00:00:00', '2023-07-19 00:00:00',
 '2023-01-21 00:00:00', '2023-02-16 00:00:00', '2023-04-12 00:00:00',
 '2023-01-11 00:00:00', '2023-01-27 00:00:00', '2024-04-29 00:00:00',
 '2024-04-26 00:00:00']
Length: 540, dtype: datetime64[ns]

In [None]:
# Fill NaN values with "Unknown Title" and convert to string
df_output_clean['Title'] = df_output_clean['Title'].fillna("Unknown Title").astype(str) 

In [256]:
# Check what columns still contain NaN values and how many
df_output_clean.isna().sum()

PMID                0
Search Term         0
Author              0
Publication Date    0
Title               0
Journal             0
Year                0
Month               0
Day                 0
dtype: int64

In [None]:
print(f"Newest Publication Date:", df_output_clean['Publication Date'].max()) # Display the newest publication date

Newest Publication Date: 2024-12-01 00:00:00


In [258]:
# Checking if the cleaned data set looks appropriate
df_output_clean

Unnamed: 0,PMID,Search Term,Author,Publication Date,Title,Journal,Year,Month,Day
0,39926598,artificial intelligence,Zuhui Pu,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
1,39926598,artificial intelligence,Tony Bowei Wang,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
2,39926598,artificial intelligence,Ying Lu,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
3,39926598,artificial intelligence,Zijing Wu,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
4,39926598,artificial intelligence,Yuxian Chen,2024-01-01,Deciphering the role of metal ion transport-re...,Frontiers in immunology,2024,Jan,1
...,...,...,...,...,...,...,...,...,...
319016,38964961,radiology,Samer Sayyed,2024-09-01,Comments on CTCA-guided selective invasive gra...,Journal of cardiovascular computed tomography,2024,Sep,1
319017,38964877,radiology,Ayodipupo S Oguntade,2024-07-04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4
319018,38964877,radiology,Hannah Taylor,2024-07-04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4
319019,38964877,radiology,Ben Lacey,2024-07-04,"Adiposity, fat-free mass and incident heart fa...",Open heart,2024,Jul,4


In [None]:
df_output_clean.dtypes # Checking the data types of the columns

PMID                        object
Search Term                 object
Author                      object
Publication Date    datetime64[ns]
Title                       object
Journal                     object
Year                         int64
Month                       object
Day                          Int64
dtype: object

In [None]:
# Display the distribution of the articles over the years and months

import plotly.express as px

# Create a new column that combines Year and Month
df_output_clean.loc[:, 'Year-Month'] = df_output_clean['Year'].astype(str) + '-' + df_output_clean['Month']

# Group by "Year-Month" and count occurrences
grouped_df = df_output_clean.groupby(['Year-Month']).size().reset_index(name='Count')

# Create a bar chart
fig = px.bar(grouped_df, x='Year-Month', y='Count', title='Distribution of PMID Count Across Months and Years')

# Show the plot
fig.show()

In [None]:
# Stop the execution of the code - prevent the code from running the next cells and saving the data to the file.
raise SystemExit("This cell is intentionally skipped.")

### Save the results to the parquet file

In [None]:
# Get the current timestamp for the filename
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Construct the full output path
filename = f"articles_{current_time}.parquet"
output_path = os.path.join(directory, filename)

# Save DataFrame to Parquet file
df_output_clean.to_parquet(output_path, index=False)

# Get the file size
file_size = os.path.getsize(output_path) / (1024 * 1024)  # Convert to MB

print(f"DataFrame saved to '{output_path}'")
print(f"File size: {file_size:.2f} MB")