## Search in PubMed

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
from collections import Counter

# Function to perform a PubMed search
def search_pubmed(query, max_results=16000):
    """Search PubMed through the NCBI E-utilities ``esearch`` endpoint.

    Args:
        query: Free-text or fielded PubMed search expression.
        max_results: Value sent as ``retmax`` (upper bound on returned IDs).
            NOTE(review): NCBI documents a cap of 10,000 PubMed records per
            esearch call, so larger values may be truncated - confirm.

    Returns:
        list[str]: PubMed IDs matching the query (empty if none were returned).

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
    """
    print(f"Searching PubMed with query: {query}")
    # Pass the parameters separately so that requests percent-encodes the query;
    # characters such as '&', '#' or '+' would otherwise corrupt the URL.
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": "pubmed",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()
    id_list = response.json().get('esearchresult', {}).get('idlist', [])
    print(f"Found {len(id_list)} articles")
    return id_list

# Function to fetch details of a PubMed article
def fetch_article_details(pmid):
    """Fetch the E-utilities ``esummary`` record for one PubMed article.

    Args:
        pmid: PubMed ID as a string or an int.

    Returns:
        dict: The summary record stored under the PMID in the JSON ``result``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
        KeyError: If the response holds no record for ``pmid``.
    """
    # JSON object keys are strings, so an int PMID could never match the lookup below.
    pmid = str(pmid)
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
        params={"db": "pubmed", "id": pmid, "retmode": "json"},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()['result'][pmid]

# Function to parse the article details and extract required information
def parse_article_details(article):
    """Build a flat record for one article and scrape its abstract from PubMed.

    Args:
        article: esummary record; must contain ``title`` and ``uid``. ``authors``
            (a list of dicts with a ``name`` key) and ``elocationid`` are optional.

    Returns:
        dict: Keys ``Title``, ``Author``, ``DOI`` and ``Abstract``. Missing values
        are reported as ``'N/A'``.

    Raises:
        KeyError: If ``title`` or ``uid`` is missing.
        requests.RequestException: If the abstract page cannot be fetched.
    """
    title = article['title']
    # Some records carry no author list; fall back to 'N/A' instead of raising.
    authors = ', '.join(author['name'] for author in article.get('authors', [])) or 'N/A'
    doi = article.get('elocationid', 'N/A')

    abstract_url = f"https://pubmed.ncbi.nlm.nih.gov/{article['uid']}/"
    abstract_response = requests.get(abstract_url, timeout=30)
    abstract_soup = BeautifulSoup(abstract_response.text, 'html.parser')
    # Look the node up once instead of searching the document twice.
    abstract_node = abstract_soup.find('div', {'class': 'abstract-content selected'})
    abstract = abstract_node.text.strip() if abstract_node else 'N/A'

    return {
        'Title': title,
        'Author': authors,
        'DOI': doi,
        'Abstract': abstract
    }

# Text mining functions
def extract_research_question(abstract):
    """Return the first sentence of ``abstract`` that looks like a study aim.

    The abstract is split on '.', and the first fragment containing one of the
    cue phrases (case-insensitive substring match) is returned with surrounding
    whitespace removed. Returns ``'N/A'`` when no fragment matches.
    """
    cue_phrases = ('research question', 'study aim', 'objective', 'this study', 'we investigate')
    matching_fragments = (
        fragment.strip()
        for fragment in abstract.split('.')
        if any(cue in fragment.lower() for cue in cue_phrases)
    )
    return next(matching_fragments, 'N/A')

def extract_research_gap(abstract):
    """Return the first sentence of ``abstract`` that signals a research gap.

    The abstract is split on '.'. A sentence matches when it contains the word
    "but", or a word starting with "however", "nevertheless", "gap",
    "challenge" or "unknown" (case-insensitive). Matching on word boundaries
    prevents hits inside unrelated words such as "contribute" or "distribution".

    Returns:
        str: The matching sentence, stripped, or ``'N/A'`` when none matches.
    """
    cue_pattern = re.compile(
        r'\b(?:however|but\b|nevertheless|gap|challenge|unknown)',
        re.IGNORECASE,
    )
    for sentence in abstract.split('.'):
        if cue_pattern.search(sentence):
            return sentence.strip()
    return 'N/A'

def extract_main_findings(abstract):
    """Return the first sentence of ``abstract`` that reports a result.

    Sentences are obtained by splitting on '.'; the first one containing a cue
    phrase (case-insensitive substring match) is returned stripped. Returns
    ``'N/A'`` when nothing matches.
    """
    cue_phrases = ('results', 'findings', 'we found', 'our study shows', 'conclusion')
    matching_fragments = (
        fragment.strip()
        for fragment in abstract.split('.')
        if any(cue in fragment.lower() for cue in cue_phrases)
    )
    return next(matching_fragments, 'N/A')

# Function to determine if the data is FAIR
def is_fair_data(abstract):
    """Return ``'Yes'`` if the abstract mentions 'data available' or 'open data'.

    Matching is a case-insensitive substring test; otherwise ``'No'`` is returned.
    """
    lowered = abstract.lower()
    for phrase in ('data available', 'open data'):
        if phrase in lowered:
            return 'Yes'
    return 'No'

# Function to determine the area of focus
def determine_area_of_focus(abstract):
    """Classify an abstract as 'Malaria', 'HIV', 'Maternal Health' or 'Other'.

    Categories are checked in that order and the first hit wins. "HIV" is
    recognised either as the upper-case acronym anywhere in the text (so
    "PLHIV" and "HIV-1" count) or as the stand-alone word in any case. A plain
    case-insensitive substring test would also fire on words such as
    "archived".

    Returns:
        str: One of ``'Malaria'``, ``'HIV'``, ``'Maternal Health'``, ``'Other'``.
    """
    lowered = abstract.lower()
    if 'malaria' in lowered:
        return 'Malaria'
    if 'HIV' in abstract or re.search(r'\bhiv\b', lowered):
        return 'HIV'
    if 'maternal health' in lowered:
        return 'Maternal Health'
    return 'Other'

# Function to check if genomic data is available
def is_genomic_data_available(abstract):
    """Return ``'Yes'`` when 'genomic' occurs in the abstract (any case), else ``'No'``."""
    mentions_genomics = abstract.lower().find('genomic') != -1
    if mentions_genomics:
        return 'Yes'
    return 'No'

# Main function to perform the search and generate the CSV file
def main():
    """Run the PubMed search, annotate every hit, and write the results to CSV.

    For each PMID returned by ``search_pubmed`` the summary is fetched and
    parsed, six abstract-derived columns are added, and the collected rows are
    written to ``health_data_kenya_pubmed_refined.csv`` without the index.
    """
    csv_filename = 'health_data_kenya_pubmed_refined.csv'
    rows = []

    for pmid in search_pubmed('health open data kenya malaria hiv maternal mental'):
        row = parse_article_details(fetch_article_details(pmid))
        abstract_text = row['Abstract']
        # Key order here determines the column order of the CSV.
        row.update({
            'FAIR Data': is_fair_data(abstract_text),
            'Area of Focus': determine_area_of_focus(abstract_text),
            'Research Question': extract_research_question(abstract_text),
            'Research Gap': extract_research_gap(abstract_text),
            'Main Findings': extract_main_findings(abstract_text),
            'Genomic Data Available': is_genomic_data_available(abstract_text),
        })
        rows.append(row)

    pd.DataFrame(rows).to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")


if __name__ == '__main__':
    main()


Searching PubMed with query: health open data kenya malaria hiv maternal mental
Found 0 articles
Data saved to health_data_kenya_pubmed_refined.csv


## Search in Google Scholar

In [2]:
import time
import pandas as pd
from scholarly import scholarly

# Function to search Google Scholar
def search_scholar(query, max_results=10000):
    """Collect up to ``max_results`` publications from a Google Scholar search.

    Results are pulled one at a time from ``scholarly.search_pubs`` with a
    5-second pause after each, stopping early when the result iterator is
    exhausted.

    Returns:
        list: The publication objects yielded by ``scholarly``.
    """
    print(f"Searching Google Scholar with query: {query}")
    result_iterator = scholarly.search_pubs(query)
    collected = []
    for _ in range(max_results):
        try:
            publication = next(result_iterator)
        except StopIteration:
            break
        else:
            collected.append(publication)
            time.sleep(5)  # fixed pause between consecutive fetches
    print(f"Found {len(collected)} articles")
    return collected

# Function to parse the article details and extract required information
def parse_article_details(article):
    """Extract title, authors, DOI and abstract from a scholarly publication.

    Args:
        article: Object exposing a ``bib`` dict.

    Returns:
        dict: Keys ``Title``, ``Author``, ``DOI`` and ``Abstract``. Missing or
        empty values are reported as ``'N/A'``.
    """
    bib = article.bib
    title = bib.get('title', 'N/A')
    print(f"Parsing details for article: {title}")

    raw_authors = bib.get('author')
    if isinstance(raw_authors, str):
        # Joining a plain string would put ', ' between every character.
        # Presumably a string here is BibTeX-style "A and B" - verify against
        # the installed scholarly version.
        names = [name.strip() for name in raw_authors.split(' and ') if name.strip()]
    else:
        names = list(raw_authors or [])
    authors = ', '.join(names) or 'N/A'

    doi = bib.get('doi', 'N/A')
    # Guard against an explicit None so later .lower()/.split() calls do not fail.
    abstract = bib.get('abstract') or 'N/A'
    return {
        'Title': title,
        'Author': authors,
        'DOI': doi,
        'Abstract': abstract
    }

# Text mining functions
def extract_research_question(abstract):
    """Pick out the first '.'-delimited sentence that mentions a study aim.

    A sentence qualifies when its lower-cased text contains any cue phrase.
    The sentence is returned stripped; ``'N/A'`` is returned when none qualifies.
    """
    cues = ('research question', 'study aim', 'objective', 'this study', 'we investigate')
    for raw_sentence in abstract.split('.'):
        lowered = raw_sentence.lower()
        for cue in cues:
            if cue in lowered:
                return raw_sentence.strip()
    return 'N/A'

def extract_research_gap(abstract):
    """Return the first sentence of ``abstract`` that signals a research gap.

    The abstract is split on '.'. A sentence matches when it contains the word
    "but", or a word starting with "however", "nevertheless", "gap",
    "challenge" or "unknown" (case-insensitive). Matching on word boundaries
    prevents hits inside unrelated words such as "contribute" or "distribution".

    Returns:
        str: The matching sentence, stripped, or ``'N/A'`` when none matches.
    """
    import re  # local import: this cell's import block does not pull in re

    cue_pattern = re.compile(
        r'\b(?:however|but\b|nevertheless|gap|challenge|unknown)',
        re.IGNORECASE,
    )
    for sentence in abstract.split('.'):
        if cue_pattern.search(sentence):
            return sentence.strip()
    return 'N/A'

def extract_main_findings(abstract):
    """Pick out the first '.'-delimited sentence that reports a result.

    A sentence qualifies when its lower-cased text contains any cue phrase.
    The sentence is returned stripped; ``'N/A'`` is returned when none qualifies.
    """
    cues = ('results', 'findings', 'we found', 'our study shows', 'conclusion')
    for raw_sentence in abstract.split('.'):
        lowered = raw_sentence.lower()
        for cue in cues:
            if cue in lowered:
                return raw_sentence.strip()
    return 'N/A'

# Function to determine if the data is FAIR
def is_fair_data(abstract):
    """Report ``'Yes'`` when the abstract contains 'data available' or 'open data'.

    The comparison ignores case; any other text yields ``'No'``.
    """
    text = abstract.lower()
    found = any(marker in text for marker in ('data available', 'open data'))
    return {True: 'Yes', False: 'No'}[found]

# Function to determine the area of focus
def determine_area_of_focus(abstract):
    """Classify an abstract as 'Malaria', 'HIV', 'Maternal Health' or 'Other'.

    Categories are checked in that order and the first hit wins. "HIV" is
    recognised either as the upper-case acronym anywhere in the text (so
    "PLHIV" and "HIV-1" count) or as the stand-alone word in any case. A plain
    case-insensitive substring test would also fire on words such as
    "archived".

    Returns:
        str: One of ``'Malaria'``, ``'HIV'``, ``'Maternal Health'``, ``'Other'``.
    """
    import re  # local import: this cell's import block does not pull in re

    lowered = abstract.lower()
    if 'malaria' in lowered:
        return 'Malaria'
    if 'HIV' in abstract or re.search(r'\bhiv\b', lowered):
        return 'HIV'
    if 'maternal health' in lowered:
        return 'Maternal Health'
    return 'Other'

# Function to check if genomic data is available
def is_genomic_data_available(abstract):
    """Report ``'Yes'`` when the abstract contains 'genomic' in any case, else ``'No'``."""
    flags = ('No', 'Yes')
    return flags['genomic' in abstract.lower()]

# Main function to perform the search and generate the CSV file
def main():
    """Search Google Scholar, annotate every hit, and write the results to CSV.

    Each publication returned by ``search_scholar`` is parsed, six
    abstract-derived columns are added, and the rows are written to
    ``health_data_kenya_google_scholar1.csv`` without the index.
    """
    query = 'Health AND Kenya AND (open data OR Malaria OR HIV OR Maternal health)'
    csv_filename = 'health_data_kenya_google_scholar1.csv'
    records = []

    for publication in search_scholar(query):
        record = parse_article_details(publication)
        abstract_text = record['Abstract']
        # Key order here determines the column order of the CSV.
        record.update({
            'FAIR Data': is_fair_data(abstract_text),
            'Area of Focus': determine_area_of_focus(abstract_text),
            'Research Question': extract_research_question(abstract_text),
            'Research Gap': extract_research_gap(abstract_text),
            'Main Findings': extract_main_findings(abstract_text),
            'Genomic Data Available': is_genomic_data_available(abstract_text),
        })
        records.append(record)

    pd.DataFrame(records).to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")


if __name__ == '__main__':
    main()


Searching Google Scholar with query: Health AND Kenya AND (open data OR Malaria OR HIV OR Maternal health)


MaxTriesExceededException: Cannot Fetch from Google Scholar.

## Refined PubMed search

In [6]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import quote

#requests: for making HTTP requests.
#pandas: for data manipulation and analysis.
#BeautifulSoup: from the bs4 library, for parsing HTML and extracting data.
#quote: from urllib.parse, for URL encoding.

# Function to perform a PubMed search
def search_pubmed(query, max_results=50):
    """Search PubMed through the NCBI E-utilities ``esearch`` endpoint.

    Every failure mode (network error, non-200 status, malformed payload) is
    reported with ``print`` and results in an empty list, so callers can simply
    iterate over the return value.

    Args:
        query: PubMed search expression; it is percent-encoded before use.
        max_results: Value sent as ``retmax`` (upper bound on returned IDs).

    Returns:
        list[str]: Matching PubMed IDs, or ``[]`` on any error.
    """
    print(f"Searching PubMed with query: {query}")
    encoded_query = quote(query)  # quotes, parentheses and spaces must be URL-safe
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={encoded_query}&retmax={max_results}&retmode=json"

    try:
        response = requests.get(url, timeout=30)  # never hang on a stalled connection
    except requests.RequestException as exc:
        print("Error fetching search results:", exc)
        return []

    if response.status_code != 200:
        print("Error fetching search results:", response.status_code)
        return []

    try:
        data = response.json()
    except ValueError:  # body was not valid JSON
        print("Error fetching search results: response was not valid JSON")
        return []

    id_list = data.get('esearchresult', {}).get('idlist')
    if id_list is None:
        print("No articles found.")
        return []

    print(f"Found {len(id_list)} articles")
    return id_list

# Function to fetch details of a PubMed article
def fetch_article_details(pmid):
    """Fetch the E-utilities ``esummary`` record for one PubMed article.

    Args:
        pmid: PubMed ID as a string or an int.

    Returns:
        dict | None: The summary record stored under the PMID, or ``None`` when
        the request fails, the payload is not JSON, or no record is present.
    """
    # JSON object keys are strings, so an int PMID could never match the lookup below.
    pmid = str(pmid)
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=json"

    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Error fetching article details for PMID {pmid}:", exc)
        return None

    if response.status_code != 200:
        # The identifier is a PMID (PubMed), not a PMCID (PubMed Central).
        print(f"Error fetching article details for PMID {pmid}:", response.status_code)
        return None

    try:
        data = response.json()
    except ValueError:  # body was not valid JSON
        print(f"Error fetching article details for PMID {pmid}: response was not valid JSON")
        return None

    # Error payloads may lack 'result' entirely; treat that like a missing record.
    return data.get('result', {}).get(pmid)

# Function to parse the article details and extract required information
def parse_article_details(article):
    """Turn an esummary record into a flat dict and scrape its PubMed page.

    The abstract is read from the ``div`` with class
    ``abstract-content selected`` on the article page. The ``Open Source``
    flag is ``'Yes'`` when the page contains an ``a`` element with class
    ``linkout-link``.
    NOTE(review): whether that link really indicates open access is not
    verifiable from this code - confirm.

    Args:
        article: Dict with ``title`` and ``uid``; ``authors`` and
            ``elocationid`` are optional.

    Returns:
        dict: Keys ``Title``, ``Author``, ``DOI``, ``Abstract``, ``Open Source``.
    """
    record = {'Title': article['title']}

    if 'authors' in article:
        record['Author'] = ', '.join(entry['name'] for entry in article['authors'])
    else:
        record['Author'] = 'N/A'

    record['DOI'] = article.get('elocationid', 'N/A')

    page = requests.get(f"https://pubmed.ncbi.nlm.nih.gov/{article['uid']}/")
    soup = BeautifulSoup(page.text, 'html.parser')

    abstract_node = soup.find('div', {'class': 'abstract-content selected'})
    if abstract_node:
        record['Abstract'] = abstract_node.text.strip()
    else:
        record['Abstract'] = 'N/A'

    linkout = soup.find('a', {'class': 'linkout-link'})
    record['Open Source'] = 'Yes' if linkout else 'No'

    return record

# Text mining functions to handle structured abstracts
def extract_section(abstract, section_keywords):
    """Return the first keyword-bearing sentence plus the sentence after it.

    The abstract is split on '.'. A sentence matches when any keyword occurs
    in it at the start of a word (case-insensitive), so "but" no longer
    matches inside "contribute" while "gap" still matches "gaps". Fragments
    are stripped before joining, so the result never contains a double space.
    Empty fragments are dropped, so there is no trailing period.

    Args:
        abstract: Abstract text.
        section_keywords: Iterable of cue words or phrases.

    Returns:
        str: Up to two sentences joined by '. ', or ``'N/A'`` if nothing matches.
    """
    import re  # local import: this cell's import block does not pull in re

    sentences = [fragment.strip() for fragment in abstract.split('.')]
    patterns = [
        re.compile(r'(?<!\w)' + re.escape(keyword), re.IGNORECASE)
        for keyword in section_keywords
    ]
    for i, sentence in enumerate(sentences):
        if any(pattern.search(sentence) for pattern in patterns):
            return '. '.join(part for part in sentences[i:i+2] if part)
    return 'N/A'

def extract_research_question(abstract):
    """Delegate to ``extract_section`` with the cue phrases for a study aim."""
    return extract_section(
        abstract,
        ['research question', 'study aim', 'objective', 'this study', 'we investigate'],
    )

def extract_research_gap(abstract):
    """Delegate to ``extract_section`` with the cue words for a research gap."""
    return extract_section(
        abstract,
        ['however', 'but', 'nevertheless', 'gap', 'challenge', 'unknown'],
    )

def extract_main_findings(abstract):
    """Delegate to ``extract_section`` with the cue phrases for reported results."""
    return extract_section(
        abstract,
        ['results', 'findings', 'we found', 'our study shows', 'conclusion'],
    )

# Function to determine if the data is FAIR
def is_fair_data(abstract):
    """Return ``'Yes'`` when the abstract suggests FAIR or openly available data.

    The acronym "FAIR" is matched case-sensitively as a whole word, and
    lower-case "fair" only counts when followed by "data" or "principle(s)".
    Lower-casing the acronym and substring-matching it would flag every
    abstract containing "affairs", "fairly" or "unfair". The remaining phrases
    are case-insensitive substring matches.

    Returns:
        str: ``'Yes'`` or ``'No'``.
    """
    import re  # local import: this cell's import block does not pull in re

    if re.search(r'\bFAIR\b', abstract):
        return 'Yes'
    lowered = abstract.lower()
    if re.search(r'\bfair\s+(?:data|principles?)\b', lowered):
        return 'Yes'
    availability_phrases = (
        'data available',
        'open data',
        'accessible data',
        'interoperable data',
        'reusable data',
    )
    return 'Yes' if any(phrase in lowered for phrase in availability_phrases) else 'No'

# Function to determine the area of focus
def determine_area_of_focus(abstract):
    """Classify an abstract by health topic.

    Categories are checked in the order Malaria, HIV, Maternal Health, Mental
    Health; the first hit wins and ``'Other'`` is the fallback. "HIV" is
    recognised as the upper-case acronym anywhere (so "PLHIV" counts) or as a
    stand-alone word in any case. "mental" must be a whole word. Plain
    substring tests would misfile "archived" as HIV and "environmental" or
    "developmental" as Mental Health.

    Returns:
        str: ``'Malaria'``, ``'HIV'``, ``'Maternal Health'``,
        ``'Mental Health'`` or ``'Other'``.
    """
    import re  # local import: this cell's import block does not pull in re

    lowered = abstract.lower()
    if 'malaria' in lowered:
        return 'Malaria'
    if 'HIV' in abstract or re.search(r'\bhiv\b', lowered):
        return 'HIV'
    if 'maternal' in lowered:  # also covers the phrase "maternal health"
        return 'Maternal Health'
    if re.search(r'\bmental\b', lowered):
        return 'Mental Health'
    return 'Other'

# Function to check if genomic data is available
def is_genomic_data_available(abstract):
    """Return ``'Yes'`` when the abstract mentions genomic or sequencing data.

    "DNA" and "RNA" are matched as upper-case acronyms anywhere (so "mRNA"
    and "cDNA" count) or as stand-alone words in any case. "sequenc..." must
    start a word and "illumina" must be a whole word. Plain case-insensitive
    substring tests would flag "maternal", "international" and "journal"
    (which contain "rna") and "consequence" (which contains "sequence").

    Returns:
        str: ``'Yes'`` or ``'No'``.
    """
    import re  # local import: this cell's import block does not pull in re

    if 'DNA' in abstract or 'RNA' in abstract:
        return 'Yes'
    genomic_pattern = r'genomic|\bsequenc|amplicons|\billumina\b|\b(?:dna|rna)\b'
    return 'Yes' if re.search(genomic_pattern, abstract.lower()) else 'No'

# Main function to perform the search and generate the CSV file
def main():
    """Run the refined PubMed search, annotate every hit, and write a CSV.

    At most 100 PMIDs are requested. PMIDs whose summary could not be fetched
    are skipped. The remaining records get six abstract-derived columns and
    are written to ``health_data_kenya_pubmed_refined.csv`` without the index.
    """
    query = '("health" AND "open data" AND "Kenya") OR "malaria" OR "HIV" OR "maternal health" OR "mental health"'
    csv_filename = 'health_data_kenya_pubmed_refined.csv'
    rows = []

    # Limit results for testing
    for pmid in search_pubmed(query, max_results=100):
        summary = fetch_article_details(pmid)
        if not summary:
            continue
        row = parse_article_details(summary)
        abstract_text = row['Abstract']
        # Key order here determines the column order of the CSV.
        row.update({
            'FAIR Data': is_fair_data(abstract_text),
            'Area of Focus': determine_area_of_focus(abstract_text),
            'Research Question': extract_research_question(abstract_text),
            'Research Gap': extract_research_gap(abstract_text),
            'Main Findings': extract_main_findings(abstract_text),
            'Genomic Data Available': is_genomic_data_available(abstract_text),
        })
        rows.append(row)

    pd.DataFrame(rows).to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")


if __name__ == '__main__':
    main()


Searching PubMed with query: ("health" AND "open data" AND "Kenya") OR "malaria" OR "HIV" OR "maternal health" OR "mental health"
Found 100 articles
Data saved to health_data_kenya_pubmed_refined.csv


## Refined Google Scholar search

In [17]:
import time
import random
import pandas as pd
from scholarly import scholarly

# Function to search Google Scholar
def search_scholar(query, max_results=50, max_retries=3):
    """Collect up to ``max_results`` publications from a Google Scholar search.

    Results are pulled one at a time with a random 5-10 s pause after each.
    When fetching raises, the function waits 60 s and restarts the search. The
    restarted iterator begins at the first hit again, so the results already
    collected are skipped rather than appended a second time.
    NOTE(review): skipping assumes the restarted search returns hits in the
    same order - confirm.

    Args:
        query: Google Scholar search string.
        max_results: Maximum number of publications to return.
        max_retries: Number of restarts allowed before giving up and returning
            what has been collected so far.

    Returns:
        list: The publication objects yielded by ``scholarly``.
    """
    print(f"Searching Google Scholar with query: {query}")
    search_query = scholarly.search_pubs(query)
    articles = []
    retries = 0
    to_skip = 0  # hits to discard after a restart because they are already in `articles`

    while len(articles) < max_results:
        try:
            article = next(search_query)
        except StopIteration:
            break
        except Exception as e:
            retries += 1
            print(f"Exception encountered: {e}")
            if retries > max_retries:
                print("Giving up after repeated failures.")
                break
            print("Waiting before retrying...")
            time.sleep(60)  # Wait for a minute before retrying
            search_query = scholarly.search_pubs(query)  # Re-initialize the search
            to_skip = len(articles)
            continue

        if to_skip:
            to_skip -= 1
            continue

        articles.append(article)
        time.sleep(random.uniform(5, 10))  # Adding a random delay between 5 and 10 seconds

    print(f"Found {len(articles)} articles")
    return articles

# Function to parse the article details and extract required information
def parse_article_details(article):
    """Extract title, authors, DOI, abstract and a link flag from a publication.

    ``Open Source`` is ``'Yes'`` when the ``url`` entry of ``bib`` contains
    '.pdf' or '.html', otherwise ``'No'``.

    Args:
        article: Object exposing a ``bib`` dict.

    Returns:
        dict: Keys ``Title``, ``Author``, ``DOI``, ``Abstract`` and
        ``Open Source``. Missing or empty text values are reported as ``'N/A'``.
    """
    bib = article.bib
    title = bib.get('title', 'N/A')
    print(f"Parsing details for article: {title}")

    raw_authors = bib.get('author')
    if isinstance(raw_authors, str):
        # Joining a plain string would put ', ' between every character.
        # Presumably a string here is BibTeX-style "A and B" - verify against
        # the installed scholarly version.
        names = [name.strip() for name in raw_authors.split(' and ') if name.strip()]
    else:
        names = list(raw_authors or [])
    authors = ', '.join(names) or 'N/A'

    doi = bib.get('doi', 'N/A')
    # Guard against an explicit None so later .lower()/.split() calls do not fail.
    abstract = bib.get('abstract') or 'N/A'

    url = bib.get('url') or ''
    open_source = 'Yes' if ('.pdf' in url or '.html' in url) else 'No'

    return {
        'Title': title,
        'Author': authors,
        'DOI': doi,
        'Abstract': abstract,
        'Open Source': open_source
    }

# Text mining functions to handle structured abstracts
def extract_section(abstract, section_keywords):
    """Return the first keyword-bearing sentence plus the sentence after it.

    The abstract is split on '.'. A sentence matches when any keyword occurs
    in it at the start of a word (case-insensitive), so "but" no longer
    matches inside "contribute" while "gap" still matches "gaps". Fragments
    are stripped before joining, so the result never contains a double space.
    Empty fragments are dropped, so there is no trailing period.

    Args:
        abstract: Abstract text.
        section_keywords: Iterable of cue words or phrases.

    Returns:
        str: Up to two sentences joined by '. ', or ``'N/A'`` if nothing matches.
    """
    import re  # local import: this cell's import block does not pull in re

    sentences = [fragment.strip() for fragment in abstract.split('.')]
    patterns = [
        re.compile(r'(?<!\w)' + re.escape(keyword), re.IGNORECASE)
        for keyword in section_keywords
    ]
    for i, sentence in enumerate(sentences):
        if any(pattern.search(sentence) for pattern in patterns):
            return '. '.join(part for part in sentences[i:i+2] if part)
    return 'N/A'

def extract_research_question(abstract):
    """Look up the study-aim passage by handing the aim cue phrases to ``extract_section``."""
    aim_cues = ['research question', 'study aim', 'objective', 'this study', 'we investigate']
    passage = extract_section(abstract, aim_cues)
    return passage

def extract_research_gap(abstract):
    """Look up the research-gap passage by handing the gap cue words to ``extract_section``."""
    gap_cues = ['however', 'but', 'nevertheless', 'gap', 'challenge', 'unknown']
    passage = extract_section(abstract, gap_cues)
    return passage

def extract_main_findings(abstract):
    """Look up the findings passage by handing the result cue phrases to ``extract_section``."""
    result_cues = ['results', 'findings', 'we found', 'our study shows', 'conclusion']
    passage = extract_section(abstract, result_cues)
    return passage

# Function to determine if the data is FAIR
def is_fair_data(abstract):
    """Return ``'Yes'`` when the abstract suggests FAIR or openly available data.

    The acronym "FAIR" is matched case-sensitively as a whole word, and
    lower-case "fair" only counts when followed by "data" or "principle(s)".
    Lower-casing the acronym and substring-matching it would flag every
    abstract containing "affairs", "fairly" or "unfair". The remaining phrases
    are case-insensitive substring matches.

    Returns:
        str: ``'Yes'`` or ``'No'``.
    """
    import re  # local import: this cell's import block does not pull in re

    if re.search(r'\bFAIR\b', abstract):
        return 'Yes'
    lowered = abstract.lower()
    if re.search(r'\bfair\s+(?:data|principles?)\b', lowered):
        return 'Yes'
    availability_phrases = (
        'data available',
        'open data',
        'accessible data',
        'interoperable data',
        'reusable data',
    )
    return 'Yes' if any(phrase in lowered for phrase in availability_phrases) else 'No'

# Function to determine the area of focus
def determine_area_of_focus(abstract):
    """Classify an abstract by health topic.

    Categories are checked in the order Malaria, HIV, Maternal Health, Mental
    Health; the first hit wins and ``'Other'`` is the fallback. "HIV" is
    recognised as the upper-case acronym anywhere (so "PLHIV" counts) or as a
    stand-alone word in any case. "mental" must be a whole word. Plain
    substring tests would misfile "archived" as HIV and "environmental" or
    "developmental" as Mental Health.

    Returns:
        str: ``'Malaria'``, ``'HIV'``, ``'Maternal Health'``,
        ``'Mental Health'`` or ``'Other'``.
    """
    import re  # local import: this cell's import block does not pull in re

    lowered = abstract.lower()
    if 'malaria' in lowered:
        return 'Malaria'
    if 'HIV' in abstract or re.search(r'\bhiv\b', lowered):
        return 'HIV'
    if 'maternal' in lowered:  # also covers the phrase "maternal health"
        return 'Maternal Health'
    if re.search(r'\bmental\b', lowered):
        return 'Mental Health'
    return 'Other'

# Function to check if genomic data is available
def is_genomic_data_available(abstract):
    """Return ``'Yes'`` when the abstract mentions genomic or sequencing data.

    "DNA" and "RNA" are matched as upper-case acronyms anywhere (so "mRNA"
    and "cDNA" count) or as stand-alone words in any case. "sequenc..." must
    start a word and "illumina" must be a whole word. Plain case-insensitive
    substring tests would flag "maternal", "international" and "journal"
    (which contain "rna") and "consequence" (which contains "sequence").

    Returns:
        str: ``'Yes'`` or ``'No'``.
    """
    import re  # local import: this cell's import block does not pull in re

    if 'DNA' in abstract or 'RNA' in abstract:
        return 'Yes'
    genomic_pattern = r'genomic|\bsequenc|amplicons|\billumina\b|\b(?:dna|rna)\b'
    return 'Yes' if re.search(genomic_pattern, abstract.lower()) else 'No'

# Main function to perform the search and generate the CSV file
def main():
    """Search Google Scholar, annotate every hit, and write the results to CSV.

    Each publication returned by ``search_scholar`` is parsed, six
    abstract-derived columns are added, and the rows are written to
    ``health_data_kenya_google_scholar_refined.csv`` without the index.
    """
    query = 'Health AND Kenya AND (open data OR Malaria OR HIV OR Maternal health)'
    csv_filename = 'health_data_kenya_google_scholar_refined.csv'
    records = []

    for publication in search_scholar(query):
        record = parse_article_details(publication)
        abstract_text = record['Abstract']
        # Key order here determines the column order of the CSV.
        record.update({
            'FAIR Data': is_fair_data(abstract_text),
            'Area of Focus': determine_area_of_focus(abstract_text),
            'Research Question': extract_research_question(abstract_text),
            'Research Gap': extract_research_gap(abstract_text),
            'Main Findings': extract_main_findings(abstract_text),
            'Genomic Data Available': is_genomic_data_available(abstract_text),
        })
        records.append(record)

    pd.DataFrame(records).to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")


if __name__ == '__main__':
    main()


Searching Google Scholar with query: Health AND Kenya AND (open data OR Malaria OR HIV OR Maternal health)


MaxTriesExceededException: Cannot Fetch from Google Scholar.