# Script for downloading articles from PubMed

In [None]:
import requests

In [None]:
import pandas as pd

In [None]:
from bs4 import BeautifulSoup
import re
from collections import Counter

In [None]:
# Function to perform a PubMed search
def search_pubmed(query, max_results=16000):
    """Search PubMed through the NCBI ESearch endpoint and return PMIDs.

    Args:
        query: Search term. It is passed as a request parameter so that
            characters such as ``&``, ``#`` or ``+`` are URL-encoded
            instead of corrupting the request URL.
        max_results: Value sent as ``retmax`` (upper bound on returned IDs).
            NOTE(review): NCBI may cap this server-side; confirm against
            the E-utilities documentation.

    Returns:
        The ``esearchresult.idlist`` list from the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    print(f"Searching PubMed with query: {query}")
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': max_results,
        'retmode': 'json',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    id_list = response.json()['esearchresult']['idlist']
    print(f"Found {len(id_list)} articles")
    return id_list

# Function to fetch details of a PubMed article
def fetch_article_details(pmid):
    """Fetch the ESummary record for a single PubMed article.

    Args:
        pmid: PubMed identifier, as a string or an integer.

    Returns:
        The entry stored under ``result[<pmid>]`` in the ESummary JSON
        response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        KeyError: If the response has no entry for ``pmid``.
    """
    # JSON object keys are always strings, so normalise integer PMIDs
    # before using the value as a lookup key.
    pmid = str(pmid)
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {'db': 'pubmed', 'id': pmid, 'retmode': 'json'}
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    return response.json()['result'][pmid]

# Function to parse the article details and extract required information
def parse_article_details(article):
    """Build a flat record for one article and scrape its abstract.

    The title, authors and ``elocationid`` come from the summary dict; the
    abstract is scraped from the article's PubMed web page.

    Args:
        article: Summary dict with at least ``title`` and ``uid`` keys.
            ``authors`` (a list of dicts with a ``name`` key) and
            ``elocationid`` are optional.

    Returns:
        Dict with the keys ``Title``, ``Author``, ``DOI`` and ``Abstract``.
        ``DOI`` and ``Abstract`` fall back to ``'N/A'`` when unavailable.

    Raises:
        KeyError: If ``title`` or ``uid`` is missing from ``article``.
        requests.RequestException: If the abstract page cannot be fetched.
    """
    title = article['title']
    # Some records carry no author list; treat that as "no authors".
    authors = ', '.join(author['name'] for author in article.get('authors', []))
    # An empty elocationid is as uninformative as a missing one.
    doi = article.get('elocationid') or 'N/A'

    abstract_url = f"https://pubmed.ncbi.nlm.nih.gov/{article['uid']}/"
    abstract_response = requests.get(abstract_url, timeout=30)
    abstract_soup = BeautifulSoup(abstract_response.text, 'html.parser')
    # Look the element up once and reuse it.
    abstract_div = abstract_soup.find('div', {'class': 'abstract-content selected'})
    abstract = abstract_div.text.strip() if abstract_div is not None else 'N/A'

    return {
        'Title': title,
        'Author': authors,
        'DOI': doi,
        'Abstract': abstract,
    }

# Text mining functions
def extract_research_question(abstract):
    """Return the first sentence that looks like a research question.

    The abstract is split on ``'.'`` and each piece is checked,
    case-insensitively, for one of a fixed set of marker phrases.

    Args:
        abstract: Abstract text.

    Returns:
        The first matching piece with surrounding whitespace removed, or
        ``'N/A'`` when no piece contains a marker phrase.
    """
    markers = ('research question', 'study aim', 'objective', 'this study', 'we investigate')
    matches = (
        piece.strip()
        for piece in abstract.split('.')
        if any(marker in piece.lower() for marker in markers)
    )
    # The generator is lazy, so only the first hit is ever computed.
    return next(matches, 'N/A')

def extract_research_gap(abstract):
    """Return the first sentence that signals a research gap.

    The abstract is split on ``'.'`` and each piece is searched,
    case-insensitively, for a gap-signalling keyword. Keywords must start
    at a word boundary, and ``but`` must be a whole word, so that words
    such as "contributes", "distribution" or "Singapore" no longer count
    as matches. Inflected forms such as "gaps" or "challenges" still match.

    Args:
        abstract: Abstract text.

    Returns:
        The first matching piece with surrounding whitespace removed, or
        ``'N/A'`` when no piece matches.
    """
    # `re` caches compiled patterns, so building it per call is cheap.
    gap_pattern = re.compile(
        r'\b(?:however|but\b|nevertheless|gap|challenge|unknown)'
    )
    for sentence in abstract.split('.'):
        if gap_pattern.search(sentence.lower()):
            return sentence.strip()
    return 'N/A'

def extract_main_findings(abstract):
    """Return the first sentence that reports findings.

    The abstract is split on ``'.'`` and each piece is checked,
    case-insensitively, for one of a fixed set of cue phrases.

    Args:
        abstract: Abstract text.

    Returns:
        The first matching piece with surrounding whitespace removed, or
        ``'N/A'`` when no piece contains a cue phrase.
    """
    cues = ('results', 'findings', 'we found', 'our study shows', 'conclusion')
    for fragment in abstract.split('.'):
        lowered = fragment.lower()
        for cue in cues:
            if cue in lowered:
                return fragment.strip()
    return 'N/A'

# Function to determine if the data is FAIR
def is_fair_data(abstract):
    """Flag an abstract as FAIR based on two data-sharing phrases.

    Args:
        abstract: Abstract text.

    Returns:
        ``'Yes'`` if the lower-cased abstract contains ``'data available'``
        or ``'open data'``, otherwise ``'No'``.
    """
    text = abstract.lower()
    for phrase in ('data available', 'open data'):
        if phrase in text:
            return 'Yes'
    return 'No'

# Function to determine the area of focus
def determine_area_of_focus(abstract):
    """Classify an abstract into a coarse health focus area.

    Checks are case-insensitive and applied in priority order: malaria,
    then HIV, then maternal health.

    Args:
        abstract: Abstract text.

    Returns:
        ``'Malaria'``, ``'HIV'``, ``'Maternal Health'`` or ``'Other'``.
    """
    text = abstract.lower()
    if 'malaria' in text:
        return 'Malaria'
    # Require "hiv" to start a word and not be followed by another letter,
    # so "archived" or "hives" are not mistaken for HIV while "HIV-1",
    # "HIV/AIDS" and "HIV1" still match.
    if re.search(r'\bhiv(?![a-z])', text):
        return 'HIV'
    if 'maternal health' in text:
        return 'Maternal Health'
    return 'Other'

# Function to check if genomic data is available
def is_genomic_data_available(abstract):
    """Report whether the abstract mentions the word fragment "genomic".

    Args:
        abstract: Abstract text.

    Returns:
        ``'Yes'`` if the lower-cased abstract contains ``'genomic'``,
        otherwise ``'No'``.
    """
    if abstract.lower().find('genomic') >= 0:
        return 'Yes'
    return 'No'

# Main function to perform the search and generate the CSV file
def main():
    """Search PubMed, enrich each hit, and write the results to a CSV file.

    For every PMID returned by the search, the article summary is fetched
    and parsed, and the text-mining helpers add derived columns computed
    from the abstract. An article whose download or parsing fails is
    reported and skipped, so one bad record does not discard the articles
    already collected.
    """
    query = 'health open data kenya malaria hiv maternal mental'
    pmids = search_pubmed(query)
    articles = []

    for pmid in pmids:
        try:
            article_details = fetch_article_details(pmid)
            parsed_details = parse_article_details(article_details)
        except (requests.RequestException, KeyError, ValueError) as exc:
            # Network errors, undecodable JSON, or records missing expected
            # keys: report and move on rather than losing the whole run.
            print(f"Skipping PMID {pmid}: {exc!r}")
            continue

        abstract = parsed_details['Abstract']
        # Assignment order determines the column order in the CSV.
        parsed_details['FAIR Data'] = is_fair_data(abstract)
        parsed_details['Area of Focus'] = determine_area_of_focus(abstract)
        parsed_details['Research Question'] = extract_research_question(abstract)
        parsed_details['Research Gap'] = extract_research_gap(abstract)
        parsed_details['Main Findings'] = extract_main_findings(abstract)
        parsed_details['Genomic Data Available'] = is_genomic_data_available(abstract)
        articles.append(parsed_details)

    df = pd.DataFrame(articles)
    csv_filename = 'health_data_kenya_pubmed_refined.csv'
    df.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")

# Run the search-and-export pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
