In [2]:
import csv
import http.client
import time
from Bio import Entrez

# Contact address sent with every E-utilities request (NCBI requires one)
Entrez.email = "ddimopoulos@aegean.gr"

# Search vocabulary: every term of the first list is paired with every term
# of the second list to form one PubMed query per pair.
keyword_list_1 = [
    "machine learning",
    "artificial intelligence",
    "deep learning",
    "reinforcement learning",
    "neural network",
]
keyword_list_2 = [
    "venous thrombosis",
    "venous thromboembolism",
    "pulmonary embolism",
    "deep vein thrombosis",
]

# Publication-date window (inclusive), kept as strings because they are
# interpolated directly into the query text.
start_year, end_year = "1980", "2023"

# Paging / throttling knobs
batch_size = 100   # records requested per search page
num_retries = 3    # attempts per PMID before moving on
delay = 1          # seconds to wait between API requests

# Accumulator for the extracted rows (one list per paper)
paper_details = []

# Search and retrieve paper details for each keyword combination.
#
# The work is split into three small helpers (search one page, fetch one
# record with retries, parse one MEDLINE record) followed by the driver loop.


def _search_pubmed(term, retstart):
    """Run one PubMed ``esearch`` page and return the parsed result.

    Args:
        term: Complete PubMed query string.
        retstart: Zero-based index of the first hit to return.

    Returns:
        The object produced by ``Entrez.read``; the driver loop reads its
        ``"Count"`` and ``"IdList"`` entries.
    """
    # No retmode is passed: Entrez.read parses the XML that esearch returns
    # by default.
    handle = Entrez.esearch(db="pubmed", term=term, retstart=retstart, retmax=batch_size)
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def _fetch_medline(pmid):
    """Fetch the MEDLINE-format text of one PubMed record.

    The request is attempted up to ``num_retries`` times; each failed attempt
    is followed by a ``delay``-second pause so retries are throttled too.

    Args:
        pmid: PubMed identifier of the record to fetch.

    Returns:
        The record text, or ``None`` when every attempt ended in
        ``http.client.IncompleteRead``.
    """
    for attempt in range(1, num_retries + 1):
        try:
            record_handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
            try:
                return record_handle.read()
            finally:
                record_handle.close()
        except http.client.IncompleteRead as e:
            print("IncompleteRead error occurred:", str(e))
            if attempt < num_retries:
                print("Retrying request for PMID:", pmid)
            time.sleep(delay)

    print(f"Giving up on PMID {pmid} after {num_retries} failed attempts")
    return None


def _parse_medline_record(paper_record):
    """Extract the fields of interest from one MEDLINE-format record.

    A MEDLINE line is a tag padded to four characters, a dash, and the value
    (``"TI  - Some title"``); lines starting with a space continue the value
    of the preceding tag.

    Args:
        paper_record: Text of a single record as returned by ``efetch``.

    Returns:
        A tuple ``(title, authors, year, doi, tags, abstract)`` where
        ``authors`` and ``tags`` are lists of strings and the other items are
        strings (empty when the field is absent).
    """
    fields = {}      # tag -> list of values, in order of appearance
    current = None   # value list of the tag currently being continued

    for line in paper_record.splitlines():
        if line.startswith(" "):
            # Continuation line: join with a single space so words at the
            # line break are not fused together.
            text = line.strip()
            if current is not None and text:
                current[-1] = f"{current[-1]} {text}".strip()
        elif line[4:5] == "-":
            # Slice off the tag by position. str.lstrip("TI  - ") would strip
            # a *set of characters* and eat leading letters of the value.
            current = fields.setdefault(line[:4].strip(), [])
            current.append(line[5:].strip())
        else:
            # Blank or unrecognised line ends any running continuation.
            current = None

    title = fields.get("TI", [""])[0]
    abstract = fields.get("AB", [""])[0]
    authors = fields.get("AU", [])
    tags = fields.get("OT", [])

    # DP holds the full publication date (e.g. "2023 Jun 15"); keep only the
    # leading four-digit year, falling back to the raw value otherwise.
    published = fields.get("DP", [""])[0]
    year = published[:4] if published[:4].isdigit() else published

    # LID/AID may carry several identifiers ([doi], [pii], ...); only the one
    # qualified as [doi] belongs in the DOI column.
    doi = ""
    for value in fields.get("LID", []) + fields.get("AID", []):
        if value.endswith("[doi]"):
            doi = value[: -len("[doi]")].strip()
            break

    return title, authors, year, doi, tags, abstract


for keyword_1 in keyword_list_1:
    for keyword_2 in keyword_list_2:
        # Combine the two keywords with AND to retrieve papers that match both
        query = f'{keyword_1} AND {keyword_2}'

        # Append the publication years to the query
        query += f' AND ({start_year}[PDAT] : {end_year}[PDAT])'

        # The first page supplies both the hit count and the first batch of
        # PMIDs, so it is reused below instead of being requested twice.
        first_page = _search_pubmed(query, 0)
        total_records = int(first_page["Count"])

        # Calculate the number of batches (pages) required, limiting it to 30
        num_batches = min((total_records - 1) // batch_size + 1, 30)

        print(f"Total records for query '{query}': {total_records}")

        # Retrieve paper details for each batch (page)
        for batch in range(num_batches):
            if batch == 0:
                record = first_page
            else:
                record = _search_pubmed(query, batch * batch_size)

            # Get a list of PubMed IDs (PMIDs) for the retrieved papers
            pmid_list = record["IdList"]

            print(f"Processing batch {batch+1}/{num_batches} with PMIDs: {', '.join(pmid_list)}")

            # Retrieve the paper details for each PMID
            for pmid in pmid_list:
                paper_record = _fetch_medline(pmid)
                if paper_record is None:
                    continue  # already reported by _fetch_medline

                title, authors, year, doi, tags, abstract = _parse_medline_record(paper_record)

                # Append the paper details to the list
                paper_details.append([query, pmid, title, ', '.join(authors), year, doi, ', '.join(tags), abstract])

                # Delay before the next API request
                time.sleep(delay)

# Destination of the exported table
csv_file = "results_PubMed_VerII.csv"

# Column titles, in the same order as the entries of each paper_details row
header_row = ["Query", "PMID", "Title", "Authors", "Year", "DOI", "Tags", "Abstract"]

# Emit the header followed by every collected row in a single call;
# newline="" leaves line-ending handling to the csv module.
with open(csv_file, mode="w", newline="", encoding="utf-8") as csv_out:
    csv.writer(csv_out).writerows([header_row] + paper_details)

print("Paper details extracted and saved to", csv_file)

Total records for query 'machine learning AND venous thrombosis AND (1980[PDAT] : 2023[PDAT])': 84
Processing batch 1/1 with PMIDs: 37370233, 37365805, 37348318, 37250843, 37089113, 37079979, 37043409, 36942630, 36912139, 36877716, 36750656, 36632097, 36596268, 36587511, 36580997, 36276886, 36274391, 36272528, 36220884, 36186967, 36169966, 36157936, 36072822, 36061353, 35932395, 35806137, 35791841, 35697739, 35648280, 35504312, 35414086, 35299099, 35272558, 35253466, 35204365, 35116072, 35055429, 35047634, 34950733, 34945749, 34934034, 34838025, 34773490, 34697635, 34581632, 34528949, 34492064, 34428931, 34344669, 34145330, 34137837, 34116215, 34107539, 34036817, 34003964, 33971352, 33928796, 33777638, 33625875, 33529319, 33431375, 33091585, 32920367, 32919186, 32780732, 32645000, 32353052, 32221349, 32110753, 31888439, 31697697, 31563130, 31562113, 31445252, 31374513, 31352086, 30727024, 29224926, 28592811, 28475051, 27885969, 26898369, 25332356, 24111448
Total records for query 'mach