In [1]:
import re

# short example for our precentage logic

exm_1_string = "The GC-content is 50%, and the pH is 7"
matches = re.findall(r"\%", exm_1_string)
if(len(matches) == 1):
    print("String contains exactly one %")


String contains exactly one %


In [14]:
from Bio import Entrez

Entrez.email = "bene6@hotmail.de"  # Always tell NCBI who you are

abstracts_with_one_perc = []

# Search PubMed for articles with "GC-content" in the title
for i in range(0, 200, 20):
    handle = Entrez.esearch(db="pubmed", term="GC content bacterial[Title]", retstart=i, retmax=20)
    record = Entrez.read(handle)
    ids = record["IdList"]
    # Process the IDs

    # Get the list of article IDs
    article_ids = record["IdList"]

    # Fetch the abstracts for the articles
    handle = Entrez.efetch(db="pubmed", id=article_ids, rettype="abstract")
    records = Entrez.read(handle)

    # Print the abstracts
    noAbstract = 0
    for record in records["PubmedArticle"]:
        try:
            temp_record = record["MedlineCitation"]["Article"]
            matches = re.findall(r"\%", temp_record["Abstract"]["AbstractText"][0])
            if(len(matches) == 1):
                abstracts_with_one_perc.append(temp_record)
            pass
        except KeyError:
            print("No abstract available")
            noAbstract += 1
    print("No abstract available for " + str(noAbstract) + " articles "  + "by total of " + str(len(article_ids)) + " articles")


No abstract available
No abstract available for 1 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available
No abstract available for 1 articles by total of 20 articles


In [8]:
records["PubmedArticle"][0]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]

'Bacterial evolution is characterized by strong purifying selection as well as rapid adaptive evolution in changing environments. In this context, the genomic GC content (genomic GC) varies greatly but presents some level of phylogenetic stability, making it challenging to explain based on current hypotheses. To illuminate the evolutionary mechanisms of the genomic GC, we analyzed the base composition and functional inventory of 11,083 representative genomes. A phylogenetically constrained bimodal distribution of the genomic GC, which mainly originated from parallel divergences in the early evolution, was demonstrated. Such variation of the genomic GC can be well explained by DNA replication and repair (DRR), in which multiple pathways correlate with the genomic GC. Furthermore, the biased conservation of various stress-related genes, especially the DRR-related ones, implies distinct adaptive processes in the ancestral lineages of high- or low-GC clades which are likely induced by majo

In [19]:
records["PubmedArticle"][0]["MedlineCitation"]["Article"].keys()
records["PubmedArticle"][0]["MedlineCitation"]["Article"]["PublicationTypeList"]
# records["PubmedArticle"][0]["MedlineCitation"].keys()
# records["PubmedArticle"][0]["MedlineCitation"].keys()
# records["PubmedArticle"][0].keys()


# records["PubmedArticle"][0]["PubmedData"].keys()

[StringElement('Journal Article', attributes={'UI': 'D016428'})]

In [6]:
records["PubmedArticle"][0]["MedlineCitation"]["Article"]["ELocationID"]

[StringElement('10.1128/spectrum.02145-22', attributes={'EIdType': 'doi', 'ValidYN': 'Y'})]

In [25]:
# abstracts_with_one_perc[0]
matches = re.findall(r"\%", abstracts_with_one_perc[0]["Abstract"]["AbstractText"][0])
if(len(matches) > 1):
    print("String contains more than one percent symbol")

In [3]:
len(abstracts_with_one_perc)

31

In [5]:
abstracts_with_one_perc[0]

'Here, we report the draft genome sequence of Xylella fastidiosa strain ATCC 35873, which was obtained from the American Type Culture Collection and was originally isolated from a symptomatic American elm tree grown in Washington, DC. The ATCC 35873 genome contains 2,454,216\u2009bp and has a GC content of 51.68%.'

In [18]:
hopefull_dicts = []

# give the one <i> tag which is the nearest to the % sign

for pre_x in abstracts_with_one_perc:
    x = pre_x["Abstract"]["AbstractText"][0]
    
    # Extract the content between the tags
    result = re.search(r"<i>(.*?)</i>", x)
    if result:
        hopefully_bacterial = result.group(1)
    else:
        hopefully_bacterial = None
    # Extract the precentage value
    result = re.search(r"([\d\.]+\%)", x)
    if result:
        hopefully_percentage_for_bacterial = result.group(1)
    else:
        hopefully_percentage_for_bacterial = None
    hopefull_dict = {hopefully_bacterial: hopefully_percentage_for_bacterial}

    hopefull_dicts.append(hopefull_dict)

# show only dicts with a value and a key
# hopefull_dicts = [x for x in hopefull_dicts if list(x.keys())[0] is not None and list(x.values())[0] is not None]
hopefull_dicts
# how do we verify?





[{None: '51.68%'},
 {'Martelella soudanensis': None},
 {None: None},
 {'amoA': '2%'},
 {'Caragana microphylla, Caragana liouana': '1%'},
 {'Lentilactobacillus buchneri': '96.63%'},
 {'E. coli': '7.4%'},
 {'Haliclona (Rhizoniera)': '80%'},
 {'Oceanobacillus jordanicus': '39.09%'},
 {'Lactobacillus rhamnosus': None},
 {'Pediococcus pentosaceus': None},
 {None: '90%'},
 {'Xanthomonas oryzae': '63.7%'},
 {'Bacillus': '41.41%'},
 {'Streptomyces': '71.80%'},
 {None: '39.2%'},
 {'Methylocystis': None},
 {'Paenibacillus': '46%'},
 {'Enterococcus faecalis': None},
 {'Enterobacter roggenkampii': '56.16%'},
 {None: None},
 {None: '39%'},
 {None: '0.9%'},
 {None: '27.6%'},
 {'Pseudarthrobacter phenanthrenivorans': '65.30%'},
 {'Micrococcaceae': None},
 {None: '64%'},
 {None: '60%'},
 {None: None},
 {'Methanococcus maripaludis': '94%'},
 {None: '59.2%'}]