In [1]:
import re

# Minimal demonstration of the percent-sign check:
# count the "%" characters in a sample sentence and report when there is exactly one.

exm_1_string = "The GC-content is 50%, and the pH is 7"
matches = re.findall(r"\%", exm_1_string)
has_single_percent = len(matches) == 1
if has_single_percent:
    print("String contains exactly one %")


String contains exactly one %


In [4]:
from Bio import Entrez

Entrez.email = "bene6@hotmail.de"  # Always tell NCBI who you are

# Article records (the "Article" part of each MedlineCitation) whose first
# abstract section contains exactly one "%" character.
abstracts_with_one_perc = []

# Page through the first 200 PubMed hits for the query term, 20 IDs per request.
# NOTE(review): the [Title] tag sits at the end of the term; confirm how PubMed
# scopes it across the three words.
for i in range(0, 200, 20):
    handle = Entrez.esearch(db="pubmed", term="GC content bacterial[Title]", retstart=i, retmax=20)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()  # release the connection even if parsing fails

    # Get the list of article IDs for this page
    article_ids = record["IdList"]
    if not article_ids:
        # Fewer hits than pages requested: stop instead of sending an
        # efetch request without any IDs.
        break

    # Fetch the abstracts for the articles.
    # retmode="xml" is spelled out because Entrez.read only parses XML.
    handle = Entrez.efetch(db="pubmed", id=article_ids, rettype="abstract", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    # Collect the articles of this page and count the ones without an abstract
    noAbstract = 0
    for pubmed_article in records["PubmedArticle"]:
        try:
            temp_record = pubmed_article["MedlineCitation"]["Article"]
            first_section = temp_record["Abstract"]["AbstractText"][0]
        except (KeyError, IndexError):
            # Missing "Abstract" key, or an abstract with no text sections.
            print("No abstract available")
            noAbstract += 1
            continue

        # Only the first AbstractText entry is inspected; later cells read
        # the same entry, so the selection rule is kept in sync with them.
        matches = re.findall(r"\%", first_section)
        if len(matches) == 1:
            abstracts_with_one_perc.append(temp_record)
    print("No abstract available for " + str(noAbstract) + " articles "  + "by total of " + str(len(article_ids)) + " articles")


No abstract available for 0 articles by total of 20 articles
No abstract available
No abstract available for 1 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles
No abstract available for 0 articles by total of 20 articles


In [5]:
# Display the first AbstractText entry of the first article in `records`
# (`records` holds the result of the most recent efetch call).
records["PubmedArticle"][0]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]

"Despite the successful application of LNA/2'OMe-FISH procedures for bacteria detection, there is a lack of knowledge on the properties that affect hybridization. Such information is crucial for the rational design of protocols. Hence, this work aimed to evaluate the effect of three essential factors on the LNA/2'OMe hybridization step-hybridization temperature, NaCl concentration and type and concentration of denaturant (formamide, ethylene carbonate and urea). This optimization was performed for 3 Gram-negative bacteria (Escherichia coli, Pseudomonas aeruginosa and Citrobacter freundii) and 2 Gram-positive bacteria (Enterococcus faecalis and Staphylococcus epidermidis), employing the response surface methodology and a Eubacteria probe. In general, it was observed that a high NaCl concentration is beneficial (from 2 M to 5 M), regardless of the denaturant used. Urea, formamide and ethylene carbonate are suitable denaturants for LNA/2'OMe-FISH applications; but urea provides higher flu

In [6]:
# Explore the structure of the first fetched article.
# Only the last expression of a cell is displayed, so the keys() call on the
# next line is evaluated but its result is not shown.
records["PubmedArticle"][0]["MedlineCitation"]["Article"].keys()
records["PubmedArticle"][0]["MedlineCitation"]["Article"]["PublicationTypeList"]
# Other levels that can be inspected the same way (kept commented out):
# records["PubmedArticle"][0]["MedlineCitation"].keys()
# records["PubmedArticle"][0].keys()


# records["PubmedArticle"][0]["PubmedData"].keys()

[StringElement('Journal Article', attributes={'UI': 'D016428'}), StringElement("Research Support, Non-U.S. Gov't", attributes={'UI': 'D013485'})]

In [7]:
# Display the ELocationID entries of the first article in `records`.
records["PubmedArticle"][0]["MedlineCitation"]["Article"]["ELocationID"]

[StringElement('e0217689', attributes={'EIdType': 'pii', 'ValidYN': 'Y'}),
 StringElement('10.1371/journal.pone.0217689', attributes={'EIdType': 'doi', 'ValidYN': 'Y'})]

In [8]:
# Sanity check on the first collected article: count the "%" characters in its
# first abstract section and complain if there are two or more.
matches = re.findall(
    r"\%",
    abstracts_with_one_perc[0]["Abstract"]["AbstractText"][0],
)
if len(matches) >= 2:
    print("String contains more than one percent symbol")

In [9]:
# Number of articles collected in abstracts_with_one_perc.
len(abstracts_with_one_perc)

35

In [11]:
import pandas as pd
# Dump the AbstractText entries of every collected article to test.csv for
# manual inspection: one DataFrame row per article, no index column.
abstract_sections = [x["Abstract"]["AbstractText"] for x in abstracts_with_one_perc]

# index=False is the documented way to omit the index (the parameter is a bool).
pd.DataFrame(abstract_sections).to_csv("test.csv", index=False)

In [17]:
hopefull_dicts = []

# Open TODOs (not implemented yet):
# - give the one <i> tag which is the nearest to the % sign
# - add DOI to the dict
# - add abstract to the dict
# - etc.


def _extract_italic_name(text):
    """Return the content of the first ``<i>...</i>`` pair in ``text``.

    Returns None when ``text`` contains no such pair.
    """
    found = re.search(r"<i>(.*?)</i>", text)
    return found.group(1) if found else None


def _extract_percentage(text):
    """Return the first percentage value in ``text`` as ``"<number>%"``.

    The number may be separated from the percent sign by whitespace and/or
    the unit prefix "mol" (e.g. "61.8 %", "61.8 mol%"); both are dropped so
    the result always has the compact form "61.8%". Returns None when no
    number precedes a percent sign.
    """
    found = re.search(r"(\d*\.?\d+)\s*(?:mol\s*)?%", text, flags=re.IGNORECASE)
    return found.group(1) + "%" if found else None


for article in abstracts_with_one_perc:
    # Only the first AbstractText entry is analysed.
    abstract_text = article["Abstract"]["AbstractText"][0]

    # Extract the content between the first pair of <i> tags
    hopefully_bacterial = _extract_italic_name(abstract_text)

    # Extract the percentage value
    hopefully_percentage_for_bacterial = _extract_percentage(abstract_text)

    # Build the question once; it is printed (so it stays in the cell output)
    # and also used as the input prompt.
    prompt = "Is the data for " + str(hopefully_bacterial) + " with " + str(hopefully_percentage_for_bacterial) + " correct? (y/n/m)"
    print(prompt)

    # Ask until one of the three accepted answers is given; surrounding
    # whitespace and upper case are tolerated.
    # NOTE(review): presumably y = yes, n = no, m = maybe; confirm.
    is_valuable_data = ""
    while is_valuable_data not in ("y", "n", "m"):
        is_valuable_data = input(prompt).strip().lower()

    # The extracted name (possibly None) is the key, the percentage its value.
    hopefull_dict = {hopefully_bacterial: hopefully_percentage_for_bacterial, "is_valuable_data": is_valuable_data}

    hopefull_dicts.append(hopefull_dict)

# show only dicts with a value and a key
# hopefull_dicts = [x for x in hopefull_dicts if list(x.keys())[0] is not None and list(x.values())[0] is not None]
hopefull_dicts
# how do we verify?





Is the data for None with 43.88% correct? (y/n/m)
Is the data for None with 56% correct? (y/n/m)
Is the data for None with 53.50% correct? (y/n/m)
Is the data for None with 51.68% correct? (y/n/m)
Is the data for Martelella soudanensis with None correct? (y/n/m)
Is the data for None with None correct? (y/n/m)
Is the data for amoA with 2% correct? (y/n/m)
Is the data for Caragana microphylla, Caragana liouana with 1% correct? (y/n/m)
Is the data for None with 96.63% correct? (y/n/m)
Is the data for E. coli with 7.4% correct? (y/n/m)
Is the data for Haliclona (Rhizoniera) with 80% correct? (y/n/m)
Is the data for Oceanobacillus jordanicus with 39.09% correct? (y/n/m)
Is the data for Lactobacillus rhamnosus with None correct? (y/n/m)
Is the data for Pediococcus pentosaceus with None correct? (y/n/m)
Is the data for None with 90% correct? (y/n/m)
Is the data for Xanthomonas oryzae with 63.7% correct? (y/n/m)
Is the data for Bacillus with 41.41% correct? (y/n/m)
Is the data for Streptomyces

[{None: '43.88%', 'is_valuable_data': 'n'},
 {None: '56%', 'is_valuable_data': 'n'},
 {None: '53.50%', 'is_valuable_data': 'n'},
 {None: '51.68%', 'is_valuable_data': 'n'},
 {'Martelella soudanensis': None, 'is_valuable_data': 'n'},
 {None: None, 'is_valuable_data': 'n'},
 {'amoA': '2%', 'is_valuable_data': 'm'},
 {'Caragana microphylla, Caragana liouana': '1%', 'is_valuable_data': 'm'},
 {None: '96.63%', 'is_valuable_data': 'n'},
 {'E. coli': '7.4%', 'is_valuable_data': 'm'},
 {'Haliclona (Rhizoniera)': '80%', 'is_valuable_data': 'm'},
 {'Oceanobacillus jordanicus': '39.09%', 'is_valuable_data': 'm'},
 {'Lactobacillus rhamnosus': None, 'is_valuable_data': 'n'},
 {'Pediococcus pentosaceus': None, 'is_valuable_data': 'n'},
 {None: '90%', 'is_valuable_data': 'n'},
 {'Xanthomonas oryzae': '63.7%', 'is_valuable_data': 'm'},
 {'Bacillus': '41.41%', 'is_valuable_data': 'm'},
 {'Streptomyces': '71.80%', 'is_valuable_data': 'm'},
 {None: '39.2%', 'is_valuable_data': 'n'},
 {'Methylocystis': No

In [19]:
# Keep only the entries that were answered with "m" during the manual review.
list(filter(lambda entry: entry["is_valuable_data"] == "m", hopefull_dicts))

[{'amoA': '2%', 'is_valuable_data': 'm'},
 {'Caragana microphylla, Caragana liouana': '1%', 'is_valuable_data': 'm'},
 {'E. coli': '7.4%', 'is_valuable_data': 'm'},
 {'Haliclona (Rhizoniera)': '80%', 'is_valuable_data': 'm'},
 {'Oceanobacillus jordanicus': '39.09%', 'is_valuable_data': 'm'},
 {'Xanthomonas oryzae': '63.7%', 'is_valuable_data': 'm'},
 {'Bacillus': '41.41%', 'is_valuable_data': 'm'},
 {'Streptomyces': '71.80%', 'is_valuable_data': 'm'},
 {'Paenibacillus': '46%', 'is_valuable_data': 'm'},
 {'Enterobacter roggenkampii': '56.16%', 'is_valuable_data': 'm'},
 {'Pseudarthrobacter phenanthrenivorans': '65.30%', 'is_valuable_data': 'm'}]

In [14]:
import pandas as pd
# Read Simulation.csv, which uses ";" as the field separator, into a DataFrame.


random_df = pd.read_csv("Simulation.csv", sep=";")

In [16]:
import csv

# Re-export the simulation table as a comma-separated file with every field
# quoted and without the index column. csv.QUOTE_ALL is the named constant
# for the former magic number 1.
random_df.to_csv("Simulation_good_csv.csv", sep=",", index=False, quoting=csv.QUOTE_ALL)