In [9]:
!pip install pdfplumber pandas pymupdf





In [8]:
import pdfplumber, os

pdf_folder = "pdfs"  # folder with 10 PDFs
all_texts = []

for file in os.listdir(pdf_folder):
    if file.endswith(".pdf"):
        with pdfplumber.open(os.path.join(pdf_folder, file)) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""
            all_texts.append({"file": file, "text": text[:2000]})  # preview first 2000 chars

# Show preview of extracted text
all_texts[:2]


[{'file': '10.41467_2025_Article_59047.pdf',
  'text': 'Article https://doi.org/10.1038/s41467-025-59047-z\nMechanosensing antagonizes ethylene\nsignaling to promote root gravitropism\nin rice\nReceived:25October2024 Han-QingWang 1,Xing-YuZhao1,ZhongTang1,Xin-YuanHuang 1,\nPengWang 1,WenhuaZhang 2,YunhuiZhang3,ShengLuan 4&\nAccepted:7April2025\nFang-JieZhao 1\nCheckforupdates Rootgravitropismreliesongravityperceptionbytherootcapandrequires\ntightlyregulatedphytohormonesignaling.Here,weisolatearicemutantthat\ndisplaysrootcoilinginhydroponicsbutnormalgravitropicgrowthinsoil.We\nidentifyCOILINGROOTINWATER1(CRW1)encodinganETHYLENE-\nINSENSITIVE3(EIN3)-BINDINGF-BOXPROTEIN(OsEBF1)asthecausativegene\nforthemutantphenotype.WeshowthattheOsCRW1-EIN3LIKE1and2\n(OsEIL1/2)-ETHYLENERESPONSEFACTOR82(OsERF82)modulecontrolsthe\nproductionofreactiveoxygenspeciesintheroottip,subsequentlyimpacting\nrootcapstability,polarlocalizationofPIN-FORMED2(OsPIN2),symmetric\ndistributionofauxin,andultimatelygravitro

In [10]:
import re
import pandas as pd

dataset = []

for idx, doc in enumerate(all_texts, start=1):
    text = doc["text"]

    # Title = first line
    title = text.split("\n")[0][:200]

    # Try to find abstract
    abstract_match = re.search(r"Abstract(.*?)(Introduction|Methods|Materials)", text, re.S | re.I)
    abstract = abstract_match.group(1).strip() if abstract_match else ""

    # Try to find conclusion
    conclusion_match = re.search(r"(Conclusion|Summary)(.*?)(References|Acknowledgment|$)", text, re.S | re.I)
    conclusion = conclusion_match.group(2).strip() if conclusion_match else ""

    dataset.append({
        "id": idx,
        "title": title,
        "species": "",          # will fill later manually
        "environment": "",      # will fill later manually
        "effect": "",           # will fill later manually
        "abstract": abstract,
        "conclusion": conclusion,
        "source_file": doc["file"]
    })

df = pd.DataFrame(dataset)
df.to_csv("bio_experiments.csv", index=False)
df.head()


Unnamed: 0,id,title,species,environment,effect,abstract,conclusion,source_file
0,1,Article https://doi.org/10.1038/s41467-025-590...,,,,,,10.41467_2025_Article_59047.pdf
1,2,www.nature.com/scientificreports,,,,,", this study underscores the potential of Wolf...",8.41598_2023_Article_49680.pdf
2,3,Cell Structure and Function Cell Structure and...,,,,,,4.csf_49_24035.pdf
3,4,cancers,,,,,,1.cancers-12-00381.pdf
4,5,Experimental Brain Research (2025) 243:127,,,,Under conditions of weightlessness human posit...,,9.221_2025_Article_7090.pdf


In [11]:
!pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [12]:
import os
import pandas as pd
from PyPDF2 import PdfReader
import re
import spacy

# Load small English NLP model
nlp = spacy.load("en_core_web_sm")  # Make sure to install: pip install spacy && python -m spacy download en_core_web_sm

# Create or load dataset
dataset_file = "final_dataset.csv"
if os.path.exists(dataset_file):
    df = pd.read_csv(dataset_file)
else:
    df = pd.DataFrame(columns=[
        "id", "title", "species", "environment", "effect",
        "abstract", "conclusion", "source_file", "full_text"
    ])

# Example keywords lists for detection
species_list = ["E. coli", "S. aureus", "B. subtilis"]
environment_list = ["soil", "water", "air", "marine"]
effect_list = ["toxic", "beneficial", "neutral"]

# Function to extract full text
def extract_pdf_text(pdf_path):
    text = ""
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

# Function to extract a section based on heading
def extract_section(text, section_names):
    for section in section_names:
        pattern = re.compile(rf"{section}\s*[:\-]?\s*(.*?)\n(?:\n|[A-Z][a-z])", re.IGNORECASE | re.DOTALL)
        match = pattern.search(text)
        if match:
            return match.group(1).strip()
    return None

# Folder containing PDFs
pdf_folder = "pdfs"  # replace with your folder path

# Process PDFs
for idx, file in enumerate(os.listdir(pdf_folder), start=1):
    if file.lower().endswith(".pdf"):
        path = os.path.join(pdf_folder, file)
        full_text = extract_pdf_text(path)
        full_text_clean = full_text.replace("\n", " ").strip()

        # NLP processing for keywords
        doc = nlp(full_text_clean)
        species_found = [s for s in species_list if s.lower() in full_text_clean.lower()]
        environment_found = [e for e in environment_list if e.lower() in full_text_clean.lower()]
        effect_found = [ef for ef in effect_list if ef.lower() in full_text_clean.lower()]

        # Extract abstract and conclusion
        abstract = extract_section(full_text, ["Abstract"])
        conclusion = extract_section(full_text, ["Conclusion", "Conclusions"])

        # Extract title (try first line or from filename)
        title = file.replace(".pdf", "")
        first_line = full_text.split("\n")[0].strip()
        if len(first_line) > 5:  # if first line looks like a title
            title = first_line

        # Append row
        df = pd.concat([df, pd.DataFrame([{
            "id": idx,
            "title": title,
            "species": ", ".join(species_found) if species_found else None,
            "environment": ", ".join(environment_found) if environment_found else None,
            "effect": ", ".join(effect_found) if effect_found else None,
            "abstract": abstract,
            "conclusion": conclusion,
            "source_file": file,
            "full_text": full_text_clean
        }])], ignore_index=True)

# Save final dataset
df.to_csv(dataset_file, index=False)
print(f"Dataset saved! Total PDFs processed: {len(df)}")


Dataset saved! Total PDFs processed: 10


In [13]:
print(df.head())


  id                                              title  species  \
0  1  Article https://doi.org/10.1038/s41467-025-590...  E. coli   
1  2                         8.41598_2023_Article_49680     None   
2  3                        Cell Structure and Function     None   
3  4                                            cancers     None   
4  5  RESEARCH ARTICLEExperimental Brain Research (2...     None   

        environment          effect  \
0  soil, water, air         neutral   
1        water, air         neutral   
2        water, air  toxic, neutral   
3               air           toxic   
4               air         neutral   

                                            abstract  \
0                                               None   
1                                               None   
2  s Anhydrobiosis, a phenomenon in which organis...   
3  Background: Ionizing radiation from galactic c...   
4  Under conditions of weightlessness human posit...   

                   

In [14]:
import spacy
import pandas as pd

# Load NLP model
nlp = spacy.load("en_core_web_sm")

# Load your dataset
df = pd.read_csv("final_dataset.csv")

# Example: expand species list
species_list = ["E. coli", "S. aureus", "B. subtilis", "R. varieornatus"]

# Fill missing abstracts
for i, row in df.iterrows():
    if pd.isna(row['abstract']) or row['abstract'] == "None":
        text = row['full_text']
        # Look for sentences containing the word 'abstract' or beginning part of the text
        match = re.search(r"(?:Abstract[:\-\s]*)?(.{50,500})", text, re.IGNORECASE | re.DOTALL)
        if match:
            df.at[i, 'abstract'] = match.group(1).strip()

# Fill missing species
for i, row in df.iterrows():
    if pd.isna(row['species']) or row['species'] == "None":
        text = row['full_text'].lower()
        found = [s for s in species_list if s.lower() in text]
        if found:
            df.at[i, 'species'] = ", ".join(found)

# Save updated dataset
df.to_csv("final_dataset_complete.csv", index=False)
print("Dataset updated and saved as final_dataset_complete.csv")


Dataset updated and saved as final_dataset_complete.csv


In [15]:
print(df.head())


   id                                              title          species  \
0   1  Article https://doi.org/10.1038/s41467-025-590...          E. coli   
1   2                         8.41598_2023_Article_49680              NaN   
2   3                        Cell Structure and Function  R. varieornatus   
3   4                                            cancers              NaN   
4   5  RESEARCH ARTICLEExperimental Brain Research (2...              NaN   

        environment          effect  \
0  soil, water, air         neutral   
1        water, air         neutral   
2        water, air  toxic, neutral   
3               air           toxic   
4               air         neutral   

                                            abstract  \
0  Article https://doi.org/10.1038/s41467-025-590...   
1  1 Vol.:(0123456789) Scientific Reports  |     ...   
2  s Anhydrobiosis, a phenomenon in which organis...   
3  Background: Ionizing radiation from galactic c...   
4  Under conditions of

In [16]:
from google.colab import files
files.download("final_dataset_complete.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
df_preview = df.copy()
df_preview['full_text'] = df_preview['full_text'].str[:200]  # first 200 chars
df_preview.to_csv("dataset_preview.csv", index=False)


In [18]:
from google.colab import files
files.download("dataset_preview.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
!pip install transformers torch pandas --quiet


In [21]:
import pandas as pd

# Load your preprocessed dataset
df = pd.read_csv("final_dataset_complete.csv")
df.head()


Unnamed: 0,id,title,species,environment,effect,abstract,conclusion,source_file,full_text
0,1,Article https://doi.org/10.1038/s41467-025-590...,E. coli,"soil, water, air",neutral,Article https://doi.org/10.1038/s41467-025-590...,,10.41467_2025_Article_59047.pdf,Article https://doi.org/10.1038/s41467-025-590...
1,2,8.41598_2023_Article_49680,,"water, air",neutral,1 Vol.:(0123456789) Scientific Reports | ...,", this study underscores the potential of Wolf...",8.41598_2023_Article_49680.pdf,1 Vol.:(0123456789) Scientific Reports | ...
2,3,Cell Structure and Function,R. varieornatus,"water, air","toxic, neutral","s Anhydrobiosis, a phenomenon in which organis...","In this study, we expressed R. varieornatus C...",4.csf_49_24035.pdf,Cell Structure and Function Cell Structure and...
3,4,cancers,,air,toxic,Background: Ionizing radiation from galactic c...,s: We provided a,1.cancers-12-00381.pdf,cancers Article NASA GeneLab Platform Utilized...
4,5,RESEARCH ARTICLEExperimental Brain Research (2...,,air,neutral,Under conditions of weightlessness human posit...,strengthened our view that spindles played a key,9.221_2025_Article_7090.pdf,RESEARCH ARTICLEExperimental Brain Research (2...


In [22]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Remove extra spaces, line breaks, and weird characters
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [23]:
from transformers import pipeline
import pandas as pd
import re

df = pd.read_csv("final_dataset_complete.csv")

summarizer = pipeline("summarization", model="google/pegasus-xsum")

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text, max_tokens=200):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield " ".join(words[i:i+max_tokens])

def summarize_full_text(text):
    text = clean_text(text)
    if len(text.split()) < 30:
        return "Text too short"
    try:
        chunks = list(chunk_text(text))
        summaries = []
        for chunk in chunks:
            result = summarizer(
                chunk,
                max_length=80,
                min_length=30,
                do_sample=False
            )
            summaries.append(result[0]['summary_text'])
        return " ".join(summaries)
    except Exception as e:
        return f"Error: {str(e)}"

df['ai_summary'] = df['full_text'].fillna("").apply(summarize_full_text)
df.to_csv("dataset_with_fulltext_summaries.csv", index=False)
print("‚úÖ Summaries created using Pegasus model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (625 > 512). Running this sequence through the model will result in indexing errors


‚úÖ Summaries created using Pegasus model


In [24]:
import os

files_to_remove = [
    "dataset_with_summaries.csv",
    "dataset_with_fulltext_summaries.csv",
    "dataset_with_detailed_summaries.csv"
]

for f in files_to_remove:
    if os.path.exists(f):
        os.remove(f)
        print(f"Deleted {f}")



Deleted dataset_with_fulltext_summaries.csv


In [25]:
!pip install sumy
!pip install nltk




In [26]:
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
import nltk

# Standard punkt tokenizer
nltk.download('punkt')

# Some versions require punkt_tab for Sumy
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [28]:
from nltk.data import find

# Check if punkt is installed
find('tokenizers/punkt')
find('tokenizers/punkt_tab')  # optional


FileSystemPathPointer('/root/nltk_data/tokenizers/punkt_tab')

In [29]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

parser = PlaintextParser.from_string("Your text here", Tokenizer("english"))


In [30]:
import pandas as pd

df = pd.read_csv("final_dataset_complete.csv")
df.head()
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import re

# Clean text function
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Initialize summarizer
summarizer = LexRankSummarizer()

# Summarize full text
def summarize_full_text(text, sentence_count=10):
    text = clean_text(text)
    if len(text.split()) < 30:
        return "Text too short for summarization"
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summary = summarizer(parser.document, sentences_count=sentence_count)
    return " ".join([str(sentence) for sentence in summary])


In [31]:
# Create a new column for summaries
df['ai_summary'] = df['full_text'].apply(lambda x: summarize_full_text(x, sentence_count=10))

# Save the new CSV
df.to_csv("dataset_with_extractive_summaries.csv", index=False)
print("‚úÖ Extractive summaries created and saved!")


‚úÖ Extractive summaries created and saved!


In [32]:
pd.set_option('max_colwidth', None)  # show full summary text
df[['title', 'ai_summary']].head()


Unnamed: 0,title,ai_summary
0,Article https://doi.org/10.1038/s41467-025-59047-z,"When grown in soil, all phenotypes of the mutant, including root coiling, root cap size, OsPIN2 polar localization, auxin distribution, andgravity response, were rescued to similar levels observed in WT (Fig. The three oscrw1KOlines also showed smaller root caps and asymmetric auxin distribution in the root tips when grown in water, and thesedefective phenotypes were rescued when plants were grown in soil(Supplementary Fig. a‚ÄìdSeedlings of WT and crw1 were grown in water or soil for 2 days. Taken together, ampli Ô¨Åed OsEIL1/2-OsERF82 causes ROS accumulation in crw1 root tips likely by promoting the transcription of OsRBOHH and Class III OsPRX genes.Article https://doi.org/10.1038/s41467-025-59047-z Nature Communications | (2025) 16:3712 7 Mechanical resistance on the root tip rescued the root coiling phenotype of crw1 After identifying ampli Ô¨Åed ethylene signaling module of OsEIL1/2- OsERF82 leading to ROS overproduction as the cause of impairedgravitropism in crw1, we explored the mechanism underlying the res- cue of the root coiling phenotype in crw1 when plants were grown inwater-saturated paddy soil and other solid media (Supplementary Fig. Thus, limiting ethylene signaling in the nucleus is critical for the normal gravitropic growth of rice roots. Data are means ¬± SD; n=1 2i n( b),n=1 5‚àí19 in ( d), and n=4i n( j). and S.L. and Z.T. and F.-J.Z. and F.-J.Z."
1,8.41598_2023_Article_49680,"Although an experiment testing the effect of simulated microgravity on Wolffia globosa has been previously conducted, for the first time, we investigated the effect of multiple gravity levels on the growth and morphological traits of Wolffia globosa plants. In conclusion, this study underscores the potential of Wolffia globosa as a space crop and its adaptability to diverse gravitational conditions, contributing to the development of sustainable food production and bioregenerative life support systems for future space exploration missions. Due to the loss of gravity sensing genes in the sister species Wolffia australiana , we also hypothesised for Wolffia globosa a lack of the gravity sensing mechanisms, resulting in a reduced effect of the different gravity levels on the growth and morphological characteristics. The different images show the implementation of the experimental hardware in the different gravity treatments. (1234567890) Scientific Reports | (2024) 14:410 | https://doi.org/10.1038/s41598-023-49680-3 www.nature.com/scientificreports/Discussion The findings of this study shed light on the response of Wolffia globosa to altered gravitational conditions. The study findings revealed that the fronds of Wolffia globosa were most extended in the 1 g treatment, both on the front and back sides. The adaptability of Wolffia globosa to altered gravity conditions suggested by our data hints at the potential use of plants from the genus Wolffia in space agriculture. A. A. A."
2,Cell Structure and Function,"Key words anhydrobiosis, Tardigrades, live imaging, disordered proteins, desiccation tolerance Cell Structure and Function 49: 123‚Äì133 (2024) https://doi.org/10.1247/csf.24035 123 Cell Structure and FunctionIntroduction Anhydrobiosis, derived from ‚Äúlife without water ‚Äù in Greek, is a phenomenon where an organism loses almost all of its water and enters a state of reversible ametabolism ( Crowe et al. , 1998 ; Keilin, 1959 ). Trehalose accumulation (Crowe et al. , 1998 ; Lapinski and Tunnacliffe, 2003 ), late embryogenesis abundant (LEA) proteins ( Goyal et al. , 2005 ; Kikawada et al. , 2006 ; MacRae, 2016 ), heat shock proteins (Cornette et al. , 2010 ; King and MacRae, 2015 ), and ROS scavenging ( Gusev et al. , 2010 ; Rizzo et al. , 2010 ) have been proposed as mechanisms of desiccation tolerance ( Janis et al. , 2018 ). We also investigated the tolerance to hyperosmotic stress by the expression of CAHS proteins, and found that the expression of CAHS1, CAHS3, or CAHS8 rendered the cells tolerant to the hyperosmotic stress. 2 and 3). (C‚ÄìI) Representative images of HeLa cells overexpressing mEGFP (C), CAHS1- mEGFP (D), CAHS3-mEGFP (E), CAHS8-mEGFP (F), CAHS12-mEGFP (G), LEAM-mEGFP (H), and MAHS-mEGFP (I) proteins at the indicated time points. Although t-test was conducted for the data compared with the mEGFP data, the statistical significance was not observed.mEGFP protein, the CAHS3-mEGFP, the CAHS8-mEGFP, the CAHS12-mEGFP, the LEAM-mEGFP, the MAHS-mEGFP and the mEGFP were transfected into the HeLa cells with 293fectinTM Transfection Reagent (#12347-019, Invitrogen, Waltham, MA, USA). HeLa cells were plated on 96-well plates (7.5 √ó 102 ‚Äì 3.0 √ó 103cells/well) (#167425, Thermo Fisher Scientific), and cultured in the assay medium for 24 h. To induce gene expression, the cells were treated with doxycycline for 24 h, followed by the treatment with sorbitol solution for 24 h. For the simultaneous cell viability assay and cell death assay ( Fig. Taken together, these results suggest that the resistance to hyperosmotic stress by LEAM and MAHS expression is not cell-wide protection as observed for CAHS proteins, so we focused on the CAHS proteins for our subsequent analyses.Osmotic stress resistance by CAHS1 expression Cell Structure and Function 49: 123‚Äì133 (2024) https://doi.org/10.1247/csf.24035 129 Cell Structure and FunctionCAHS1 expression renders HeLa cells resistant to hyperosmotic stress Recent studies have elucidated that the CAHS3 protein forms fibrillar aggregates under hyperosmotic conditions in mammalian cells, thereby enhancing their tolerance to hyperosmotic stress, as observed in Drosophila S2 cells ( Tanaka et al. , 2022 ). In addition, in this study, the single-cell clone of HeLa cells stably expressing CAHS1-mEGFP indicated lower IC50 values for cell viability and EC50 values for cell death in response to sorbitol stimulation in the absence of doxycycline than parental HeLa cells ( Fig. 2 and 3)."
3,cancers,"The results from this analysis highlight novel cellular Cancers 2020 ,12, 381 3 of 23 responses that may be relevant contributors to the combined e ects of microgravity and space radiation and therefore critical drivers of health risks from space radiation. The dose data for all spaceÔ¨Çight experiments, with the exception of GLDS-111, were from instruments available on the Space Shuttle and the International Space Station. To compare the number of genes sets with the annotated gene sets, we provided heatmaps and clustering for all the datasets based on the NES for each molecular signature database collection for the analysis with the FDR <0.25 (Figure 4). To compare the number of genes sets with the annotated gene sets, we provided heatmaps and clustering for all the datasets based on the NES fo r each molecular signature database collection for the analysis with the FDR < 0.25 (Figure 4). Similar to the C5 GO analysis, we observed an increase in mitochondrial pathways as a function of dose. We observed these pathways to also be involved with our GSEA analysis as a function of dose. In our analysis on the C6 Oncogenic Signature database, we observed that PDGF and TGF were clustered together, purely on the basis of the statistics, and both were decreasing with increasing dose (cluster 3 in Figure 7C). The auto-annotated gene sets for each k-means cluster were then plotted with the NES versus the dose for associated for each GeneLab dataset (Table 1) with R-program ggplot2 (v3.2.1,). Analysis on Common Individual Genes with Fold-Change Values and Fits as a Function of Dose A total of 19 spaceÔ¨Çight studies and 3 ground studies were selected from the GeneLab database. Supplementary Materials: The following are available online at http: //www.mdpi.com /2072-6694 /12/2/381/s1: Figure S1: Principal component analysis (PCA) plots on GeneLab datasets (GLDS) using the gene set enrichment analysis (GSEA) nominal enrichment scores before (top row) and after (bottom row) auto-annotation using Cytoscape‚Äôs enrichment map software, Figure S2: t-Distributed stochastic neighbor embedding (tSNE) plots on GeneLab datasets (GLDS) using the gene set enrichment analysis (GSEA) nominal enrichment scores before (top row) and after (bottom row) auto-annotation using Cytoscape‚Äôs enrichment map software, Figure S3: t-Distributed stochastic neighbor embedding (tSNE) plots on GeneLab datasets (GLDS) using the gene set enrichment analysis (GSEA) nominal enrichment scores before (top row) and after (bottom row) auto-annotation using Cytoscape‚Äôs enrichment map software, Figure S4: Analysis of all GeneLab datasets utilized for direct comparisons with Gene Ontology (GO) GSEA analysis with false discovery rate (FDR) <0.05 for three datasets, Figure S5: Analysis of all GeneLab datasets utilized for direct comparisons with Gene Ontology (GO) GSEA analysis with FDR < 0.05 for three datasets for cluster 2, Figure S6: Analysis of all GeneLab Datasets utilized for direct comparisons with Gene Ontology (GO) GSEA analysis with FDR <0.05 for three datasets for cluster 3, Figure S7: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S4), Figure S8: Scatter plots comparing the normalized Cancers 2020 ,12, 381 18 of 23 enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S5), Figure S9: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S5), Figure S10: Analysis of all GeneLab Datasets utilized for direct comparisons with C2 curated gene sets GSEA analysis with FDR <0.05 for three datasets, Figure S11: Analysis of all GeneLab datasets utilized for direct comparisons with C2-curated gene set GSEA analysis with FDR <0.05 for three datasets for cluster 1, Figure S12: Analysis of all GeneLab datasets utilized for direct comparisons with C2-curated gene set GSEA analysis with FDR <0.05 for three datasets for cluster 2, Figure S13: Analysis of all GeneLab datasets utilized for direct comparisons with C2-curated gene set GSEA analysis with FDR <0.05 for three datasets for cluster 3, Figure S14: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S10), Figure S15: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S11), Figure S16: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S12), Figure S17: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S13), Figure S18: Analysis of all GeneLab datasets utilized for direct comparisons with C6 Oncogenic Signature GSEA analysis with FDR <0.05 for three datasets, Figure S19: Analysis of all GeneLab datasets utilized for direct comparisons with C6 Oncogenic Signature GSEA analysis with FDR < 0.05 for three datasets for cluster 1, Figure S20: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S18), Figure S21: Scatter plots comparing the normalized enrichment scores (NES) to dose in mGy and Ô¨Åts with a generalized additive model (GAM) on each cluster (represented in Figure S19), Table S1: Doses and dose rates for ground-based exposures at the NASA Space Radiation Laboratory (NSRL) at Brookhaven National Laboratory."
4,RESEARCH ARTICLEExperimental Brain Research (2025) 243:127,"It is proposed that the errors in matching and pointing are a consequence of the force of gravity acting at the elbow joint to alter the position signal coming from muscle and joint receptors. A feature of the errors in pointing was that while increases in errors in HG and falls in MG were smaller relative to the NG value, compared with two-arm matching, all pointing errors lay further in the direction of extension of the arm, including the value for NG. For pointing and matching the trend of an increase in errors during hypergravity and a decrease during microgravity, relative to the normal gravity Data analysis For two-arm matching, the angles of the reference and indi - cator arms were recorded. During flight, data of one participant is missing for repositioning N = 12 Matching Pointing Repositioning Pre-flight 0.97 (4.00) 5.95 (5.64) -0.19 (3.96) Flight Hypergravity 1 3.63 (2.78) 11.11 (4.72) -0.78 (4.81) Hypergravity 2 3.33 (4.23) 11.32 (4.25) -0.22 (4.67) Hypergravity (Comb.) The convention was used that positive values were assigned to errors in the direction of elbow extension, negative values to errors in the direction of flexion 1 3Page 7 of 12 127 Experimental Brain Research (2025) 243:127repositioning, the data suggested that position sense values were unresponsive to changes in gravity. Our observations suggest that of the three methods used in the present study, with the method of repositioning, it is not possible to reveal any disturbance of position sense by gravity and that if gravity effects were to be studied further, the preferred methods to use would have to be matching or pointing. * p <.05; ** p <.01 1 3127 Page 8 of 12 Experimental Brain Research (2025) 243:127by Bringoux et al. ( 2012 ) that during parabolic flight reach - ing errors were made with changes in gravity. If, as a result of an increase in gravity, the position signal increases, this would be expressed in both arms, the reference arm sitting at 60¬∞ and the indicator moved by the participant. It suggested that in normal gravity there was an offset, in the direction of arm extension, in the measured values of point - ing errors. Certainly, the instructions to the participants were always to align the position of one arm with that of the other arm and no reference was made to gravity."


In [33]:
df.to_json("dataset.json", orient="records")


In [34]:
import pandas as pd

df = pd.read_csv("dataset_with_extractive_summaries.csv")

# Show first 5 rows
print(df.head())

# Show full summaries without truncation
pd.set_option('max_colwidth', None)
print(df[['title', 'ai_summary']].head())


   id                                                       title  \
0   1          Article https://doi.org/10.1038/s41467-025-59047-z   
1   2                                  8.41598_2023_Article_49680   
2   3                                 Cell Structure and Function   
3   4                                                     cancers   
4   5  RESEARCH ARTICLEExperimental Brain Research (2025) 243:127   

           species       environment          effect  \
0          E. coli  soil, water, air         neutral   
1              NaN        water, air         neutral   
2  R. varieornatus        water, air  toxic, neutral   
3              NaN               air           toxic   
4              NaN               air         neutral   

                                                                                                                                                                                                                                                        

In [35]:
!pip install evaluate rouge-score

import evaluate

# Load ROUGE metric
rouge = evaluate.load("rouge")

predictions = ["AI generated summary here"]
references = ["Ground truth abstract or human summary"]

# Compute ROUGE
results = rouge.compute(predictions=predictions, references=references)
print(results)



Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9ddec0f20629d6f84fd0dce6d64779f4065cd215e5745f130bf45d67cdc5441b
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, evaluate
Successfully installed evaluate-0.4.6 rouge-score-0.1.2


Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.2), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.2), 'rougeLsum': np.float64(0.2)}


In [38]:
# üìå Install required libraries (run once in Colab)
!pip install sumy

# ------------------------------
# Imports
# ------------------------------
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"

# ------------------------------
# Explore Mode (short, human-like)
# ------------------------------
def generate_explore_summary(text, sentence_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = summarizer(parser.document, sentence_count)
    return " ".join([str(sentence) for sentence in summary])

# ------------------------------
# Research Mode (longer, technical)
# ------------------------------
def generate_research_summary(text, sentence_count=8):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = summarizer(parser.document, sentence_count)
    return " ".join([str(sentence) for sentence in summary])

# ------------------------------
# Example Usage
# ------------------------------
sample_text = """
Artificial Intelligence (AI) is transforming healthcare through advanced diagnostic tools,
predictive analytics, and personalized treatment recommendations. Recent research explores
the application of deep learning in radiology, cardiology, and oncology. However, challenges
remain in data privacy, interpretability, and clinical validation. Future studies must address
ethical concerns while improving accuracy and scalability across diverse populations.
"""

print("üîπ Explore Mode Summary:")
print(generate_explore_summary(sample_text))

print("\nüîπ Research Mode Summary:")
print(generate_research_summary(sample_text))


üîπ Explore Mode Summary:
Artificial Intelligence (AI) is transforming healthcare through advanced diagnostic tools, predictive analytics, and personalized treatment recommendations. Recent research explores the application of deep learning in radiology, cardiology, and oncology. Future studies must address ethical concerns while improving accuracy and scalability across diverse populations.

üîπ Research Mode Summary:
Artificial Intelligence (AI) is transforming healthcare through advanced diagnostic tools, predictive analytics, and personalized treatment recommendations. Recent research explores the application of deep learning in radiology, cardiology, and oncology. However, challenges remain in data privacy, interpretability, and clinical validation. Future studies must address ethical concerns while improving accuracy and scalability across diverse populations.


In [39]:
sample_text = """
Artificial Intelligence (AI) is transforming healthcare through advanced diagnostic tools,
predictive analytics, and personalized treatment recommendations. Recent research explores
the application of deep learning in radiology, cardiology, and oncology. However, challenges
remain in data privacy, interpretability, and clinical validation. Future studies must address
ethical concerns while improving accuracy and scalability across diverse populations.
"""

# Run Explore Mode
explore_summary = generate_explore_summary(sample_text)
print("üîπ Explore Mode Summary:\n", explore_summary)

# Run Research Mode
research_summary = generate_research_summary(sample_text)
print("\nüîπ Research Mode Summary:\n", research_summary)


üîπ Explore Mode Summary:
 Artificial Intelligence (AI) is transforming healthcare through advanced diagnostic tools, predictive analytics, and personalized treatment recommendations. Recent research explores the application of deep learning in radiology, cardiology, and oncology. Future studies must address ethical concerns while improving accuracy and scalability across diverse populations.

üîπ Research Mode Summary:
 Artificial Intelligence (AI) is transforming healthcare through advanced diagnostic tools, predictive analytics, and personalized treatment recommendations. Recent research explores the application of deep learning in radiology, cardiology, and oncology. However, challenges remain in data privacy, interpretability, and clinical validation. Future studies must address ethical concerns while improving accuracy and scalability across diverse populations.


In [40]:
import pandas as pd

# Load your CSV
df = pd.read_csv("dataset_with_extractive_summaries.csv")  # replace with your actual filename

# Pick one paper (first row for example)
paper_text = df.loc[0, "full_text"]  # replace column name if different

explore_summary = generate_explore_summary(paper_text)
print("üîπ Explore Mode Summary:\n", explore_summary)

# Run Research Mode (7‚Äì8 sentences)
research_summary = generate_research_summary(paper_text)
print("\nüîπ Research Mode Summary:\n", research_summary)


üîπ Explore Mode Summary:
 Incontrast, enhanced ethylene concentration from ACC addition mayrepress the translation of OsCRW1/OsEBF1 and thus stabilize OsEIL1and OsEIL2 proteins, as has been shown in Arabidopsis 43.W h e ng r o w n in soil, the shorter and curvy roots of OsEIL1 andOsEIL2 overexpressing lines were largely rescued (Supplementary Fig. The Yeast one-hybrid (Y1H) and electrophoretic mobility shift assay(EMSA) demonstrated that OsERF82 speci Ô¨Åcally bound to the binding sites of the DRE/CRT, GCC-box and G-box in the promoter region ofOsRBOHH and three Class III OsPRX genes (Supplementary Fig. The pro- tein and probe were incubated with EMSA reaction solution (preparedaccording to the manufacturer ‚Äôs protocol, Beyotime, China) for 20 min at room temperature, separated on a 5% polyacrylamide native gel at4 ¬∞C, and transferred to a nylon membrane.

üîπ Research Mode Summary:
 Changes in nutrient supply, solution pH,aeration (dissolved oxygen level), and exposure to light 

In [41]:
import pandas as pd
from tqdm import tqdm

# Load your CSV (make sure it has a column like "full_text")
df = pd.read_csv("dataset_with_extractive_summaries.csv")  # change to your file name

# Add empty columns for summaries
df["explore_summary"] = ""
df["research_summary"] = ""

# Generate summaries for each row
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row["full_text"]  # replace with your column name

    try:
        # Short summary (Explore Mode)
        df.at[i, "explore_summary"] = generate_explore_summary(text)

        # Detailed summary (Research Mode)
        df.at[i, "research_summary"] = generate_research_summary(text)

    except Exception as e:
        print(f"Error at row {i}: {e}")
        df.at[i, "explore_summary"] = "Error generating explore summary"
        df.at[i, "research_summary"] = "Error generating research summary"

# Save to new CSV
df.to_csv("dataset_with_explore_and_research_summaries.csv", index=False)

print("‚úÖ All summaries generated and saved successfully!")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [01:07<00:00,  6.72s/it]

‚úÖ All summaries generated and saved successfully!





In [42]:
# Preview first 3 rows
preview = pd.read_csv("dataset_with_explore_and_research_summaries.csv")
print(preview[["id", "title", "explore_summary", "research_summary"]].head(3))


   id                                               title  \
0   1  Article https://doi.org/10.1038/s41467-025-59047-z   
1   2                          8.41598_2023_Article_49680   
2   3                         Cell Structure and Function   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   