In [1]:
import pandas as pd

In [2]:
# Load the notes dataset
notes_df = pd.read_csv('C:/Users/dinon/Desktop/Summary/biology_information_retrieval_sample.csv', encoding='ISO-8859-1')  # Update with the correct file path
notes_content = notes_df['Text Content'].tolist()
notes_topics = notes_df['Topic'].tolist()
notes_subtopics = notes_df['Sub-topic'].tolist()

In [3]:
# Load the summarization dataset
summary_df = pd.read_csv('C:/Users/dinon/Desktop/Summary/bio_summary_key.csv', encoding='ISO-8859-1')
long_texts = summary_df['longtext'].tolist()
summaries = summary_df['summary'].tolist()
keywords = summary_df['keywords'].tolist()

In [4]:
pip install sentence-transformers faiss-cpu

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for summarization dataset (long texts)
summary_embeddings = embedder.encode(long_texts)

# Generate embeddings for notes dataset
notes_embeddings = embedder.encode(notes_content)

# Combine all content and embeddings for FAISS indexing
all_content = long_texts + notes_content
all_embeddings = np.concatenate([summary_embeddings, notes_embeddings], axis=0)

# Convert embeddings to a float32 NumPy array
all_embeddings_array = np.array(all_embeddings).astype("float32")

# Create and populate the FAISS index
common_index = faiss.IndexFlatL2(all_embeddings_array.shape[1])
common_index.add(all_embeddings_array)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def generate_summary_for_keyword(query, max_words=150):
    # Retrieve similar content from the common index
    similar_content = retrieve_similar_content(query, k=5)
    
    # Construct context from retrieved content
    context = ""
    for item in similar_content:
        if item['type'] == 'summary':
            context += f"Keyword: {item['keyword']}\nLong Text: {item['long_text']}\n\n"
        elif item['type'] == 'note':
            context += f"Topic: {item['topic']} - {item['sub_topic']}\nContent: {item['content']}\n\n"
    
    # Generate the prompt with query and context for summarization
    prompt = f"Summarize the following content related to '{query}':\n{context}\nSummary:"
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    
    # Generate the summary
    summary_ids = model.generate(inputs["input_ids"], max_length=max_words, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary


In [7]:
def generate_summary_for_long_text(long_text, max_words=150):
    # Create a prompt for summarizing the long text
    prompt = f"Summarize the following text:\n{long_text}\nSummary:"
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    
    # Generate the summary
    summary_ids = model.generate(inputs["input_ids"], max_length=max_words, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary


In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Path to your fine-tuned model
model_path = 'C:/Users/dinon/Desktop/Summary/flan_t5_finetuned_model-20241114T172316Z-001/flan_t5_finetuned_model'  # Update with the correct path

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)


In [11]:
# Test with a long text
sample_long_text = """
as its name suggests rubisco is capable of catalyzing two distinct reactions acting as both a carboxylase and as an oxygenase in the oxygenase reaction of rubisco uses the same substrate rubp but reacts it with o 2 the reaction is catalyzed on the same active site as the carboxylation reaction thus co 2 and o2 are competitive substrates therefore co2 inhibits the oxygenase and o 2 inhibits the carboxylase reaction the oxygenase reaction forms just one molecule of 3pga plus a twocarbon product 2phosphoglycolate which is of no immediate use in the calvin cycle and in higher concentrations it is toxic for the plant it therefore has to be processed in a metabolic pathway called photorespiration the photorespiratory pathway involves enzymes in the chloroplasts peroxisome and mitochondria detail of this pathway is not expected photorespiration is not only energy demanding but furthermore leads to a net loss of co 2 each time rubisco reacts with o2 instead of co2 the plants makes 50 less 3pga than it would have done if co 2 had been used this potentially eliminates the net gain in photosynthetic carbon and loose the productivity these two factors result in an increase in photorespiration relative to photosynthesis so that an increasing proportion of carbon is lost as the temperature rises the co 2 required for photosynthesis enters a leaf via stomata however stomata are also the main avenues of transpiration on a hot dry day most plants close their stomata in order to conserves water at the same time o 2 released from the light reactions begins to increase and this leads to further reduction of co2 to o2 ratio in the cytosol these conditions within the leaf favor a wasteful process photorespiration under high temperature dryness and high light intensities therefore plants developed different way to cope with this situation during the evolution that resulted a most successful solution to concentrate co 2 around rubisco provided by c4 photosynthetic pathway the establishment of c4 photosynthetic pathway includes several biochemical and anatomical modifications that allow plants with this pathway to concentrate co2 at the site of rubisco thereby its oxygenase reaction and the following photorespiration are largely repressed in c4 plants in most c4 plants the co2 concentration mechanism is achieved by a division of labor between two distinct specialized leaf cell types the mesophyll and the bundle sheath cells compared to c3 plants the bundle sheath cells of c4 plants have expanded physiological functions this is reflected by the enlargement and higher organelle content of these cells in c4 species for the efficient function of the c4 pathway a close contact between mesophyll and bundle sheath cells are tightly interconnected to each other by high numbers of plasmodesmata the bundle sheath cells enclose the vascular bundles and are themselves surrounded by the mesophyll cells and this type of leaf anatomy was termed kranz anatomy since rubisco can operate under high co2 concentrations in the bundle sheath cells it works more efficiently than in c3 plants because of the co2 concentration mechanism they can acquire enough co2 even when keeping their stomata more closed and minimize the water loss by transpiration c4 pathway of photosynthesis in the mesophyll cells of c4 plants co2 is converted to bicarbonate by carbonic anhydrase and initially fixed by phosphoenolpyruvate carboxylase using pep as co2 acceptor the resulting oxaloacetate oaa is composed of four carbon atoms which is the basis for the name of this metabolic pathway oxaloacetate is rapidly converted to the more stable c4 acids malate or aspartate that diffuse to the bundle sheath cells here co2 is released by decarboxylating enzymes and the released co2 is refixed by rubisco which exclusively operates in the bundle sheath cells in c4 plants chloroplasts found in mesophyll cells are different in anatomy in comparison to chloroplasts of bundle sheath cells since chloroplasts of mesophyll cells carryout only light reaction they are rich in grana the grana of mesophyll chloroplasts are large and highly differentiated for light reaction bundle sheath chloroplasts possess a very few less differentiated grana or grana are absent moreover that psii in the bundle sheath cells are depleted in order to lower oxygen production in these cells this pep carboxylase enzyme is much more efficient than the enzyme of rubp carboxylase for two reasons 1 it reacts with bicarbonate hco3 rather than with co2 the advantage of this is that there is a 50fold higher concentration of hco3 than co2 in solution in the cytosol 2 it has no affinity for o2 significance of the c4 pathway helps plants to improve the efficiency of co2 fixation at lower co2concentrations by preventing the gateways for photorespiration by spatially separating rubisco in hotdry climate it is essential for the stomata to close to prevent water loss through transpiration this reduces co2 intake of particular plants therefore plants in tropical zones or hot climate may suffer from co2 deficiency at lower co2 concentrations c4 mechanism increases the efficiency of photosynthesis by concentrating co2 in the bundle sheath cells c4 plants exhibit better wateruse efficiency than c3 plants because of the co2 concentration mechanism they can acquire enough co2 even when keeping their stomata more closed thus water loss by transpiration is reduced since rubisco can operate under high co2 concentrations in the bundle sheath cells it woks more efficiently than in c3 plants consequently c4 plants need less of this enzyme this leads to a better nitrogenuse efficiency of c4 compared to c3 plants comparisons of c3 and c4 plants c3 plants such as wheat rice and barley have an optimum temperature for photosynthesis between 1525c in these plants co fixation occurs only once with the co acceptor being a 5carbon compound called rubp the enzyme responsible for fixing co is rubisco which produces a 3carbon compound 3phosphoglycerate 3pga as the first product the leaf anatomy in c3 plants involves mesophyll cells where photosynthesis occurs with nongreen bundle sheath cells c3 plants typically have lower productivity c4 plants including maize sugarcane and grasses thrive in higher temperatures with photosynthesis being 50 more efficient at 35c co fixation occurs twice first in the mesophyll cells and then in the bundle sheath cells the initial co acceptor is a 3carbon compound pep in the mesophyll cells and rubp in the bundle sheath cells pep carboxylase an efficient enzyme fixes co in the mesophyll while rubisco operates efficiently under high co concentrations in the bundle sheath cells the first product of co fixation in c4 plants is a 4carbon acid oxaloacetate oaa c4 plants have kranz anatomy where photosynthesis occurs in both mesophyll and bundle sheath cells they are generally more productive yielding higher outputs than c3 plants factors affecting photosynthesis the rate of photosynthesis is an important factor in crop production rate is affected by various factors eg light intensity co2 concentration temperature water pollutants and inhibitors the photosynthesis involves a series of reactions therefore various factors are involved in it blackman who is the scientist first proposed the idea of principal of limiting factors when a chemical process is affected by more than one factor its rate is limited by the factor which is nearest its minimum value eg light intensity light intensity the rate of photosynthesis increases linearly with increasing light intensity gradually the rate of increase falls off as the other factors become limiting very high light intensities chlorophyll may bleach and slow down photosynthesis however plants exposed to such conditions are usually protected by devices such as thick cuticles hairy leaves under normal conditions co2 is the major limiting factor in photosynthesis increase in photosynthetic rate is achieved by increasing co2 concentration for example some greenhouse crops such as tomatoes are grown in co2 enriched atmosphere
"""


In [12]:
print("Generated Summary for Long Text:")
print(generate_summary_for_long_text(sample_long_text, max_words=150))


Generated Summary for Long Text:
c4 photosynthetic pathway is a biochemical and anatomical modification that allows plants to concentrate co2 at the site of rubisco thereby its oxygenase c4 photosynthetic pathway is a biochemical and anatomical modification that allows plants to concentrate co2 at the site of rubisco thereby its oxygenase c4 photosynthetic pathway is a biochemical and anatomical modification that allows plants to concentrate co2 at the site of rubisco thereby its oxygenase c4 photosynthetic pathway is a biochemical and anatomical modification that allows plants to concentrate co2 at the site of rubisco
