In [3]:
import pandas as pd

In [4]:
# Peek at the notes/retrieval CSV: read it and show which columns it provides
qa_df = pd.read_csv(
    filepath_or_buffer='C:/Users/dinon/Desktop/Summary/biology_information_retrieval_sample.csv',
    encoding='ISO-8859-1',
)
print(qa_df.columns)

Index(['Document ID', 'Topic', 'Sub-topic', 'Text Content', 'Source'], dtype='object')


In [5]:
# Read the notes dataset, then pull each column of interest out as a plain list.
# The three lists come from the same frame, so they share row order.
notes_df = pd.read_csv(
    filepath_or_buffer='C:/Users/dinon/Desktop/Summary/biology_information_retrieval_sample.csv',
    encoding='ISO-8859-1',
)  # Update with the correct file path
notes_content = notes_df['Text Content'].to_list()
notes_topics = notes_df['Topic'].to_list()
notes_subtopics = notes_df['Sub-topic'].to_list()

In [6]:
# Peek at the summarization CSV: read it and show which columns it provides.
# Note that this rebinds qa_df to a different file than the earlier inspection cell.
qa_df = pd.read_csv(
    filepath_or_buffer='C:/Users/dinon/Desktop/Summary/bio_summary_key.csv',
    encoding='ISO-8859-1',
)
print(qa_df.columns)

Index(['longtext', 'summary', 'keywords'], dtype='object')


In [7]:
# Read the summarization dataset and split its three columns into lists.
# The lists come from the same frame, so position i refers to the same row in each.
summary_df = pd.read_csv(
    filepath_or_buffer='C:/Users/dinon/Desktop/Summary/bio_summary_key.csv',
    encoding='ISO-8859-1',
)
long_texts = summary_df['longtext'].to_list()
summaries = summary_df['summary'].to_list()
keywords = summary_df['keywords'].to_list()

In [8]:
pip install sentence-transformers faiss-cpu

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Sentence-embedding model; the same instance is reused later to encode queries
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Encode each corpus separately: summarization long texts, then notes
summary_embeddings = embedder.encode(long_texts)
notes_embeddings = embedder.encode(notes_content)

# Stack texts and vectors in the same order (long texts first, notes second),
# so row i of the embedding matrix belongs to all_content[i]
all_content = [*long_texts, *notes_content]
all_embeddings = np.concatenate((summary_embeddings, notes_embeddings), axis=0)

# Fresh float32 copy of the stacked vectors for FAISS
all_embeddings_array = np.array(all_embeddings, dtype="float32")

# Flat L2 index sized to the embedding width, filled with every vector
common_index = faiss.IndexFlatL2(all_embeddings_array.shape[-1])
common_index.add(all_embeddings_array)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def retrieve_similar_content(query, k=5):
    """Return up to ``k`` indexed entries nearest to ``query``.

    The query is embedded with the module-level ``embedder`` and searched
    against ``common_index``. Index positions below ``len(long_texts)`` map to
    the summarization dataset; the remaining positions map to the notes
    dataset, offset by ``len(long_texts)``.

    Args:
        query: Text to search for.
        k: Number of neighbours requested from the index.

    Returns:
        A list of dicts in the order the index returned them. Summarization
        hits carry the keys ``type`` ('summary'), ``long_text``, ``summary``
        and ``keyword``; note hits carry ``type`` ('note'), ``topic``,
        ``sub_topic`` and ``content``. The list is shorter than ``k`` when the
        index reports fewer than ``k`` valid neighbours.
    """
    # Generate embedding for the query
    query_embedding = embedder.encode([query]).astype("float32")

    # Search the index; distances are not used by callers
    _distances, indices = common_index.search(query_embedding, k)

    n_summary = len(long_texts)
    results = []
    for raw_position in indices[0]:
        position = int(raw_position)

        # FAISS fills missing neighbours with -1 (e.g. k larger than the number
        # of indexed vectors). Without this guard, -1 would pass the
        # "< len(long_texts)" test and silently return the last long text.
        if position < 0:
            continue

        if position < n_summary:  # Entry from summarization dataset
            results.append({
                'type': 'summary',
                'long_text': long_texts[position],
                'summary': summaries[position],
                'keyword': keywords[position]
            })
        else:  # Entry from notes dataset
            note_position = position - n_summary
            results.append({
                'type': 'note',
                'topic': notes_topics[note_position],
                'sub_topic': notes_subtopics[note_position],
                'content': notes_content[note_position]
            })
    return results


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Directory of the fine-tuned seq2seq checkpoint. This is an absolute,
# machine-specific path and must be edited before running elsewhere.
model_path = 'C:/Users/dinon/Desktop/Summary/flan_t5_finetuned_model-20241114T172316Z-001/flan_t5_finetuned_model'  # Update with the correct path

# Load the tokenizer and the model from the same directory; both globals are
# used by generate_summary below.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)


In [12]:
def generate_summary(query, max_words=100):
    """Summarize the indexed content most similar to ``query``.

    Retrieves the 5 nearest entries via ``retrieve_similar_content``, formats
    them into a single prompt, and decodes a beam-search generation from the
    module-level ``model`` / ``tokenizer``.

    Args:
        query: Topic or question to summarize around.
        max_words: Upper bound on the generated length. Despite the name it is
            passed to ``model.generate`` as ``max_length``, which the Hugging
            Face generation API counts in tokens, not words.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Retrieve similar content from the common index
    similar_content = retrieve_similar_content(query, k=5)

    # Build one context section per retrieved item; items of any other type
    # contribute nothing.
    sections = []
    for item in similar_content:
        if item['type'] == 'summary':
            sections.append(f"Keyword: {item['keyword']}\nLong Text: {item['long_text']}\n\n")
        elif item['type'] == 'note':
            sections.append(f"Topic: {item['topic']} - {item['sub_topic']}\nContent: {item['content']}\n\n")
    context = "".join(sections)

    # Generate the prompt with query and context for summarization
    prompt = f"Summarize the following content related to '{query}':\n{context}\nSummary:"

    # NOTE: the prompt is truncated to 512 tokens, so with long retrieved
    # texts the later sections (and the trailing "Summary:" cue) may be cut.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    # Keep the minimum length strictly feasible: a fixed 50 exceeds max_length
    # whenever max_words < 50. For max_words >= 100 this is still 50.
    min_length = min(50, max(0, max_words // 2))

    # Generate the summary
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_words,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [13]:
# Example query; passed to generate_summary in the cell below as a smoke test
# of the retrieval + summarization pipeline.
sample_query = "Importance of biodiversity"

In [15]:
# Header line first, then the summary generated for the sample query
# (length capped via max_words=150).
print("Generated Summary:")
print(generate_summary(query=sample_query, max_words=150))

Generated Summary:
Biodiversity is the diversity of living organisms from all sources including terrestrial, marine, and other aquatic ecosystems and their ecological interactions with the environment. Genetic diversity is the genetic variation that exists both within and among species. b. Species diversity is the variation that can be recognized among different species. c. Ecosystem diversity is the variety of habitats, living communities and ecological processes in the living world. The importance and values of biodiversity The individual components of biodiversitygenes, species, and ecosystems provide the human society with a wide array of goods and services.
