RAG using Cohere

In [1]:
!pip install cohere




In [2]:
!pip install PyMuPDF

import fitz  # PyMuPDF



In [3]:
import os

In [3]:
# load stuff like api keys
from helpers import Dict
from taml import taml
# load stuff like api keys
env = Dict(taml.load('env.taml'))

# Approach-2 Tiered Knowledge space

Obtain Meta Data from Zotero JSON file and Obtain Publication Year

In [4]:
def extract_year(issued):
    """Extracts only the year from Zotero's issued date format."""
    if "date-parts" in issued and isinstance(issued["date-parts"], list):
        date_parts = issued["date-parts"][0]  # Extract first date-parts entry
        if len(date_parts) >= 1:
            return str(date_parts[0])  # Return only the year
    return "Unknown Year"

In [5]:
# get meta data from json file of Zotero

import json

# Load Zotero JSON export file
with open("Research Papers.json", "r", encoding="utf-8") as f:
    zotero_data = json.load(f)

# Extract relevant metadata fields
zotero_metadata = []
for item in zotero_data:
        metadata = {
            "title": item.get("title", "Unknown Title"),
            "abstract": item.get("abstract", "Unknown Abstract"),
            "DOI": item.get("DOI", "Unknown Source"),
            "year": extract_year(item.get("issued", {})),
        }
        zotero_metadata.append(metadata)    

print(f"Extracted {len(zotero_metadata)} journal articles from Zotero.")


Extracted 24 journal articles from Zotero.


### Scoring profile with ReRank Relevance, Citations and year of publication

Retrieved Contexts are ranked using scoring profile where Rerank Relevance is 70 % weightage. 20 % to number of citation and 10 percent for the year of publication

In [None]:
# obtain number of citations for a research paper
import re
import requests

def extract_arxiv_id(arxiv_number):
    """Extracts the numeric ArXiv ID from Zotero metadata."""
    match = re.search(r"arXiv:(\d+\.\d+)", arxiv_number)
    return match.group(1) if match else None

def get_citation_count(doi):
    """Fetches citation count using OpenCitations API if DOI is available."""
    if not doi:
        return 0  # If no DOI, return 0
    base_url = f"https://opencitations.net/index/coci/api/v1/citations/{doi}"
    response = requests.get(base_url)
    if response.status_code == 200:
        return len(response.json())  # Number of citing papers
    return 0

def get_arxiv_citations(arxiv_number):
    """Fetches citation count for an ArXiv paper using Semantic Scholar API."""
    arxiv_id = extract_arxiv_id(arxiv_number)
    if not arxiv_id:
        return 0  # Return 0 if extraction fails

    base_url = f"https://api.semanticscholar.org/v1/paper/arXiv:{arxiv_id}"
    response = requests.get(base_url)
    
    if response.status_code == 200:
        return response.json().get("citationCount", 0)  # Extract citation count
    return 0  # Return 0 if API fails


def enrich_paper_with_citations(paper):
    """Adds citation count to a paper by checking DOI or ArXiv ID."""
    if 'number' in paper:
        paper['arxiv_number'] = extract_arxiv_id(paper.get('number', ''))
        paper['citations'] = get_arxiv_citations(paper['arxiv_number'])
    elif "DOI" in paper:
        paper["citations"] = get_citation_count(paper["DOI"])
    else:
        paper["citations"] = 0  # Default if no identifier found
    return paper

# Apply citation retrieval to all papers in Zotero metadata
for paper in zotero_metadata:
    enrich_paper_with_citations(paper)



print(" Citation counts added to research papers.")


 Citation counts added to research papers.


Setup of rerank function for retreival of research papers

In [7]:
import cohere
# Rerank papers wrt user query 
def rerank_with_cohere(query, papers, top_n=20):
    """Ranks research papers using Cohere Rerank."""
    rerank_inputs = [f"Title: {paper['title']} Abstract: {paper['abstract']}" for paper in papers]
    rerank_response = co.rerank(model="rerank-english-v2.0", query=query, documents=rerank_inputs, top_n=top_n)
    reranked_papers = [papers[result.index] for result in rerank_response.results]
    
    # Attach rerank scores to papers
    for i, paper in enumerate(reranked_papers):
        paper["rerank_score"] = rerank_response.results[i].relevance_score
    
    return reranked_papers

In [None]:
# Funciton for research paper scoring profile 
import datetime

# Initialize Cohere API
co = cohere.Client(env.cohere_key_trial)

def apply_scoring_profile(papers, weight_rerank=0.7, weight_citations=0.2, weight_year=0.1):
    """Applies a scoring profile to research papers."""
    current_year = datetime.datetime.now().year  # Get the current year
    scored_papers = []
    for paper in papers:
        citations = int(paper.get("citations", 0))
        rerank_score = float(paper.get("rerank_score", 0))
        year = int(paper.get("year", 2000))  # Default to 2000 if missing

        # Normalize Year Score (latest year = higher score)
        year_score = (year - 2000) / (current_year - 2000)  # Normalized between 0 and 1

        # Normalize scores
        citation_score = min(citations / 1000, 1)  # Normalize citations (cap at 1)

        # Final Score
        final_score = (rerank_score * weight_rerank) + (citation_score * weight_citations) + (year_score * weight_year)

        # Store score
        scored_papers.append((paper, final_score))

    # Sort by highest score
    scored_papers.sort(key=lambda x: x[1], reverse=True)

    return [paper for paper, score in scored_papers]

def execute_ranking_pipeline(query, papers, top_n=3):
    """Executes full research paper ranking pipeline."""
    
    # Step 1: Rerank using Cohere
    reranked_papers = rerank_with_cohere(query, papers, top_n=5)
    # print(reranked_papers)
    
    # # Step 2: Fetch citations (DOI & ArXiv support)
    # for paper in reranked_papers:
    #     if "DOI" in paper:
    #         paper["citations"] = get_citation_count(paper["DOI"])
    #     elif "arxiv_id" in paper:
    #         paper["citations"] = get_arxiv_citations(paper["arxiv_id"])
    #     else:
    #         paper["citations"] = 0  # Default if no identifier

    # Step 3: Apply scoring profile (Relevance 0.7, Citations 0.2, Publicatio Year 0.1)
    final_ranked_papers = apply_scoring_profile(reranked_papers)

    # Return Top N Papers
    return final_ranked_papers[:top_n]



In [62]:
# Example Usage
query = "what is the future in self driving lab"
top_papers = execute_ranking_pipeline(query, zotero_metadata, top_n=3)

# Print final ranked results
for paper in top_papers:
    print(f" {paper['title']} (Year: {paper.get('year', 'Unknown')}, Citations: {paper['citations']})")


 The future of self-driving laboratories from human in the loop interactive AI to gamification (Year: 2024, Citations: 0)
 Navigation maps of the material space for automated self driving labs of the future (Year: 2024, Citations: 0)
 Autonomous Chemical Experiments Challenges and Perspectives on Establishing a Self Driving Lab (Year: 2022, Citations: 37)


Phase 2 Here we get the relevant context form the reranked research paper

#Create function Chunks from PDF files

In [52]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract full text from a PDF file."""
    doc = fitz.open(pdf_path)
    full_text = ""
    
    for page in doc:
        full_text += page.get_text("text") + "\n"
    
    return full_text

def chunk_text(text, chunk_size=1500, overlap=150):
    """Chunk text into fixed-length segments with overlap."""
    words = text.split()  # Tokenizing by words (for simplicity)
    chunks = []
    
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap  # Shift start position with overlap

    return chunks

# Example usage
pdf_path = r"C:\Users\Chayan\OneDrive - University of Toronto\Desktop\Winter 25\M.Eng Project\Cohere Approach\pdfs\1-s2.0-S2590238524001954-main.pdf"
full_text = extract_text_from_pdf(pdf_path)
chunks = chunk_text(full_text, chunk_size=1500, overlap=150)

# Print the first 2 chunks for preview
for i, chunk in enumerate(chunks[:2]):
    print(f"=== Chunk {i+1} ===\n{chunk[:1500]}\n")  # Show first 500 characters


=== Chunk 1 ===
Article ChemOS 2.0: An orchestration architecture for chemical self-driving laboratories ChemOS 2.0 is a comprehensive laboratory architecture for transforming the modern chemistry lab into one that accelerates the pace of chemical research. This new kind of laboratory, known as a self-driving lab (SDL), uses automated experimental tools, as well as computational experiment planners, to create fully automated workﬂows that require minimal human intervention. ChemOS 2.0 presents a modular and versatile approach to building one’s own SDL, including real-life implementations of this framework. Malcolm Sim, Mohammad Ghazi Vakili, Felix Strieth-Kalthoff, ..., Santiago Miret, Sergio Pablo-Garcı´a, Ala´n Aspuru-Guzik spgarcica@gmail.com (S.P.-G.) alan@aspuru.com (A.A.-G.) Highlights A modular strategy for building a self-driving lab for chemical research Demonstrative workﬂows based on real-world research in materials discovery High- and low-level implementation of laboratory 

In [53]:
pdf_folder= r"C:\Users\Chayan\OneDrive - University of Toronto\Desktop\Winter 25\M.Eng Project\Cohere Approach\pdfs\Final Papers"

for paper in top_papers:
    pdf_path = os.path.join(pdf_folder, f"{paper['title']}.pdf")  # Locate PDF by title

    if os.path.exists(pdf_path):
        full_text = extract_text_from_pdf(pdf_path)  # Extract full text
        chunks = chunk_text(full_text, chunk_size=1500, overlap=150)  # Chunk the text

        # Store chunks in paper dictionary
        paper["chunks"] = chunks
    else:
        print(f"PDF not found: {pdf_path}")


In [55]:

def rerank_chunks(query, chunks, top_n=3):
    """Ranks research papers using Cohere Rerank."""
    rerank_response = co.rerank(model="rerank-english-v2.0", query=query, documents=chunks, top_n=top_n)
    reranked_papers = [chunks[result.index] for result in rerank_response.results]
    
    # Attach rerank scores to papers
    for i, paper in enumerate(reranked_papers):
        paper["rerank_score"] = rerank_response.results[i].relevance_score
    
    return reranked_papers




In [40]:
import os

def execute_full_pipeline(query, zotero_metadata, pdf_folder, top_n_papers=3, chunk_size=1500, overlap=150, top_n_chunks=5):
    """
    Execute Phase 1 (Retrieve top papers) and Phase 2 (Extract, chunk, and rerank text).
    
    Args:
        query (str): The query for retrieving relevant papers.
        zotero_metadata (dict): Metadata from Zotero containing paper titles and other details.
        pdf_folder (str): The folder containing the research PDFs.
        top_n_papers (int): Number of top-ranked papers to retrieve.
        chunk_size (int): Size of text chunks.
        overlap (int): Overlap between chunks.
        top_n_chunks (int): Number of top-ranked chunks to return.

    Returns:
        list: Top-ranked chunks after reranking.
    """
    
    # **Phase 1: Retrieve Top Papers**
    top_papers = execute_ranking_pipeline(query, zotero_metadata, top_n=top_n_papers)

    # **Phase 2: Extract and Chunk Text**
    for paper in top_papers:
        pdf_path = os.path.join(pdf_folder, f"{paper['title']}.pdf")  # Locate PDF by title
        
        if os.path.exists(pdf_path):
            full_text = extract_text_from_pdf(pdf_path)  # Extract text from PDF
            chunks = chunk_text(full_text, chunk_size=chunk_size, overlap=overlap)  # Chunking
            
            # Store chunks in paper dictionary
            paper["chunks"] = chunks
        else:
            print(f"PDF not found: {pdf_path}")
            paper["chunks"] = []  # Assign empty list if file is missing

    # **Prepare Chunks for Reranking**
    all_chunks = [{"title": paper["title"], "text": chunk} for paper in top_papers for chunk in paper["chunks"]]

    # **Rerank Chunks**
    ranked_chunks = rerank_chunks(query, all_chunks, top_n=top_n_chunks)

    return ranked_chunks


In [41]:
query = "What are the challenges of self-driving labs?"
pdf_folder= r"C:\Users\Chayan\OneDrive - University of Toronto\Desktop\Winter 25\M.Eng Project\Cohere Approach\pdfs\Final Papers"

# Execute full pipeline (Phase 1 + Phase 2)
ranked_chunks = execute_full_pipeline(query, zotero_metadata, pdf_folder)

# Print the top-ranked chunks
for i, chunk in enumerate(ranked_chunks):
    print(f"=== Chunk {i+1} from {chunk['title']} ===\n{chunk['text'][:2500]}...\n")

=== Chunk 1 from Autonomous Chemical Experiments Challenges and Perspectives on Establishing a Self Driving Lab ===
mixing with other reagents, or even changing the solvent via evaporation and redissolution. Automated experiments where the instruments are linked by stepwise robotic sample transport28,39 require less process optimization for each measurement as the intermediates can simply be purified, and samples can be prepared between successive experiments. In contrast, in-line processes must be optimized so that the initial sample is suitable for all subsequent experiments. Nevertheless, properly optimized in- line processes have the potential to lead to substantial time savings. The advantage of in-line “discovery mode” synthesis is that only very small amounts of material are required for full characterization, which makes it possible to run reactions on a much smaller scale and results in more efficient utilization of both money and materials. However, an important drawback of t

In [None]:
query = "What is  self-driving labs?"
pdf_folder= r"C:\Users\Chayan\OneDrive - University of Toronto\Desktop\Winter 25\M.Eng Project\Cohere Approach\pdfs\Final Papers"

# Execute Retrieval full pipeline (Phase 1 + Phase 2)
ranked_chunks = execute_full_pipeline(query, zotero_metadata, pdf_folder)

# Print the top-ranked chunks
for i, chunk in enumerate(ranked_chunks):
    print(f"=== Chunk {i+1} from {chunk['title']} ===\n{chunk['text'][:2500]}...\n")

=== Chunk 1 from Research Trend Analysis in the Field of Self Driving Lab Using Network Analysis and Topic Modeling ===
Article Not peer-reviewed version Research Trend Analysis in the Field of Self-Driving Lab Using Network Analysis and Topic Modeling Woojun Jung , Insung Hwang , Keuntae Cho * Posted Date: 6 December 2024 doi: 10.20944/preprints202412.0512.v1 Keywords: Self-Driving Lab; SDL; Trend analysis; Network analysis; Topic modeling; Sustainability Preprints.org is a free multidisciplinary platform providing preprint service that is dedicated to making early versions of research outputs permanently available and citable. Preprints posted at Preprints.org appear in Web of Science, Crossref, Google Scholar, Scilit, Europe PMC. Copyright: This open access article is published under a Creative Commons CC BY 4.0 license, which permit the free download, distribution, and reuse, provided that the author and preprint are cited in any reuse. Article Research Trend Analysis in the Field 

## Generating Responses from the retrieved context chunks

In [None]:
import cohere  # Cohere API client

# Initialize Cohere client (replace with your API key)
co = cohere.Client(api_key=env.cohere_key)

def response_to_chunks(query, ranked_chunks, model="command-r-plus-08-2024"):
    """
    Generates an LLM response based on retrieved and reranked research chunks.

    Args:
        query (str): The user query.
        ranked_chunks (list): List of top-ranked text chunks.
        model (str): Cohere model name for chat generation.

    Returns:
        tuple: (Generated response text, citations)
    """
    # Execute Retrieval full pipeline (Phase 1 + Phase 2)
    ranked_chunks = execute_full_pipeline(query, zotero_metadata, pdf_folder)
    
    # **Fix: Ensure `data` is a String (Not a Dictionary)**
    documents = [{"data": f"{chunk['title']}: {chunk['text']}"} for chunk in ranked_chunks]

    # **System Prompt for Context Awareness**
    system_prompt = (
        "You are an AI research assistant specializing in self-driving labs and automation. "
        "Use the provided research excerpts to answer queries accurately. "
        "Avoid speculation—if an answer is not found in the documents, state it explicitly."
    )

    # **Format the Query**
    user_message = (
        f"Format your response as follows:\n"
        "**Query:** {query}\n"
        "- **Summary:** (Concise response)\n"
        "- **Key points:** (List the most important points)\n"
        "- **Sources:** (List titles of the relevant research excerpts)"
        
    )

    # **Call Cohere's `chat()` API Correctly**
    response = co.chat(
        model=model,
        message=user_message,  # Fix: Using `message` instead of `messages`
        documents=documents,  # Pass the context as `documents`
        preamble=system_prompt  # Fix: Use `preamble` for system instructions
    )

    return response.text, response.citations  # Extract response text & citations


In [20]:
query = "What are the challenges of self-driving labs?"

# Execute full pipeline (Phase 1 + Phase 2 + Phase 3)
final_response, citation = response_to_chunks(query, ranked_chunks)

# Print the response from the model
print("=== FINAL RESPONSE FROM LLM ===")
print(final_response)


document id=doc_0 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_1 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_2 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_3 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_4 is too long and may provide bad results, please chunk your documents to 300 words or less


=== FINAL RESPONSE FROM LLM ===
**Query:** What are the key challenges in establishing a self-driving lab?
- **Summary:** Self-driving labs are a rapidly growing field that offers incredible potential in improving the rate and scope of research in chemistry and materials science. However, there are several challenges in establishing a self-driving lab, which can be split into cognitive and motor function.
- **Key Challenges:**
  - **Cognitive Challenges:**
    - Replacing human cognitive processes with ML algorithms in the "real world" of chemistry.
    - Encountering unexpected or difficult-to-predict results.
    - Automating instruments designed for human use.
    - Optimization with constraints or unexpected outcomes.
    - Software control and integration, as few instrument manufacturers design their products with self-driving labs in mind.
  - **Motor Function Challenges:**
    - Replacing or replicating certain actions that are easy for humans due to their fine motor skills and 

In [60]:
query = "What are the research trends in the field of self driving labs"

# Execute full pipeline (Phase 1 + Phase 2 + Phase 3)
final_response, citation = response_to_chunks(query, ranked_chunks)

# Print the response from the model
print("=== FINAL RESPONSE FROM LLM ===")
print(final_response)

document id=doc_0 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_1 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_2 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_3 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_4 is too long and may provide bad results, please chunk your documents to 300 words or less


=== FINAL RESPONSE FROM LLM ===
**Query:** What are the key findings of the research trend analysis in the field of self-driving labs?
- **Summary:** The research trend analysis in the field of self-driving labs (SDLs) reveals three key findings: a surge in SDL research since 2019, the identification of influential researchers as central figures in the network, and the interdisciplinary convergence of SDL research.
- **Key points:**
   - SDL research has experienced a significant increase since 2019, driven by the pandemic and advancements in AI technologies.
   - Several influential researchers have been identified as pivotal contributors to the network, playing essential roles in collaboration and information dissemination.
   - SDL research exhibits interdisciplinary convergence, encompassing areas such as material optimization, biological processes, and AI predictive algorithms.
- **Sources:**
   - Research Trend Analysis in the Field of Self Driving Lab Using Network Analysis and 

In [63]:
query = "What is the future  of self driving labs"

# Execute full pipeline (Phase 1 + Phase 2 + Phase 3)
final_response, citation = response_to_chunks(query, ranked_chunks)

# Print the response from the model
print("=== FINAL RESPONSE FROM LLM ===")
print(final_response)

document id=doc_0 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_1 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_2 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_3 is too long and may provide bad results, please chunk your documents to 300 words or less, document id=doc_4 is too long and may provide bad results, please chunk your documents to 300 words or less


=== FINAL RESPONSE FROM LLM ===
**Query:** What are the key findings of the research trend analysis in the field of self-driving labs?
- **Summary:** The research trend analysis in the field of self-driving labs (SDLs) reveals three key findings: a surge in SDL research since 2019, the identification of influential researchers as central figures in the network, and the interdisciplinary convergence of SDL research.
- **Key points:**
   - SDL research has experienced a significant increase since 2019, driven by the pandemic and advancements in AI technologies.
   - Several influential researchers have been identified as pivotal contributors, playing essential roles in collaboration and information dissemination.
   - SDL research demonstrates interdisciplinary convergence, encompassing areas such as material optimization, biological processes, and AI predictive algorithms.
- **Sources:**
   - Research Trend Analysis in the Field of Self Driving Lab Using Network Analysis and Topic Model