In [1]:
pip install tavily-python

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install requests pandas matplotlib nltk scikit-learn transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install streamlit plotly matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Search topic for this notebook; it is passed to fetch_and_extract() as the query.
# Edit this string to research a different subject.
prompt = "Advancements in medical robotics"

In [8]:
import asyncio
import nest_asyncio
from tavily import AsyncTavilyClient
import pandas as pd
import re
from dateutil import parser
from datetime import datetime
from typing import List

# Allow nested event loops (Required for Jupyter Notebooks or IPython environments)
nest_asyncio.apply()

# Initialize the AsyncTavilyClient.
# The API key is read from the environment instead of being hard-coded, so it is
# never saved in the notebook, its outputs, or version control. Set TAVILY_API_KEY
# before starting Jupyter (e.g. `export TAVILY_API_KEY=...`).
# NOTE(review): a key was previously committed in this cell; treat it as
# compromised and rotate it.
import os

_tavily_api_key = os.environ.get("TAVILY_API_KEY")
if not _tavily_api_key:
    raise RuntimeError(
        "TAVILY_API_KEY is not set. Export it in the environment that launches "
        "the notebook kernel before running this cell."
    )
tavily_client = AsyncTavilyClient(api_key=_tavily_api_key)

def extract_date(text: str) -> str:
    """
    Extracts a valid date from the provided text using regex and dateutil.

    Patterns are tried in priority order, from the most specific (full
    day/month/year forms) to the least specific (a bare four-digit year).
    Within one pattern, matches are tried in the order they appear in the text.
    The first candidate that parses and is not in the future wins.

    Components missing from a match are filled in deterministically from
    January 1st of the current year (so "March 2023" -> 2023-03-01 and
    "2023" -> 2023-01-01). Without an explicit default, dateutil borrows the
    missing day/month from *today*, which makes the result depend on the day
    the notebook is run and rejects e.g. "April 2023" when run on the 31st.

    Args:
        text: Free text to scan. None, an empty string, or the placeholder
            "N/A" all mean "no text".

    Returns:
        The date as an ISO-8601 string (YYYY-MM-DD), or "N/A" if no usable
        date is found.
    """
    if not text or text == "N/A":
        return "N/A"

    date_patterns = [
        # Full dates first.
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}',
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}',
        r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}',
        r'\b\d{4}-\d{2}-\d{2}',
        r'\b\d{1,2}/\d{1,2}/\d{2,4}',
        # Partial dates that still carry a year.
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}',
        r'\bQ[1-4]\s+\d{4}',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}',
        r'\b\d{1,2}-\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec),?\s+\d{4}',
        # Day + month without a year: the year falls back to the current one.
        r'\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)',
        # Last resort: a standalone, plausible year. Bounded on both sides and
        # restricted to 19xx/20xx so fragments of larger numbers ("12345") or
        # arbitrary quantities ("1234 patients") are not mistaken for years.
        r'\b(?:19|20)\d{2}\b',
    ]

    current_date = datetime.now().date()
    # Fixed filler for components the matched text does not specify.
    parse_default = datetime(current_date.year, 1, 1)

    for pattern in date_patterns:
        for match in re.findall(pattern, text):
            try:
                date = parser.parse(match, fuzzy=True, default=parse_default).date()
            except (ValueError, OverflowError):
                # Looked like a date but is not one (e.g. "13/45/2020"); try the next candidate.
                continue
            if date <= current_date:
                return date.isoformat()

    return "N/A"

def extract_publisher(content: str, blurb: str) -> str:
    """
    Extracts the publisher name from content or blurb using keyword matching and regex.

    Lookup order (first hit wins):
      1. An explicit attribution line: "Published by <name>" or "By <name>".
      2. A known publisher name appearing anywhere in the text.
      3. A generic label such as "Source: <name>" or "From - <name>".

    Matching is case-insensitive, but the returned name keeps the casing found
    in the text (or the canonical spelling for known publishers).

    Args:
        content: Main text to search. None, "" and the placeholder "N/A" are ignored.
        blurb: Secondary text to search, with the same placeholder handling.

    Returns:
        The publisher name, or "N/A" when nothing matches.
    """
    # Drop missing inputs so placeholders ("N/A", None) are never searched as text.
    sources = [part for part in (content, blurb) if part and part != "N/A"]
    if not sources:
        return "N/A"

    text_to_search = "\n".join(sources)

    # Attribution phrases; group 1 is the remainder of the line.
    # \b keeps "By" from matching inside words such as "nearby".
    attribution_patterns = [
        r'\bPublished by ([^\n]+)',
        r'\bBy ([^\n]+)',
    ]
    # Literal names that identify the publisher by their mere presence.
    known_publishers = [
        'Forbes Councils Member',
        'PubMed',
        'World Economic Forum',
        'PMC',
        'DynamXMedical',
        'Archives',
    ]
    # Generic "<label>[:|-|–] <name>" form, tried last because it is the loosest.
    label_pattern = r'\b(?:Source|Publisher|Published by|By|From)\b\s*[:\-–]?\s*([^\n]+)'

    for pattern in attribution_patterns:
        match = re.search(pattern, text_to_search, re.IGNORECASE)
        if match:
            publisher = match.group(1).strip()
            if publisher:
                return publisher

    for name in known_publishers:
        # These entries have no capture group, so return the canonical name
        # rather than calling match.group(1), which would raise IndexError.
        if re.search(r'\b' + re.escape(name) + r'\b', text_to_search, re.IGNORECASE):
            return name

    match = re.search(label_pattern, text_to_search, re.IGNORECASE)
    if match:
        publisher = match.group(1).strip()
        if publisher:
            return publisher

    return "N/A"

def extract_blurb(content: str, max_length: int = 250) -> str:
    """
    Builds a short preview of ``content`` from its first three sentences.

    Sentences are split after '.', '!' or '?' followed by one or more spaces.
    If the joined preview is longer than ``max_length`` characters it is cut
    to that length and "..." is appended.

    Returns "N/A" when ``content`` is empty/None or is itself "N/A".
    """
    if not content:
        return "N/A"
    if content == "N/A":
        return "N/A"

    trimmed = content.strip()
    leading_sentences = re.split(r'(?<=[.!?]) +', trimmed)[:3]
    summary = ' '.join(leading_sentences)

    if len(summary) <= max_length:
        return summary
    return summary[:max_length] + "..."

async def fetch_and_extract(prompt: str):
    """
    Searches Tavily for ``prompt`` and returns the relevant hits as a DataFrame.

    Only results with a score above 0.5 and a URL not seen before are kept.
    For each kept result a blurb, a publisher and a publication date are
    derived from the raw page content. A publication date supplied by the API
    takes precedence over one extracted from the text, provided it parses and
    is not in the future. Either way the "Published Date" column holds an
    ISO-8601 date string (YYYY-MM-DD) or "N/A".

    Side effects: sets pandas' ``display.max_colwidth`` option to 500 and
    prints the result count plus the first ten rows.

    Args:
        prompt: The search query.

    Returns:
        A DataFrame with the columns Title, URL, Content, Score,
        Published Date, Publisher and Blurb (empty if nothing qualifies).
    """
    queries = [
        {"query": prompt, "search_depth": "advanced", "max_results": 10, "time_range": "year"}
    ]

    responses = await asyncio.gather(*[
        tavily_client.search(**q, include_raw_content=True, extract_depth="advanced") for q in queries
    ])

    today = datetime.now().date()
    results_data = []
    seen_urls = set()

    for response in responses:
        for result in response.get('results', []):
            # `.get(key) or fallback` is used throughout instead of
            # `.get(key, fallback)`: a key that is present with a null value
            # would otherwise bypass the fallback and surface as None.
            score = result.get('score') or 0
            if not (score > 0.5):
                continue

            url = result.get("url") or "N/A"
            if url == "N/A" or url in seen_urls:
                continue
            seen_urls.add(url)

            raw_content = result.get("raw_content") or "N/A"
            blurb = extract_blurb(raw_content)
            extracted_date = extract_date(raw_content)

            # Prefer the API's date, normalised to the same ISO format as the
            # text-extracted fallback so the column is uniform.
            api_published_date = "N/A"
            raw_api_date = result.get("published_date")
            if raw_api_date and raw_api_date != "N/A":
                try:
                    api_date_obj = parser.parse(raw_api_date).date()
                except (ValueError, OverflowError, TypeError):
                    # Unparseable or non-string value: fall back to the extracted date.
                    pass
                else:
                    if api_date_obj <= today:
                        api_published_date = api_date_obj.isoformat()

            final_published_date = api_published_date if api_published_date != "N/A" else extracted_date

            publisher = extract_publisher(raw_content, blurb)

            results_data.append({
                "Title": result.get("title") or "N/A",
                "URL": url,
                "Content": result.get("content") or "N/A",
                "Score": score,
                "Published Date": final_published_date,
                "Publisher": publisher,
                "Blurb": blurb,
            })

    df = pd.DataFrame(results_data)
    pd.set_option('display.max_colwidth', 500)

    print(f"Total Unique Results Found: {len(df)}")
    print(df.head(10))
    return df

# Run the search for the notebook-level `prompt` and keep the resulting DataFrame in `df`.
# The bare top-level `await` relies on the notebook kernel allowing await at cell level;
# in a plain Python script this call would need asyncio.run(...) instead.
df = await fetch_and_extract(prompt)

Total Unique Results Found: 10
                                                                      Title  \
0  Advancements in Robotic Surgery: A Comprehensive Overview of Current ...   
1         Revolutionizing healthcare and medicine: The impact of modern ...   
2                      The role of robotics in modern surgery - HighSurgery   
3                  Innovations in Medical Robotics: Transforming Healthcare   
4     The Future Of Surgical Robotics: Innovations And Predictions - Forbes   
5   Advancements in robotic surgery: innovations, challenges and future ...   
6                          5 ways that robotics are transforming healthcare   
7  Editorial: Translational research in medical robotics—challenges and ...   
8     Latest Medical Technology Trends and Breakthrough Innovations in 2025   
9           Advances in Robotic Surgery: A Review of New Surgical Platforms   

                                                                                                   

In [9]:
# See app.py for the visualization; run it from the Anaconda Prompt.

In [None]:
import asyncio
import nest_asyncio
from tavily import AsyncTavilyClient
import pandas as pd
import re
from dateutil import parser
from datetime import datetime
from typing import List

# Allow nested event loops (Required for Jupyter Notebooks or IPython environments)
nest_asyncio.apply()

# Initialize the AsyncTavilyClient.
# The API key is read from the environment instead of being hard-coded, so it is
# never saved in the notebook, its outputs, or version control. Set TAVILY_API_KEY
# before starting Jupyter (e.g. `export TAVILY_API_KEY=...`).
# NOTE(review): a key was previously committed in this cell; treat it as
# compromised and rotate it.
import os

_tavily_api_key = os.environ.get("TAVILY_API_KEY")
if not _tavily_api_key:
    raise RuntimeError(
        "TAVILY_API_KEY is not set. Export it in the environment that launches "
        "the notebook kernel before running this cell."
    )
tavily_client = AsyncTavilyClient(api_key=_tavily_api_key)

def extract_date(text: str) -> str:
    """
    Extracts a valid date from the provided text using regex and dateutil.

    Patterns are tried in priority order, from the most specific (full
    day/month/year forms) to the least specific (a bare four-digit year).
    Within one pattern, matches are tried in the order they appear in the text.
    The first candidate that parses and is not in the future wins.

    Components missing from a match are filled in deterministically from
    January 1st of the current year (so "March 2023" -> 2023-03-01 and
    "2023" -> 2023-01-01). Without an explicit default, dateutil borrows the
    missing day/month from *today*, which makes the result depend on the day
    the notebook is run and rejects e.g. "April 2023" when run on the 31st.

    Args:
        text: Free text to scan. None, an empty string, or the placeholder
            "N/A" all mean "no text".

    Returns:
        The date as an ISO-8601 string (YYYY-MM-DD), or "N/A" if no usable
        date is found.
    """
    if not text or text == "N/A":
        return "N/A"

    date_patterns = [
        # Full dates first.
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}',
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}',
        r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}',
        r'\b\d{4}-\d{2}-\d{2}',
        r'\b\d{1,2}/\d{1,2}/\d{2,4}',
        # Partial dates that still carry a year.
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}',
        r'\bQ[1-4]\s+\d{4}',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{4}',
        r'\b\d{1,2}-\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec),?\s+\d{4}',
        # Day + month without a year: the year falls back to the current one.
        r'\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)',
        # Last resort: a standalone, plausible year. Bounded on both sides and
        # restricted to 19xx/20xx so fragments of larger numbers ("12345") or
        # arbitrary quantities ("1234 patients") are not mistaken for years.
        r'\b(?:19|20)\d{2}\b',
    ]

    current_date = datetime.now().date()
    # Fixed filler for components the matched text does not specify.
    parse_default = datetime(current_date.year, 1, 1)

    for pattern in date_patterns:
        for match in re.findall(pattern, text):
            try:
                date = parser.parse(match, fuzzy=True, default=parse_default).date()
            except (ValueError, OverflowError):
                # Looked like a date but is not one (e.g. "13/45/2020"); try the next candidate.
                continue
            if date <= current_date:
                return date.isoformat()

    return "N/A"

def extract_publisher(content: str, blurb: str) -> str:
    """
    Extracts the publisher name from content or blurb using keyword matching and regex.

    Lookup order (first hit wins):
      1. An explicit attribution line: "Published by <name>" or "By <name>".
      2. A known publisher name appearing anywhere in the text.
      3. A generic label such as "Source: <name>" or "From - <name>".

    Matching is case-insensitive, but the returned name keeps the casing found
    in the text (or the canonical spelling for known publishers).

    Args:
        content: Main text to search. None, "" and the placeholder "N/A" are ignored.
        blurb: Secondary text to search, with the same placeholder handling.

    Returns:
        The publisher name, or "N/A" when nothing matches.
    """
    # Drop missing inputs so placeholders ("N/A", None) are never searched as text.
    sources = [part for part in (content, blurb) if part and part != "N/A"]
    if not sources:
        return "N/A"

    text_to_search = "\n".join(sources)

    # Attribution phrases; group 1 is the remainder of the line.
    # \b keeps "By" from matching inside words such as "nearby".
    attribution_patterns = [
        r'\bPublished by ([^\n]+)',
        r'\bBy ([^\n]+)',
    ]
    # Literal names that identify the publisher by their mere presence.
    known_publishers = [
        'Forbes Councils Member',
        'PubMed',
        'World Economic Forum',
        'PMC',
        'DynamXMedical',
        'Archives',
    ]
    # Generic "<label>[:|-|–] <name>" form, tried last because it is the loosest.
    label_pattern = r'\b(?:Source|Publisher|Published by|By|From)\b\s*[:\-–]?\s*([^\n]+)'

    for pattern in attribution_patterns:
        match = re.search(pattern, text_to_search, re.IGNORECASE)
        if match:
            publisher = match.group(1).strip()
            if publisher:
                return publisher

    for name in known_publishers:
        # These entries have no capture group, so return the canonical name
        # rather than calling match.group(1), which would raise IndexError.
        if re.search(r'\b' + re.escape(name) + r'\b', text_to_search, re.IGNORECASE):
            return name

    match = re.search(label_pattern, text_to_search, re.IGNORECASE)
    if match:
        publisher = match.group(1).strip()
        if publisher:
            return publisher

    return "N/A"

def extract_blurb(content: str, max_length: int = 250) -> str:
    """
    Builds a short preview of ``content`` from its first three sentences.

    Sentences are split after '.', '!' or '?' followed by one or more spaces.
    If the joined preview is longer than ``max_length`` characters it is cut
    to that length and "..." is appended.

    Returns "N/A" when ``content`` is empty/None or is itself "N/A".
    """
    if not content:
        return "N/A"
    if content == "N/A":
        return "N/A"

    trimmed = content.strip()
    leading_sentences = re.split(r'(?<=[.!?]) +', trimmed)[:3]
    summary = ' '.join(leading_sentences)

    if len(summary) <= max_length:
        return summary
    return summary[:max_length] + "..."

async def fetch_and_extract(prompt: str):
    """
    Searches Tavily for ``prompt`` and returns the relevant hits as a DataFrame.

    Only results with a score above 0.5 and a URL not seen before are kept.
    For each kept result a blurb, a publisher and a publication date are
    derived from the raw page content. A publication date supplied by the API
    takes precedence over one extracted from the text, provided it parses and
    is not in the future. Either way the "Published Date" column holds an
    ISO-8601 date string (YYYY-MM-DD) or "N/A".

    Side effects: sets pandas' ``display.max_colwidth`` option to 500 and
    prints the result count plus the first ten rows.

    Args:
        prompt: The search query.

    Returns:
        A DataFrame with the columns Title, URL, Content, Score,
        Published Date, Publisher and Blurb (empty if nothing qualifies).
    """
    queries = [
        {"query": prompt, "search_depth": "advanced", "max_results": 10, "time_range": "year"}
    ]

    responses = await asyncio.gather(*[
        tavily_client.search(**q, include_raw_content=True, extract_depth="advanced") for q in queries
    ])

    today = datetime.now().date()
    results_data = []
    seen_urls = set()

    for response in responses:
        for result in response.get('results', []):
            # `.get(key) or fallback` is used throughout instead of
            # `.get(key, fallback)`: a key that is present with a null value
            # would otherwise bypass the fallback and surface as None.
            score = result.get('score') or 0
            if not (score > 0.5):
                continue

            url = result.get("url") or "N/A"
            if url == "N/A" or url in seen_urls:
                continue
            seen_urls.add(url)

            raw_content = result.get("raw_content") or "N/A"
            blurb = extract_blurb(raw_content)
            extracted_date = extract_date(raw_content)

            # Prefer the API's date, normalised to the same ISO format as the
            # text-extracted fallback so the column is uniform.
            api_published_date = "N/A"
            raw_api_date = result.get("published_date")
            if raw_api_date and raw_api_date != "N/A":
                try:
                    api_date_obj = parser.parse(raw_api_date).date()
                except (ValueError, OverflowError, TypeError):
                    # Unparseable or non-string value: fall back to the extracted date.
                    pass
                else:
                    if api_date_obj <= today:
                        api_published_date = api_date_obj.isoformat()

            final_published_date = api_published_date if api_published_date != "N/A" else extracted_date

            publisher = extract_publisher(raw_content, blurb)

            results_data.append({
                "Title": result.get("title") or "N/A",
                "URL": url,
                "Content": result.get("content") or "N/A",
                "Score": score,
                "Published Date": final_published_date,
                "Publisher": publisher,
                "Blurb": blurb,
            })

    df = pd.DataFrame(results_data)
    pd.set_option('display.max_colwidth', 500)

    print(f"Total Unique Results Found: {len(df)}")
    print(df.head(10))
    return df

# Run the search for the notebook-level `prompt` and keep the resulting DataFrame in `df`.
# The bare top-level `await` relies on the notebook kernel allowing await at cell level;
# in a plain Python script this call would need asyncio.run(...) instead.
df = await fetch_and_extract(prompt)


