In [35]:
from datetime import date
from pprint import pprint
from typing import TypedDict
import json

from dotenv import find_dotenv, load_dotenv
from semanticscholar import SemanticScholar
import google.generativeai as genai
import os

In [5]:
load_dotenv(find_dotenv())
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [30]:
sch = SemanticScholar(timeout=15)

def search_semantic_scholar(query: str, year: tuple[int, int]) -> list:
    return sch.search_paper(
        query=query, 
        year=f"{year[0]}-{year[1]}", 
        open_access_pdf=True,
        fields_of_study=["Computer Science"], 
        fields=["paperId", "title", "abstract", "tldr", "openAccessPdf"],
        limit=15,
    )

In [7]:
system_instruction = """
You are a Computer Science PhD student. Your goal is to write review/survey papers in specific areas of Computer Science. You should be able to:
- Identify research problems and break them down into sub problems
- Conduct thorough literature review on your topic, summarize key findings and identify gaps in existing methodologies
- Formulate clear and testable hypotheses to address your research questions
- Develop experimental methodologies to test your hypotheses, considering factors such as data collection, analysis, and evaluation
- Collect, clean and analyze relevant data using appropriate tools and techniques
- Draw meaningful conclusions from your research findings and discuss their implications
- Prepare high-quality research papers that effectively communicate your findings
"""
model = genai.GenerativeModel(model_name="gemini-1.5-flash", system_instruction=system_instruction)

In [32]:
class Subproblem(TypedDict):
    prompt: str
    requires_internet: bool
    requires_previous_output: bool


topic = "Image Classification Techniques"
prompt = f"""
You are researching the below topic and need to write a survey paper on the same. Your current goal is only to research on the topic 
and not write anything currently.

Topic: {topic}

Instructions:
- Identify the key areas of focus within this topic and outline the subproblems that need to be addressed
- For each subproblem, create a concise prompt that states the task to be performed
- Indicate whether internet access is necessary to complete the subproblem. Assume that around 15-20 relevant research papers will be provided to you.
- Determine if the output of the previous subproblem is relevant to the subsequent subproblem
- Your first subproblem should always be a query string that can be used to find relevant research papers from the Semantic Scholar database

Do not generate the same instructions as your output. Ensure that you provide relevant subproblems that can be addressed by you.
Ensure that your output is in the correct format since it will be parsed automatically.
"""

gen_config = {
    "response_mime_type": "application/json",
    "response_schema": list[Subproblem]
}

response = model.generate_content(prompt, generation_config=gen_config)

In [36]:
subproblems = json.loads(response.text)
year = date.today().year
results = search_semantic_scholar(subproblems[0]["prompt"], (year - 10, year))

for item in results:
    print(item)

{'paperId': 'f55c3d53eaacc75f497a55ab349276b18ea98cc1', 'title': 'Fused Node-Level Residual Structure Edge Graph Neural Network for Few-Shot Image Classification', 'abstract': 'In spite of recent rapid developments across various computer vision domains, numerous cutting-edge deep learning algorithms often demand a substantial volume of data to operate effectively. Within this research, a novel few-shot learning approach is presented with the objective of enhancing the accuracy of few-shot image classification. This task entails the classification of unlabeled query samples based on a limited set of labeled support examples. Specifically, the integration of the edge-conditioned graph neural network (EGNN) framework with hierarchical node residual connections is proposed. The primary aim is to enhance the performance of graph neural networks when applied to few-shot classification, a rather unconventional application of hierarchical node residual structures in few-shot image classificat

KeyboardInterrupt: 

In [20]:


internet_prompt_template = """
{prompt}
- Based on the above details, create a relevant and concise search query to be used on Semantic Scholar
- Search the Semantic Scholar database for your query
- Determine the relevance (as percentage) of the returned papers based on the title, abstract, and tldr of each search result

{extra_context}
"""
sub_prompt_template = """
{prompt}

{extra_context}
"""

outputs = [None] * len(subproblems)
for i, subproblem in enumerate(subproblems):
    pprint(subproblem, width=120, indent=2)

    if subproblem["requires_internet"]:
        sub_prompt = internet_prompt_template.format(
            prompt=subproblem["prompt"],
            extra_context="" if not subproblem["requires_previous_output"] else f"Extra Context:\n{outputs[i - 1]}",
        )
    else:
        sub_prompt = sub_prompt_template.format(
            prompt=subproblem["prompt"],
            extra_context="" if not subproblem["requires_previous_output"] else f"Extra Context:\n{outputs[i - 1]}",
        )
    outputs[i] = model.generate_content(sub_prompt).text
    


{'prompt': 'Identify and categorize different TCP congestion control '
           'algorithms based on their underlying mechanisms and principles '
           '(e.g., slow start, congestion avoidance, fast retransmit, fast '
           'recovery).',
 'requires_internet': True,
 'requires_previous_output': False}
{'prompt': 'Analyze the strengths and weaknesses of each algorithm in terms of '
           'network performance metrics such as throughput, delay, fairness, '
           'and robustness to network conditions.',
 'requires_internet': True,
 'requires_previous_output': True}
{'prompt': 'Compare and contrast the performance of different algorithms under '
           'varying network conditions (e.g., bandwidth variations, packet '
           'losses, buffer sizes) using simulations or real-world experiments.',
 'requires_internet': True,
 'requires_previous_output': True}
{'prompt': 'Investigate the impact of emerging technologies such as 5G and '
           'edge computing on TC

In [21]:
for output in outputs:
    print(output)

```json
{
  "query": "tcp congestion control algorithms",
  "fields": ["title", "abstract", "venue", "authors", "year", "citations"],
  "citationContext": true,
  "limit": 100,
  "include_paper_id": true,
  "sort": {
    "citations": "desc"
  },
  "filter": {
    "publication_date": {
      "from": 2010,
      "to": 2023
    }
  }
}
```

**Explanation:**

* **`query`:** This sets the main search term as "tcp congestion control algorithms". 
* **`fields`:** This specifies the data fields to be retrieved for each paper, including title, abstract, venue, authors, publication year, citation count, and the paper ID.
* **`citationContext`:** This flag enables retrieving the citation context, providing insights into how the paper is referenced in other works.
* **`limit`:** This limits the search results to the top 100 most cited papers.
* **`include_paper_id`:** This ensures the paper ID is included in the response, which is useful for further analysis.
* **`sort`:** This sorts the results b