In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.cluster import KMeans

# # ... (existing code for searching papers)

# # Feature extraction with TF-IDF
# vectorizer = TfidfVectorizer()
# features = vectorizer.fit_transform([result["title"] + " " + result["abstract"] + " " + result["tldr"] for result in results])

# # Similarity matrix
# similarity_matrix = cosine_similarity(features)

# # Clustering with KMeans (adjust number of clusters as needed)
# kmeans = KMeans(n_clusters=3)
# kmeans.fit(features)

# # Assign each paper to a cluster
# clusters = kmeans.labels_

# # Print paper information with assigned cluster
# for i, result in enumerate(results):
#   print(f"Paper: {result['title']}, Cluster: {clusters[i]}")

In [12]:
from datetime import date
from pprint import pprint
from typing_extensions import TypedDict
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from dotenv import find_dotenv, load_dotenv
from semanticscholar import SemanticScholar
import google.generativeai as genai
from google.oauth2 import service_account
import os

In [13]:
# Load environment variables from the local dotenv file, then build Google
# service-account credentials and hand them to the Gemini client.
load_dotenv(dotenv_path="API_key.env")

# NOTE(review): despite its name, GEMINI_API_KEY is used below as a *file path*
# to a service-account JSON key, not as an API key string -- confirm that this
# is what API_key.env is meant to contain.
service_account_key = os.getenv("GEMINI_API_KEY")
if not service_account_key:
    # Fail early with an actionable message instead of passing None to the
    # credentials loader, which produces an obscure error.
    raise RuntimeError(
        "GEMINI_API_KEY is not set; expected API_key.env (or the environment) "
        "to define it as the path to a service-account JSON key file."
    )

credentials = service_account.Credentials.from_service_account_file(service_account_key)
genai.configure(credentials=credentials)


In [17]:
# Shared Semantic Scholar client used by search_semantic_scholar (15 s timeout).
sch = SemanticScholar(timeout=15)


def search_semantic_scholar(query: str, year: tuple[int, int], limit: int = 15):
    """Search Semantic Scholar for open-access Computer Science papers.

    This function adds no retry handling of its own; any retrying is whatever
    the shared ``sch`` client does internally.

    Args:
        query: Free-text search query.
        year: ``(start, end)`` pair; sent to the API as the range string
            ``"start-end"``.
        limit: Value forwarded as ``limit`` to ``sch.search_paper``. Defaults
            to 15, the value that was previously hard-coded.

    Returns:
        Whatever ``sch.search_paper`` returns.
        NOTE(review): per the semanticscholar package docs this is a lazily
        paginated results object rather than a plain list -- iterating it may
        trigger further requests; confirm against the installed version.
    """
    year_range = f"{year[0]}-{year[1]}"
    return sch.search_paper(
        query=query,
        year=year_range,
        open_access_pdf=True,
        fields_of_study=["Computer Science"],
        fields=["paperId", "title", "abstract", "tldr", "openAccessPdf"],
        limit=limit,
    )

In [18]:
def _as_text(value) -> str:
    """Coerce one paper field to a plain string ("" when the field is missing)."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        # presumably a TLDR payload shaped like {"model": ..., "text": ...};
        # only the "text" entry is used -- verify against the API response.
        return _as_text(value.get("text"))
    inner = getattr(value, "text", None)
    return inner if isinstance(inner, str) else str(value)


def extract_techniques(paper):
    """Build the text used to characterise a paper's techniques.

    Reads ``paper["title"]``, ``paper["abstract"]`` and ``paper["tldr"]``.
    Each field may be a string, ``None``, a dict with a ``"text"`` entry, or an
    object with a ``text`` attribute; empty fields are skipped and the rest are
    joined with single spaces. Plain ``+`` concatenation raised ``TypeError``
    as soon as any field was not a string.

    If a callable named ``extract_keywords`` exists at module level it is
    applied to the combined text and its result is returned. Otherwise the
    combined text itself is returned, so the function works before a keyword
    extractor has been supplied.

    Args:
        paper: Mapping-like object supporting ``paper[key]`` for the three
            keys above. A missing key still raises ``KeyError``.

    Returns:
        The extractor's output, or the combined text when no extractor exists.
    """
    pieces = [_as_text(paper[key]) for key in ("title", "abstract", "tldr")]
    text = " ".join(piece for piece in pieces if piece)

    # Optional hook: plug in a real keyword/NER extractor by defining
    # `extract_keywords(text)` in the notebook.
    extractor = globals().get("extract_keywords")
    if callable(extractor):
        return extractor(text)
    return text

def cluster_papers(papers, n_clusters=4):
    """Group papers into clusters by TF-IDF similarity of their technique text.

    Args:
        papers: Iterable of papers accepted by ``extract_techniques``. It is
            materialised into a list once, because it is walked twice (to
            build the texts and to assign labels).
        n_clusters: Number of cluster buckets to return. Defaults to 4, the
            value that was previously hard-coded.

    Returns:
        Dict mapping each cluster index in ``range(n_clusters)`` to the list of
        papers assigned to it. With no papers every bucket is empty. With fewer
        papers than ``n_clusters`` the model is fitted with one cluster per
        paper and the surplus buckets stay empty.

    Raises:
        ValueError: If ``n_clusters`` is less than 1.
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    papers = list(papers)
    categories = {index: [] for index in range(n_clusters)}
    if not papers:
        # Nothing to vectorise; fitting on an empty corpus would raise.
        return categories

    texts = [extract_techniques(paper) for paper in papers]
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)

    # KMeans cannot be fitted with more clusters than samples, so clamp.
    effective_clusters = min(n_clusters, len(papers))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)

    # labels_ holds one cluster index per paper, in input order.
    for paper, label in zip(papers, kmeans.labels_):
        categories[int(label)].append(paper)

    return categories

In [19]:
# System prompt that sets the persona and capabilities for every request made
# through `model`. It is runtime text sent to the model, so edits change outputs.
system_instruction = """
You are a Computer Science PhD student. Your goal is to write review/survey papers in specific areas of Computer Science. You should be able to:
- Identify research problems and break them down into sub problems
- Conduct thorough literature review on your topic, summarize key findings and identify gaps in existing methodologies
- Formulate clear and testable hypotheses to address your research questions
- Develop experimental methodologies to test your hypotheses, considering factors such as data collection, analysis, and evaluation
- Collect, clean and analyze relevant data using appropriate tools and techniques
- Draw meaningful conclusions from your research findings and discuss their implications
- Prepare high-quality research papers that effectively communicate your findings
"""
# Gemini model handle bound to the system prompt above; reused by later cells.
model = genai.GenerativeModel(model_name="gemini-1.5-flash", system_instruction=system_instruction)

In [20]:
# Shape of one planning step requested from the model. `list[Subproblem]` is
# passed as the response schema in this notebook, so the parsed JSON response
# is expected to be a list of dicts with exactly these keys.
class Subproblem(TypedDict):
    # Task description for the step; the first step's prompt is later used as
    # the Semantic Scholar search query.
    prompt: str
    # Whether the model says the step needs internet access.
    requires_internet: bool
    # Whether the model says the step depends on the previous step's output.
    requires_previous_output: bool


# Survey topic interpolated into the planning prompt below.
topic = "Augmented reality"
# Planning prompt: asks the model to break the topic into subproblems, with the
# first subproblem being a Semantic Scholar query string. This is runtime text.
prompt = f"""
You are researching the below topic and need to write a survey paper on the same. Your current goal is only to research on the topic 
and not write anything currently.

Topic: {topic}

Instructions:
- Identify the key areas of focus within this topic and outline the subproblems that need to be addressed
- For each subproblem, create a concise prompt that states the task to be performed
- Indicate whether internet access is necessary to complete the subproblem. Assume that around 15-20 relevant research papers will be provided to you.
- Determine if the output of the previous subproblem is relevant to the subsequent subproblem
- Your first subproblem should always be a query string that can be used to find relevant research papers from the Semantic Scholar database

Do not generate the same instructions as your output. Ensure that you provide relevant subproblems that can be addressed by you.
Ensure that your output is in the correct format since it will be parsed automatically.
"""

# Request JSON output constrained to a list of Subproblem entries so the
# response text can be parsed with json.loads in the next cell.
gen_config = {
    "response_mime_type": "application/json",
    "response_schema": list[Subproblem]
}

# Network call to the model; `response.text` is consumed by the next cell.
response = model.generate_content(prompt, generation_config=gen_config)

In [21]:
# Parse the model's JSON plan; the first subproblem's prompt is the search query.
subproblems = json.loads(response.text)
if not subproblems:
    raise ValueError("The model returned no subproblems; cannot derive a search query.")

max_results = 15  # Upper bound on how many papers are clustered; adjust as needed

# Search the last ten years up to the current year.
year = date.today().year
results = search_semantic_scholar(subproblems[0]["prompt"], (year - 10, year))

# Take at most max_results papers. zip() stops as soon as range() is exhausted,
# so no item beyond the cap is requested from `results`.
# NOTE(review): the semanticscholar docs describe the search result as lazily
# paginated, in which case iterating it without a cap would walk every page.
selected_papers = [paper for _, paper in zip(range(max_results), results)]

# Categorize papers
categories = cluster_papers(selected_papers)

# Print results
for category, papers in categories.items():
    print(f"Category: {category}")
    for paper in papers:
        print(f"- {paper['title']}")

        
# for i, item in enumerate(results):
#     if i >= len(results) or i >= max_results:
#         break  # Exit when either all results are printed or the limit is reached
#     print(item)


RetryError: RetryError[<Future at 0x23cdce96690 state=finished raised ConnectionRefusedError>]