In [1]:
import json
import os
from datetime import date
from itertools import islice
from pprint import pprint

import google.generativeai as genai
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from google.oauth2 import service_account
from semanticscholar import SemanticScholar
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tenacity import retry, stop_after_attempt, wait_exponential, wait_fixed
from typing_extensions import TypedDict

In [2]:
# Load local secrets from API_key.env into the process environment.
load_dotenv(dotenv_path="API_key.env")

# NOTE: despite its name, GEMINI_API_KEY is used here as the *path* to a
# service-account JSON key file (it is passed to from_service_account_file),
# not as a raw API key string.
service_account_key = os.getenv("GEMINI_API_KEY")
if not service_account_key:
    # Fail early with an actionable message instead of letting `None` reach
    # the credentials loader and surface as an unrelated low-level error.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Define it in API_key.env (or the environment) "
        "as the path to a Google service-account JSON key file."
    )

credentials = service_account.Credentials.from_service_account_file(service_account_key)
genai.configure(credentials=credentials)


In [21]:
# Shared Semantic Scholar client; each request times out after 15 seconds.
sch = SemanticScholar(timeout=15)


@retry(wait=wait_fixed(5), stop=stop_after_attempt(5))
def search_semantic_scholar(query: str, year: tuple[int, int]) -> list:
    """Search Semantic Scholar for open-access Computer Science papers.

    The call is retried on any exception: up to 5 attempts in total, with a
    fixed 5-second pause between attempts.

    Args:
        query: Free-text search string.
        year: Inclusive ``(start_year, end_year)`` publication range.

    Returns:
        Whatever ``sch.search_paper`` returns for the query (an iterable of
        paper records), restricted to the fields ``paperId``, ``title``,
        ``abstract``, ``tldr`` and ``openAccessPdf``, requested with
        ``limit=15``.
    """
    start_year, end_year = year
    return sch.search_paper(
        query=query,
        year=f"{start_year}-{end_year}",
        open_access_pdf=True,
        fields_of_study=["Computer Science"],
        fields=["paperId", "title", "abstract", "tldr", "openAccessPdf"],
        limit=15,
    )

In [22]:
# System prompt sent with every request: it fixes the persona (a CS PhD student
# writing review/survey papers) and lists the research skills to apply.
system_instruction = """
You are a Computer Science PhD student. Your goal is to write review/survey papers in specific areas of Computer Science. You should be able to:
- Identify research problems and break them down into sub problems
- Conduct thorough literature review on your topic, summarize key findings and identify gaps in existing methodologies
- Formulate clear and testable hypotheses to address your research questions
- Develop experimental methodologies to test your hypotheses, considering factors such as data collection, analysis, and evaluation
- Collect, clean and analyze relevant data using appropriate tools and techniques
- Draw meaningful conclusions from your research findings and discuss their implications
- Prepare high-quality research papers that effectively communicate your findings
"""
# Model handle reused by the later cells; the system prompt above is attached
# once here rather than repeated in each request.
model = genai.GenerativeModel(model_name="gemini-1.5-flash", system_instruction=system_instruction)

In [23]:
# Disabled typed description of one subproblem entry. The same three fields
# (prompt, requires_internet, requires_previous_output) are what the JSON
# response schema passed to the model asks for.
# class Subproblem(TypedDict):
#     prompt: str
#     requires_internet: bool
#     requires_previous_output: bool


# Survey topic; interpolated into the planning prompt below.
topic = "Image classification"
# Planning prompt: asks the model to split the topic into subproblems and
# requires the first subproblem to be a Semantic Scholar query string.
prompt = f"""
You are researching the below topic and need to write a survey paper on the same. Your current goal is only to research on the topic 
and not write anything currently.

Topic: {topic}

Instructions:
- Identify the key areas of focus within this topic and outline the subproblems that need to be addressed
- For each subproblem, create a concise prompt that states the task to be performed
- Indicate whether internet access is necessary to complete the subproblem. Assume that around 15-20 relevant research papers will be provided to you.
- Determine if the output of the previous subproblem is relevant to the subsequent subproblem
- Your first subproblem should always be a query string that can be used to find relevant research papers from the Semantic Scholar database

Do not generate the same instructions as your output. Ensure that you provide relevant subproblems that can be addressed by you.
Ensure that your output is in the correct format since it will be parsed automatically.
"""

# JSON schema for a single subproblem entry; all three fields are mandatory.
subproblem_schema = {
    "type": "object",
    "properties": {
        "prompt": {"type": "string"},
        "requires_internet": {"type": "boolean"},
        "requires_previous_output": {"type": "boolean"},
    },
    "required": ["prompt", "requires_internet", "requires_previous_output"],
}

# Request JSON output shaped as an array of subproblem entries.
gen_config = dict(
    response_mime_type="application/json",
    response_schema={"type": "array", "items": subproblem_schema},
)

# Send the planning prompt; the raw JSON text is read from `response.text` later.
response = model.generate_content(prompt, generation_config=gen_config)




In [27]:
# Parse the model's plan. The planning prompt requires the first subproblem to
# be a Semantic Scholar query string, so it is used directly as the search query.
subproblems = json.loads(response.text)
if not subproblems:
    raise ValueError("The model returned no subproblems; cannot build a search query.")

# Search the last ten years of publications.
year = date.today().year
results = search_semantic_scholar(subproblems[0]["prompt"], (year - 10, year))

max_results = 15  # Upper bound on how many search hits are kept

# One text document per paper: title and abstract combined for better context.
papers = []

# islice stops after exactly `max_results` items. The previous enumerate/break
# pattern pulled one extra item before breaking, which on lazily paginated
# results can trigger an unnecessary extra request.
for item in islice(results, max_results):
    # Attributes may exist but be None (e.g. a paper without an abstract);
    # `or` covers both the missing and the None case so the literal text
    # "None" never ends up in the corpus.
    title = getattr(item, "title", None) or "No Title"
    abstract = getattr(item, "abstract", None) or "No Abstract"
    papers.append(f"{title} {abstract}")

    print(title)

# Keyword-based placeholder for technique extraction. NOTE: the clustering in
# this notebook runs on the full TF-IDF text of each paper, not on the output
# of this helper.
def extract_techniques(documents):
    """Collect generic "technique" keywords that occur in the documents.

    Each document is split on whitespace; surrounding punctuation is stripped
    from every token so that e.g. ``"method,"`` or ``"(algorithm)"`` still
    match. Matching is case-insensitive and limited to the exact singular
    forms technique / method / approach / algorithm / framework.

    Args:
        documents: Iterable of text strings.

    Returns:
        Sorted list of the distinct matching words, in the casing in which
        they appeared (so ``"Method"`` and ``"method"`` are separate entries).
        Sorting makes the result deterministic.
    """
    keywords = {'technique', 'method', 'approach', 'algorithm', 'framework'}
    found = set()
    for doc in documents:
        for token in doc.split():
            word = token.strip(".,;:!?()[]{}\"'")
            if word.lower() in keywords:
                found.add(word)
    return sorted(found)

# Fail with a clear message instead of the vectorizer's "empty vocabulary" error.
if not papers:
    raise ValueError("No papers were retrieved; there is nothing to cluster.")

# Represent every paper as a TF-IDF vector, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(papers)

# K-Means cannot form more clusters than there are samples, so the requested
# count (3) is capped at the number of papers actually retrieved.
num_clusters = min(3, X.shape[0])

# n_init is set explicitly because its default differs between scikit-learn
# versions; together with random_state this keeps the clustering reproducible.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Pair every paper with the cluster label it was assigned.
df = pd.DataFrame({'Paper': papers, 'Cluster': kmeans.labels_})

# Output the clustered papers
print(df)

# List the papers that fell into each cluster.
for cluster in range(num_clusters):
    print(f"\nCluster {cluster}:")
    for paper in df.loc[df['Cluster'] == cluster, 'Paper']:
        print(f"- {paper}")



Clothing Genre Recognition System Using Image Processing Techniques- A Survey
Survey Towards Android Application for Plant Disease Detection using Deep Learning Approach
A REVIEW ON CONTENT BASED IMAGE RETRIEVAL
A Melting Pot of Evolution and Learning
Multi-Label Active Learning Algorithms for Image Classification
Survey on Clustering Techniques for Image Categorization Dataset
Content-based image retrieval for fabric images: A survey
Deep Active Learning in the Presence of Label Noise: A Survey
A survey on sentiment classification algorithms, challenges and applications
Deep CNN and Deep GAN in Computational Visual Perception-Driven Image Analysis
Predicting Survival in Patients with Brain Tumors: Current State-of-the-Art of AI Methods Applied to MRI
Special issue on real-time image and video processing in mobile embedded systems
Image Retrieval: Modelling Keywords via Low-level Features
Active Learning to Assist Annotation of Aerial Images in Environmental Surveys
gamma-sky.net: Port

In [29]:
# Each K-Means centroid lives in the TF-IDF feature space, so its largest
# coordinates point at the vocabulary terms that weigh most in that cluster.
centroids = kmeans.cluster_centers_

# Vocabulary terms, aligned with the centroid coordinates.
terms = vectorizer.get_feature_names_out()

num_top_techniques = 5  # How many top terms to list per cluster

for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")

    # Feature positions ordered from heaviest to lightest centroid weight.
    ranked_positions = centroids[cluster_id].argsort()[::-1]

    top_techniques = [terms[pos] for pos in ranked_positions[:num_top_techniques]]
    print("Words that are matched the most:", ", ".join(top_techniques))



Cluster 0:
Words that are matched the most: image, retrieval, cbir, content, based

Cluster 1:
Words that are matched the most: learning, deep, active, label, classification

Cluster 2:
Words that are matched the most: plant, disease, application, android, care
