In [1]:
%pip install pandas requests nltk scikit-learn transformers sentence-transformers wordcloud matplotlib openpyxl

Collecting wordcloud
  Downloading wordcloud-1.9.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading wordcloud-1.9.4-cp311-cp311-macosx_11_0_arm64.whl (167 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.6/167.6 kB[0m [31m534.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m246.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl, wordcloud
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5 wordcloud-1.9.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m

In [2]:
import pandas as pd
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os
from openpyxl import Workbook
import uuid

# Download the NLTK resources used below: the 'punkt' tokenizer data and the
# stop-word lists.
nltk.download('punkt')
nltk.download('stopwords')

# Configuration
# Read the SerpAPI key from the SERPAPI_KEY environment variable so a real key
# never has to be saved inside the notebook. The original placeholder is kept
# as the fallback; requests made with it are rejected by the API.
SERPAPI_KEY = os.environ.get("SERPAPI_KEY", "your_serpapi_key_here")
RESEARCHERS = [
    "Chen Jia", "Aman Madaan", "Yunmo Chen", "Nikolay Bogoychev", "Shuhuai Ren",
    "Tunga Gungor", "Sheng Shen", "Xiang Li0", "Hao Tang", "Zhixuan Zhou",
    "Wei Bi", "Da Yin", "Monjoy Saha", "Kaushal Kumar Maurya", "Thang Vu",
    "John Ortega", "Irina Temnikova", "Nan Jiang", "Rujun Han"
]  # Your assigned researchers (excluding duplicates)
# Directory that receives the per-researcher spreadsheets and word clouds.
OUTPUT_DIR = "researcher_profiles"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize NLP tools (shared by the functions defined below)
stop_words = set(stopwords.words('english'))
tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
keyword_extractor = pipeline("token-classification", model="dslim/bert-base-NER")
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def fetch_scholar_data(author_name):
    """Fetch up to 20 publications for an author from Google Scholar via SerpAPI.

    Args:
        author_name: Researcher name to search for.

    Returns:
        A list of dicts with "title" and "abstract" keys (at most 20 entries).
        Missing or null fields are reported as "N/A". An empty list is returned
        when the request fails, the API answers with a non-200 status, or the
        response body is not valid JSON.
    """
    params = {
        "engine": "google_scholar",
        # Quote the name so a multi-word name stays inside the author: operator
        # instead of only its first word being treated as the author.
        "q": f'author:"{author_name}"',
        "api_key": SERPAPI_KEY,
        "num": 20
    }
    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get("https://serpapi.com/search", params=params, timeout=30)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can contain the
        # request URL, which carries the API key.
        print(f"Error fetching data for {author_name} ({type(exc).__name__})")
        return []
    if response.status_code != 200:
        print(f"Error fetching data for {author_name} (HTTP {response.status_code})")
        return []

    try:
        data = response.json()
    except ValueError:
        print(f"Error fetching data for {author_name} (response was not valid JSON)")
        return []

    publications = data.get("organic_results") or []
    # Note: SerpAPI may not always provide full abstracts; "snippet" is what is used.
    return [
        {
            "title": pub.get("title") or "N/A",
            "abstract": pub.get("snippet") or "N/A",
        }
        for pub in publications[:20]
    ]

def extract_keywords(text):
    """Extract keywords using TF-IDF and BERT NER.

    Args:
        text: The text to analyse.

    Returns:
        A list of unique keywords in a deterministic order: up to ten TF-IDF
        terms sorted by descending score, followed by the NER tokens in the
        order they were found. Callers that slice the result (e.g. ``[:5]``)
        therefore get the highest-scoring terms.
    """
    # TF-IDF keywords
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError when the text yields an empty
        # vocabulary (e.g. only stop words); treat that as "no TF-IDF terms".
        tfidf_terms = []
    else:
        feature_names = tfidf_vectorizer.get_feature_names_out()
        tfidf_scores = tfidf_matrix.toarray()[0]
        ranked = sorted(zip(feature_names, tfidf_scores), key=lambda pair: pair[1], reverse=True)[:10]
        tfidf_terms = [term for term, _ in ranked]

    # BERT NER for entities
    entities = keyword_extractor(text) if text.strip() else []
    ner_keywords = [
        entity['word']
        for entity in entities
        if entity['entity'].startswith('B-')
        # Tokens starting with '##' are WordPiece continuations, not whole words.
        and not entity['word'].startswith('##')
        and entity['word'].lower() not in stop_words
    ]

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would discard the TF-IDF ranking and make the order vary between runs.
    return list(dict.fromkeys(tfidf_terms + ner_keywords))

def compute_diversity(abstracts):
    """Return the mean pairwise cosine similarity between abstract embeddings.

    Args:
        abstracts: List of abstract strings.

    Returns:
        The average cosine similarity over every unordered pair of abstracts,
        or 0.0 when fewer than two abstracts are given.
    """
    count = len(abstracts)
    if count < 2:
        return 0.0

    vectors = sentence_model.encode(abstracts, convert_to_tensor=True)
    # One similarity value per unordered pair (first < second).
    pair_sims = [
        util.cos_sim(vectors[first], vectors[second]).item()
        for first in range(count)
        for second in range(first + 1, count)
    ]
    if not pair_sims:
        return 0.0
    return sum(pair_sims) / len(pair_sims)

def generate_wordcloud(text, filename):
    """Render a word cloud for ``text`` and save it as an image.

    Args:
        text: Source text for the word cloud.
        filename: Path the figure is written to.
    """
    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color='white',
        stopwords=stop_words,
    )
    cloud_image = cloud_builder.generate(text)

    # Draw the cloud on a fresh figure with the axes hidden, then close the
    # figure once it has been written to disk.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(filename)
    plt.close()

def process_researcher(researcher):
    """Fetch, export, and analyse the publications of one researcher.

    Writes ``<OUTPUT_DIR>/<name>.xlsx`` with the fetched publications and, when
    at least one abstract is available, ``<OUTPUT_DIR>/<name>_wordcloud.png``
    (spaces in the name are replaced by underscores).

    Args:
        researcher: Researcher name.

    Returns:
        A ``(themes, avg_similarity, diversity_score)`` tuple, where ``themes``
        is a comma-separated string of the first five keywords returned by
        ``extract_keywords`` and ``diversity_score`` is "High", "Medium" or
        "Low". Returns ``(None, None, None)`` when no abstract text is available.
    """
    print(f"Processing {researcher}...")
    papers = fetch_scholar_data(researcher)
    file_stem = researcher.replace(' ', '_')
    paper_count = len(papers)

    # Per-researcher spreadsheet (written even when no publications were found).
    sheet = pd.DataFrame({
        "S.No": range(1, paper_count + 1),
        "Researcher Name": [researcher] * paper_count,
        "Title of the Paper": [paper["title"] for paper in papers],
        "Abstract": [paper["abstract"] for paper in papers]
    })
    sheet.to_excel(f"{OUTPUT_DIR}/{file_stem}.xlsx", index=False)

    # Only real abstracts take part in the analysis; "N/A" marks a missing one.
    usable_abstracts = [paper["abstract"] for paper in papers if paper["abstract"] != "N/A"]
    merged_text = " ".join(usable_abstracts)
    if not merged_text:
        return None, None, None

    # Research themes: first five extracted keywords.
    themes = ", ".join(extract_keywords(merged_text)[:5])

    # Lower average similarity between abstracts maps to a higher diversity label.
    avg_similarity = compute_diversity(usable_abstracts)
    if avg_similarity < 0.4:
        diversity_score = "High"
    elif avg_similarity < 0.7:
        diversity_score = "Medium"
    else:
        diversity_score = "Low"

    generate_wordcloud(merged_text, f"{OUTPUT_DIR}/{file_stem}_wordcloud.png")

    return themes, avg_similarity, diversity_score

def main():
    """Process all researchers and write the summary workbook.

    Calls ``process_researcher`` for every entry in ``RESEARCHERS`` and writes
    ``Researcher_Analysis.xlsx`` with two sheets, ``Author_Profiles`` and
    ``Author_Diversity``. Researchers for which no themes were produced are
    left out of both sheets.
    """
    profile_data = []
    diversity_data = []

    for researcher in RESEARCHERS:
        themes, avg_similarity, diversity_score = process_researcher(researcher)
        if themes:
            profile_data.append({"Researcher": researcher, "Top Research Themes": themes})
            diversity_data.append({"Researcher": researcher, "Average Similarity": round(avg_similarity, 2), "Diversity Score": diversity_score})

    # Explicit column lists keep the header row in each sheet even when every
    # researcher was skipped (e.g. all fetches failed).
    profile_columns = ["Researcher", "Top Research Themes"]
    diversity_columns = ["Researcher", "Average Similarity", "Diversity Score"]

    # Create summary Excel file. The per-researcher workbooks are separate
    # files already written by process_researcher.
    with pd.ExcelWriter("Researcher_Analysis.xlsx") as writer:
        # Write Author_Profiles summary
        pd.DataFrame(profile_data, columns=profile_columns).to_excel(writer, sheet_name="Author_Profiles", index=False)
        # Write Author_Diversity summary
        pd.DataFrame(diversity_data, columns=diversity_columns).to_excel(writer, sheet_name="Author_Diversity", index=False)

    # Report the configured directory rather than a hard-coded copy of its name.
    print(f"Analysis complete. Outputs saved in '{OUTPUT_DIR}' directory and 'Researcher_Analysis.xlsx'.")

# Run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fenilvadher/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fenilvadher/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to

Processing Chen Jia...
Error fetching data for Chen Jia
Processing Aman Madaan...
Error fetching data for Aman Madaan
Processing Yunmo Chen...
Error fetching data for Yunmo Chen
Processing Nikolay Bogoychev...
Error fetching data for Nikolay Bogoychev
Processing Shuhuai Ren...
Error fetching data for Shuhuai Ren
Processing Tunga Gungor...
Error fetching data for Tunga Gungor
Processing Sheng Shen...
Error fetching data for Sheng Shen
Processing Xiang Li0...
Error fetching data for Xiang Li0
Processing Hao Tang...
Error fetching data for Hao Tang
Processing Zhixuan Zhou...
Error fetching data for Zhixuan Zhou
Processing Wei Bi...
Error fetching data for Wei Bi
Processing Da Yin...
Error fetching data for Da Yin
Processing Monjoy Saha...
Error fetching data for Monjoy Saha
Processing Kaushal Kumar Maurya...
Error fetching data for Kaushal Kumar Maurya
Processing Thang Vu...
Error fetching data for Thang Vu
Processing John Ortega...
Error fetching data for John Ortega
Processing Irina Tem