In [1]:
import os
import time

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the paper/author table and trim stray whitespace from the header names.
papers_df = pd.read_csv("papers_and_authors.csv")
papers_df.columns = list(map(str.strip, papers_df.columns))

# Distinct venue names: missing values are dropped, the rest are coerced to
# str and whitespace-trimmed before de-duplication.
venues = papers_df["venue"].dropna().astype(str).str.strip().unique()

print(f"Found {len(venues)} unique venues")


Found 147 unique venues


In [3]:
# The Semantic Scholar API key is read from the environment so the secret is
# never stored in the notebook. The key that was previously hardcoded here
# should be treated as leaked and revoked.
API_KEY = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "")

# Only send the auth header when a key is configured; with no key the requests
# go out unauthenticated (expect a stricter rate limit -- see the API docs).
HEADERS = {"x-api-key": API_KEY} if API_KEY else {}

In [4]:
def fetch_semantic_scholar_texts(
    venue,
    max_papers=200,
    sleep_time=0.5
):
    """
    Fetch "title + abstract" texts for papers matching a venue name.

    Pages through the Semantic Scholar Graph API paper search endpoint,
    sending `venue` as the free-text query, until `max_papers` usable texts
    have been collected, a page comes back empty, or a request fails.

    Parameters
    ----------
    venue : str
        Venue name used as the search query.
    max_papers : int
        Maximum number of texts to return.
    sleep_time : float
        Seconds to pause after every successfully processed page.

    Returns
    -------
    list of str
        One string per paper: the title followed by the abstract when one is
        available. Papers whose combined text is 50 characters or shorter are
        skipped. The list may be shorter than `max_papers`, or empty when a
        request fails or nothing matches.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    offset = 0
    limit = 100

    texts = []
    seen_ids = set()

    while len(texts) < max_papers:
        params = {
            "query": venue,
            "limit": limit,
            "offset": offset,
            "fields": "paperId,title,abstract"
        }

        # A timeout keeps one stalled connection from hanging the whole run;
        # network errors are reported and end this venue's fetch, the same way
        # a non-200 status does.
        try:
            r = requests.get(url, headers=HEADERS, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"  ❌ Request failed ({exc})")
            break

        if r.status_code != 200:
            print(f"  ❌ Request failed ({r.status_code})")
            break

        # "data" may be absent or null; treat both as an empty page.
        papers = r.json().get("data") or []
        if not papers:
            break

        for p in papers:
            pid = p.get("paperId")
            # De-duplicate on paperId, but never let records without an id
            # collapse onto a single shared None entry.
            if pid is not None:
                if pid in seen_ids:
                    continue
                seen_ids.add(pid)

            # A key can be present with a null value (dict.get's default only
            # covers *missing* keys), so coerce None to "" explicitly;
            # otherwise the literal text "None" ends up in the corpus.
            title = p.get("title") or ""
            abstract = p.get("abstract") or ""

            text = f"{title} {abstract}".strip()
            if len(text) > 50:
                texts.append(text)

            if len(texts) >= max_papers:
                break

        offset += limit
        # Pause after every page, including the last one, so back-to-back
        # calls for consecutive venues are also spaced out.
        time.sleep(sleep_time)

    return texts


In [5]:
# Generic academic filler terms that carry no venue-specific signal; they are
# filtered out of the candidate keywords before ranking.
ACADEMIC_STOPWORDS = set(
    """
    such have been using used based present recent
    recently various different approach methods results
    paper study studies proposed however show shows
    research method problem system
    """.split()
)


In [6]:
def extract_venue_keywords_semantic_scholar(
    venues,
    max_papers_per_venue=20,
    top_k=10,
    min_papers=10
):
    """
    Build a table of the top TF-IDF keywords for each venue.

    For every venue, paper texts are fetched with
    `fetch_semantic_scholar_texts`, vectorised with unigram + bigram TF-IDF
    (English stop words removed, at most 3000 features), and the `top_k`
    terms with the highest mean TF-IDF score across the venue's texts are
    kept.

    Parameters
    ----------
    venues : iterable of str
        Venue names. "unknown", "nan", "arxiv" and "arxiv.org"
        (case-insensitive) are skipped.
    max_papers_per_venue : int
        Maximum number of texts fetched per venue.
    top_k : int
        Number of keywords kept per venue.
    min_papers : int
        Venues with fewer fetched texts than this are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns "Venue", "Keyword", "Weight" (mean TF-IDF score rounded to
        5 decimals), one row per kept keyword. The columns are present even
        when no venue produced any rows.
    """
    columns = ["Venue", "Keyword", "Weight"]
    skipped_venues = {"unknown", "nan", "arxiv", "arxiv.org"}
    rows = []

    for venue in venues:
        if venue.lower() in skipped_venues:
            continue

        print(f"Processing venue: {venue}")

        texts = fetch_semantic_scholar_texts(
            venue,
            max_papers=max_papers_per_venue
        )

        if len(texts) < min_papers:
            print("  ❌ Insufficient papers")
            continue

        # A fresh vectorizer per venue: the vocabulary and IDF weights are
        # fitted on this venue's texts only.
        vectorizer = TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=3000
        )

        tfidf = vectorizer.fit_transform(texts)
        mean_scores = np.asarray(tfidf.mean(axis=0)).flatten()
        features = np.array(vectorizer.get_feature_names_out())

        # Remove academic filler: drop every n-gram in which ANY token is a
        # filler word, so bigrams such as "proposed method" are filtered too
        # (an exact-match test would only catch the unigrams). The explicit
        # bool dtype keeps the mask valid even for an empty vocabulary.
        mask = np.array(
            [ACADEMIC_STOPWORDS.isdisjoint(f.split()) for f in features],
            dtype=bool
        )
        features = features[mask]
        mean_scores = mean_scores[mask]

        # Indices of the top_k highest mean scores, best first.
        top_idx = mean_scores.argsort()[::-1][:top_k]

        for i in top_idx:
            rows.append({
                "Venue": venue,
                "Keyword": features[i],
                "Weight": round(float(mean_scores[i]), 5)
            })

    # Passing the columns explicitly keeps the schema (and the CSV header)
    # stable when every venue was skipped.
    return pd.DataFrame(rows, columns=columns)


In [7]:
# Run the keyword extraction for every venue (up to 20 papers each, keeping
# the 10 best-scoring keywords per venue).
venue_keywords_df = extract_venue_keywords_semantic_scholar(
    venues, max_papers_per_venue=20, top_k=10
)

# Persist the keyword table without the DataFrame index column.
venue_keywords_df.to_csv("venue_common_keywords.csv", index=False)

print("\nSaved venue keywords to venue_common_keywords.csv")


Processing venue: Neural Information Processing Systems
Processing venue: International Conference on Learning Representations
Processing venue: International Conference on Machine Learning
Processing venue: Conference on Empirical Methods in Natural Language Processing
Processing venue: Computer Vision and Pattern Recognition
Processing venue: Conference of the European Chapter of the Association for Computational Linguistics
Processing venue: Transactions of the Association for Computational Linguistics
Processing venue: North American Chapter of the Association for Computational Linguistics
Processing venue: Annual Meeting of the Association for Computational Linguistics
Processing venue: Neural Computation
Processing venue: International Conference on Computational Logic
Processing venue: Journal of machine learning research
Processing venue: Reliability Engineering & System Safety
Processing venue: Information Fusion
Processing venue: Biomedical Signal Processing and Control
Proce