In [1]:
# Colab / local install cell
!pip install --quiet pdfminer.six PyPDF2 sentence-transformers rank_bm25 scikit-learn nltk gradio gensim
# Download NLTK stopwords (run in python cell)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import nltk

# Fetch the stop-word corpus that the tokenizer utilities rely on.
# Kept as the last expression so the cell displays the call's return value.
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import os
import json
import re
import glob
import tempfile
from pathlib import Path
from collections import defaultdict
import numpy as np
import math
from tqdm import tqdm

In [4]:
# Text / ML libraries
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [5]:
# PDF parsing
from pdfminer.high_level import extract_text as pdfminer_extract
from PyPDF2 import PdfReader

In [6]:
# UI
import gradio as gr

In [7]:
# NLP utilities
import nltk
from nltk.corpus import stopwords
# Make sure the corpus is present, then keep the English stop words in a set
# so the tokenizers can do constant-time membership checks.
nltk.download('stopwords')
STOPWORDS = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:

# ------------------ Utilities ------------------
def simple_tokenize(text):
    """Split `text` into lowercase tokens, dropping stop words and 1-char tokens.

    A token is a run of ASCII letters, digits or hyphens delimited by word
    boundaries. Tokens found in the module-level STOPWORDS set, and tokens of
    length one, are discarded. The surviving tokens are returned as a list in
    their order of appearance.
    """
    kept = []
    for candidate in re.findall(r'\b[a-zA-Z0-9\-]+\b', text.lower()):
        if len(candidate) <= 1 or candidate in STOPWORDS:
            continue
        kept.append(candidate)
    return kept

def normalize_score(arr):
    """Min-max scale a sequence of scores into the range [0, 1].

    Args:
        arr: any array-like of numbers; it is copied into a float ndarray.

    Returns:
        A float ndarray of the same shape. An empty input is returned as an
        empty array. When all values are (almost) equal, i.e. the spread is
        below 1e-12, an array of ones is returned instead of dividing by ~0.
    """
    values = np.array(arr, dtype=float)
    if values.size == 0:
        return values

    lowest = float(values.min())
    highest = float(values.max())
    spread = highest - lowest
    if spread < 1e-12:
        # Constant input: every entry is treated as a full-strength score.
        return np.ones_like(values)
    return (values - lowest) / spread

def extract_text_pdfminer(path):
    """Extract the text of the PDF at `path` with pdfminer.

    Best-effort: any exception raised by pdfminer is printed and an empty
    string is returned, so callers never have to handle parser errors.
    A falsy extraction result is also mapped to "".
    """
    try:
        extracted = pdfminer_extract(path)
    except Exception as err:
        print("pdfminer failed:", err)
        return ""
    return extracted if extracted else ""

def extract_text_pypdf2(path):
    """Extract the text of the PDF at `path` with PyPDF2.

    Pages whose extraction yields nothing are skipped; the remaining page
    texts are joined with newlines. Best-effort: any exception is printed
    and an empty string is returned.
    """
    try:
        page_texts = (page.extract_text() for page in PdfReader(path).pages)
        # The generator is consumed inside the try so page-level failures are caught too.
        return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as err:
        print("PyPDF2 failed:", err)
        return ""

def extract_text_best(path):
    """Return the text of the PDF at `path`, trying PyPDF2 first, then pdfminer.

    The PyPDF2 result is accepted when it contains more than 40
    whitespace-separated words. Otherwise pdfminer is tried, and its result is
    returned if non-empty; failing that, whatever PyPDF2 produced is returned
    (possibly an empty string).
    """
    primary = extract_text_pypdf2(path)
    if primary and len(primary.split()) > 40:
        return primary

    fallback = extract_text_pdfminer(path)
    if fallback:
        return fallback
    return primary

In [9]:

# ------------------ Loading/Parsing dataset ------------------

def load_extracted_jsons(base_path='.'):
    """
    Load the paper corpus from pre-extracted JSON files or from a PDF folder.

    Attempts to load these files if they exist in `base_path`:
      - author_texts_pdfminer.json
      - authors_keywords.json
      - metadata.json
      - references_dataset.json

    If author_texts_pdfminer.json is present and non-empty, one paper is built
    per entry of each author's list (an entry may be a string or a list of
    strings; anything else yields an empty text). The title is taken from
    metadata[author][i]['Paper'] when that lookup succeeds, else it is None.

    Otherwise the folder `<base_path>/dataset/<author>/*.pdf` is parsed with
    extract_text_best. Author folders and PDF files are visited in sorted
    order so that the resulting paper order is the same on every run.

    Returns:
      papers: list of dicts {paper_id, author, title, text}
      authors_kw, metadata, references  (may be None)

    Raises:
      FileNotFoundError: neither author_texts_pdfminer.json nor dataset/ exists.
    """
    base_dir = Path(base_path)

    def try_load(fname):
        # Missing files are not an error: the corresponding value is None.
        p = base_dir / fname
        if p.exists():
            with open(p, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None

    author_texts = try_load('author_texts_pdfminer.json')
    authors_kw = try_load('authors_keywords.json')
    metadata = try_load('metadata.json')
    references = try_load('references_dataset.json')

    papers = []
    if author_texts:
        for author, paper_lists in author_texts.items():
            for i, paper_parts in enumerate(paper_lists):
                # paper_parts may be a list of strings or a single string
                if isinstance(paper_parts, list):
                    text = " ".join([p for p in paper_parts if isinstance(p, str)])
                elif isinstance(paper_parts, str):
                    text = paper_parts
                else:
                    text = ""
                paper_id = f"{author.replace(' ','_')}__paper{i}"
                title = None
                if metadata and author in metadata:
                    try:
                        title = metadata[author][i].get('Paper')
                    except (IndexError, KeyError, TypeError, AttributeError):
                        # Metadata is optional and may be shorter or shaped
                        # differently than the text list; fall back to no title.
                        title = None
                papers.append({'paper_id': paper_id, 'author': author, 'title': title, 'text': text})
    else:
        # Fallback: read the folder structure dataset/<author>/*.pdf
        # (an archive must be unpacked into dataset/ beforehand).
        dataset_dir = base_dir / "dataset"
        if not dataset_dir.exists():
            raise FileNotFoundError("No author_texts_pdfminer.json found and dataset/ not present. Upload JSONs or dataset.zip to Colab.")
        # iterdir()/glob() yield entries in arbitrary order; sort for reproducibility.
        for author_dir in sorted(dataset_dir.iterdir()):
            if not author_dir.is_dir():
                continue
            for pdf_path in sorted(author_dir.glob("*.pdf")):
                text = extract_text_best(str(pdf_path))
                pid = f"{author_dir.name.replace(' ','_')}__{pdf_path.stem}"
                papers.append({'paper_id': pid, 'author': author_dir.name, 'title': pdf_path.name, 'text': text})

    return papers, authors_kw, metadata, references

In [11]:
# ------------------ Recommender Class ------------------

class ReviewerRecommender:
    """Rank authors as candidate reviewers for a query text.

    Every corpus paper is scored against the query by several scorers
    (token overlap, TF-IDF cosine, BM25, sentence-embedding similarity,
    Jaccard, NMF topic cosine). The per-paper scores are fused into one score
    per paper and then aggregated to one score per author.
    """

    def __init__(self, papers, embed_model_name='all-MiniLM-L6-v2', nmf_topics=50):
        """
        papers: list of dicts {'paper_id','author','title','text'}
        embed_model_name: model name passed to SentenceTransformer
        nmf_topics: upper bound on the number of NMF topics; the actual number
            is min(nmf_topics, max(2, len(papers) // 5))

        Raises ValueError if `papers` is empty.
        """
        if len(papers) == 0:
            raise ValueError("papers must contain at least one paper")

        self.papers = papers
        self.paper_ids = [p['paper_id'] for p in papers]
        self.paper_texts = [p['text'] for p in papers]
        self.authors = sorted(set([p['author'] for p in papers]))
        self.author_to_papers = defaultdict(list)
        # Row indices of each author's papers, collected in the same pass.
        # Using row positions (instead of paper_ids.index(pid)) is O(1) per
        # lookup and stays correct even if two papers share a paper_id.
        author_rows = defaultdict(list)
        for row, p in enumerate(papers):
            self.author_to_papers[p['author']].append(p['paper_id'])
            author_rows[p['author']].append(row)
        self._author_to_idxs = dict(author_rows)

        # Tokenized texts for BM25 and exact overlap
        self.tokenized = [simple_tokenize(t) for t in self.paper_texts]
        # Token sets are query-independent, so build them once.
        self._token_sets = [set(tokens) for tokens in self.tokenized]
        # n -> list of per-paper n-gram sets, filled lazily by scorer_jaccard.
        self._ngram_set_cache = {}

        # TF-IDF (for cosine and for NMF)
        self.tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\b[a-zA-Z0-9\-]+\b', lowercase=True, stop_words='english', max_features=50000)
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.paper_texts)

        # BM25
        self.bm25 = BM25Okapi(self.tokenized)

        # SentenceTransformer embeddings
        print("Loading embedding model (this may take a moment)...")
        self.embed_model = SentenceTransformer(embed_model_name)
        print("Computing document embeddings (normalized).")
        self.embeddings = self.embed_model.encode(self.paper_texts, show_progress_bar=True, batch_size=32, convert_to_numpy=True)
        # Unit-normalize rows so that a dot product equals cosine similarity;
        # zero rows are left as zeros instead of dividing by zero.
        norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        self.embeddings = self.embeddings / norms

        # Author embeddings (mean pooling over the author's paper embeddings)
        self.author_embeddings = {}
        for author in self.author_to_papers:
            idxs = self._author_to_idxs.get(author, [])
            if len(idxs) == 0:
                self.author_embeddings[author] = np.zeros(self.embeddings.shape[1])
            else:
                self.author_embeddings[author] = np.mean(self.embeddings[idxs], axis=0)

        # Topic modeling (NMF on TF-IDF). Deliberately best-effort: if fitting
        # fails the NMF scorer is disabled and the other scorers still work.
        try:
            print("Fitting NMF topic model (this may take a bit)...")
            self.nmf = NMF(n_components=min(nmf_topics, max(2, int(len(self.paper_texts)/5))), random_state=42, init='nndsvda', max_iter=400)
            self.nmf_doc_topics = self.nmf.fit_transform(self.tfidf_matrix)
        except Exception as e:
            print("NMF failed or skipped:", e)
            self.nmf = None
            self.nmf_doc_topics = None

    # ---------- internal helpers ----------
    def _indices_for_author(self, author):
        """Return the corpus row indices of `author`'s papers ([] if unknown)."""
        return self._author_to_idxs.get(author, [])

    @staticmethod
    def _ngram_set(s, n=1):
        """Return the set of n-grams of `s`; unigrams exclude STOPWORDS."""
        tokens = re.findall(r'\b[a-zA-Z0-9\-]+\b', s.lower())
        if n == 1:
            return set([t for t in tokens if t not in STOPWORDS])
        return set(zip(*[tokens[i:] for i in range(n)]))

    def _paper_ngram_sets(self, n):
        """Return (and cache) the n-gram set of every corpus paper."""
        cache = self.__dict__.setdefault('_ngram_set_cache', {})
        if n not in cache:
            cache[n] = [self._ngram_set(t, n) for t in self.paper_texts]
        return cache[n]

    # ---------- scorers ----------
    def scorer_exact_token_overlap(self, query_text):
        """Fraction of distinct query tokens that also occur in each paper."""
        q_tokens = set(simple_tokenize(query_text))
        denom = max(1.0, len(q_tokens))
        return np.array([len(q_tokens & p_tokens) / denom for p_tokens in self._token_sets])

    def scorer_tfidf_cosine(self, query_text):
        """Cosine similarity between the query and each paper in TF-IDF space."""
        q_vec = self.tfidf_vectorizer.transform([query_text])
        sims = cosine_similarity(q_vec, self.tfidf_matrix).flatten()
        return sims

    def scorer_bm25(self, query_text):
        """BM25 score of each paper for the tokenized query."""
        q_tokens = simple_tokenize(query_text)
        scores = np.array(self.bm25.get_scores(q_tokens), dtype=float)
        return scores

    def scorer_semantic(self, query_text):
        """Dot product of the unit-normalized query and paper embeddings."""
        q_emb = self.embed_model.encode([query_text], convert_to_numpy=True)[0]
        q_norm = np.linalg.norm(q_emb)
        if q_norm != 0:
            q_emb = q_emb / q_norm
        sims = (self.embeddings @ q_emb).astype(float)
        return sims

    def scorer_jaccard(self, query_text, ngram=1):
        """Jaccard similarity between the n-gram sets of the query and each paper."""
        qset = self._ngram_set(query_text, ngram)
        scores = []
        for pset in self._paper_ngram_sets(ngram):
            denom = len(qset | pset)
            scores.append(len(qset & pset) / denom if denom > 0 else 0.0)
        return np.array(scores)

    def scorer_nmf_cosine(self, query_text):
        """Cosine similarity in NMF topic space; all zeros if NMF is unavailable."""
        if self.nmf is None or self.nmf_doc_topics is None:
            return np.zeros(len(self.paper_texts))
        q_vec = self.tfidf_vectorizer.transform([query_text])
        q_topic = self.nmf.transform(q_vec)
        sims = cosine_similarity(q_topic, self.nmf_doc_topics).flatten()
        return sims

    # ---------- fusion & aggregation ----------
    def combine_bm25_semantic(self, bm25_scores, semantic_scores, bm25_weight=0.6):
        """Convex combination of min-max normalized BM25 and semantic scores."""
        b = normalize_score(bm25_scores)
        s = normalize_score(semantic_scores)
        return bm25_weight * b + (1.0 - bm25_weight) * s

    def reciprocal_rank_fusion(self, list_of_score_arrays, k_rrf=60):
        """Fuse several score arrays with reciprocal rank fusion.

        Each array is converted to ranks (1 is best) and 1 / (k_rrf + rank)
        is summed per paper. Papers with equal scores share the same (best)
        rank, so a scorer that cannot separate papers contributes the same
        amount to each of them instead of favouring earlier corpus rows.
        """
        n_docs = len(self.paper_texts)
        rrf_scores = np.zeros(n_docs, dtype=float)
        for scores in list_of_score_arrays:
            negated = -np.asarray(scores, dtype=float)
            # Position of the first equal value in the sorted order = number of
            # strictly better papers, i.e. a tie-aware 0-based rank.
            ranks = np.searchsorted(np.sort(negated), negated, side='left') + 1
            rrf_scores += 1.0 / (k_rrf + ranks)
        return rrf_scores

    def aggregate_paper_scores_to_author(self, paper_scores, agg='max'):
        """Reduce per-paper scores to one score per author.

        agg: 'mean' averages an author's paper scores; 'max' (and any other
             value) takes the maximum. Authors without papers score 0.0.
        Returns a dict {author: float score}.
        """
        paper_scores = np.asarray(paper_scores, dtype=float)
        reduce_fn = np.mean if agg == 'mean' else np.max
        author_scores = {}
        for author in self.authors:
            idxs = self._indices_for_author(author)
            if not idxs:
                author_scores[author] = 0.0
            else:
                author_scores[author] = float(reduce_fn(paper_scores[idxs]))
        return author_scores

    # ---------- API ----------
    def recommend_authors(self, query_text, k=5, weights=None, bm25_weight=0.6, agg='max', use_rrf=True):
        """Return the top-k authors for `query_text` plus the fused paper scores.

        weights: optional dict {scorer name: weight}. Recognized names are
            'exact', 'tfidf', 'bm25', 'semantic', 'bm25_semantic', 'jaccard'
            and 'nmf'; unknown names and non-positive weights are ignored.
            When given, the paper score is the weighted mean of the min-max
            normalized scorers and `use_rrf` is not consulted.
        bm25_weight: weight of BM25 inside the BM25+semantic combination.
        agg: author aggregation, see aggregate_paper_scores_to_author.
        use_rrf: without `weights`, fuse by reciprocal rank fusion (True) or
            by the plain mean of normalized scorers (False).

        Returns (list of (author, score) sorted best first and cut to k,
                 ndarray of fused per-paper scores).
        """
        # compute scorers
        s_exact = self.scorer_exact_token_overlap(query_text)
        s_tfidf = self.scorer_tfidf_cosine(query_text)
        s_bm25 = self.scorer_bm25(query_text)
        s_sem = self.scorer_semantic(query_text)
        s_jaccard = self.scorer_jaccard(query_text, ngram=1)
        s_nmf = self.scorer_nmf_cosine(query_text)

        # internal BM25+semantic combo
        combo_bm_sem = self.combine_bm25_semantic(s_bm25, s_sem, bm25_weight=bm25_weight)

        # Prepare scorers list for fusion
        scorer_list = [combo_bm_sem, s_tfidf, s_exact, s_jaccard]
        # include nmf only if it carries any signal
        if s_nmf is not None and np.any(s_nmf):
            scorer_list.append(s_nmf)

        if weights:
            # weighted normalized sum
            named = {
                'exact': s_exact, 'tfidf': s_tfidf, 'bm25_semantic': combo_bm_sem,
                'jaccard': s_jaccard, 'nmf': s_nmf, 'semantic': s_sem,
                'bm25': s_bm25,
            }
            arrs = []
            wsum = 0.0
            for name, w in weights.items():
                if w <= 0:
                    continue
                sc = named.get(name)
                if sc is None:
                    continue
                arrs.append(w * normalize_score(sc))
                wsum += w
            if wsum == 0:
                paper_scores = np.zeros(len(self.paper_texts))
            else:
                paper_scores = sum(arrs) / wsum
        else:
            # use RRF across scorers
            if use_rrf:
                paper_scores = self.reciprocal_rank_fusion(scorer_list, k_rrf=60)
            else:
                paper_scores = np.mean([normalize_score(s) for s in scorer_list], axis=0)

        # author aggregation
        author_scores = self.aggregate_paper_scores_to_author(paper_scores, agg=agg)
        sorted_auths = sorted(author_scores.items(), key=lambda x: x[1], reverse=True)
        # return top-k authors and paper-level scores for explainability
        return sorted_auths[:k], paper_scores

    def explain_author(self, author, paper_scores, top_n=1):
        """Return the author's `top_n` best-scoring papers as explanation.

        Each entry is {'paper_id', 'score', 'snippet'} where the snippet is
        the first 600 characters of the paper text. Unknown authors yield [].
        """
        idxs = self._indices_for_author(author)
        sorted_idx = sorted(idxs, key=lambda i: paper_scores[i], reverse=True)
        out = []
        for i in sorted_idx[:top_n]:
            out.append({'paper_id': self.paper_ids[i], 'score': float(paper_scores[i]), 'snippet': self.paper_texts[i][:600]})
        return out



In [12]:
# ------------------ Build the system from your JSONs or dataset ------------------

# Step 1: read the corpus from the current working directory.
print("Loading dataset (author_texts_pdfminer.json or dataset/ folder expected in working dir)...")
papers, authors_kw, metadata, references = load_extracted_jsons(base_path='.')
print(f"Loaded {len(papers)} papers from {len(set([p['author'] for p in papers]))} authors.")

# Step 2: build all indexes (TF-IDF, BM25, embeddings, NMF) once, up front.
print("Building recommender (this will compute embeddings and topic model if possible).")
recommender = ReviewerRecommender(
    papers=papers,
    embed_model_name='all-MiniLM-L6-v2',
    nmf_topics=40,
)
print("Recommender ready.")

Loading dataset (author_texts_pdfminer.json or dataset/ folder expected in working dir)...
Loaded 629 papers from 71 authors.
Building recommender (this will compute embeddings and topic model if possible).
Loading embedding model (this may take a moment)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing document embeddings (normalized).


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Fitting NMF topic model (this may take a bit)...
Recommender ready.


In [None]:
# ------------------ Gradio UI: upload PDF and get top-5 reviewers ------------------

def process_uploaded_pdf_and_recommend(uploaded_file, top_k=5):
    """
    Extract text from an uploaded PDF and recommend reviewers for it.

    uploaded_file: what the upload component hands over. Supported forms:
        - None (nothing uploaded)            -> an error result is returned
        - a path (str or os.PathLike)        -> used directly
        - an object with a `.name` attribute -> `.name` is used as the path
        - raw bytes or an object with `.read()` -> written to a temporary
          .pdf file that is removed again after text extraction
    top_k: number of authors to return.

    returns: {"error": message} on failure, otherwise
             {"query_preview": first 1200 chars of the extracted text,
              "results": [{'author', 'score', 'top_paper_id', 'top_paper_snippet'}, ...]}
    """
    if uploaded_file is None:
        return {"error": "No file was uploaded. Please upload a PDF."}

    temp_to_remove = None
    if isinstance(uploaded_file, (str, os.PathLike)):
        # Checked before `.name`: for a pathlib.Path, `.name` is only the basename.
        pdf_path = os.fspath(uploaded_file)
    elif hasattr(uploaded_file, "name"):
        pdf_path = uploaded_file.name
    else:
        if isinstance(uploaded_file, (bytes, bytearray)):
            payload = uploaded_file
        else:
            payload = uploaded_file.read()
        # Close the handle before parsing so the data is fully flushed to disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(payload)
            pdf_path = temp_to_remove = tmp.name

    try:
        query_text = extract_text_best(pdf_path)
    finally:
        # Only delete files created here, never the caller's upload.
        if temp_to_remove is not None:
            try:
                os.remove(temp_to_remove)
            except OSError:
                pass

    if not query_text or len(query_text.split()) < 10:
        return {"error": "Failed to extract text from PDF. Try a different parser or check the file."}

    # Run recommendation
    topk_authors, paper_scores = recommender.recommend_authors(query_text, k=top_k)
    # Build explanations: the best-matching paper of each recommended author
    results = []
    for author, score in topk_authors:
        expl = recommender.explain_author(author, paper_scores, top_n=1)
        snippet = expl[0]['snippet'] if expl else ""
        pid = expl[0]['paper_id'] if expl else ""
        results.append({
            'author': author,
            'score': float(score),
            'top_paper_id': pid,
            'top_paper_snippet': snippet
        })
    return {"query_preview": query_text[:1200], "results": results}


# Gradio interface function to adapt outputs to display
def gradio_recommend(uploaded_file):
    """Adapt the recommendation result to the two text outputs of the UI.

    Returns a pair of strings: (preview of the extracted text, formatted
    top-5 list). On failure the first string carries the error message and
    the second is empty.
    """
    outcome = process_uploaded_pdf_and_recommend(uploaded_file, top_k=5)
    if "error" in outcome:
        return "Error: " + outcome["error"], ""

    # One text block per recommended author, separated by blank lines.
    rows = [
        f"Author: {r['author']}\nScore: {r['score']:.4f}\nTop paper: {r['top_paper_id']}\nSnippet: {r['top_paper_snippet']}\n---"
        for r in outcome["results"]
    ]
    return outcome["query_preview"], "\n\n".join(rows)


# Launch Gradio app
title = "Reviewer Recommendation — Upload PDF to get Top-5 Reviewers"
description = "Upload a PDF (invited paper). The system extracts text, computes lexical + semantic similarities to the corpus, and returns top-5 candidate reviewers (author names) with short explanations."

# One file upload in, two text panes out (text preview and ranked reviewers).
iface = gr.Interface(
    fn=gradio_recommend,
    inputs=gr.File(label="Upload PDF (single file)"),
    outputs=[
        gr.Textbox(label="Extracted text (preview)"),
        gr.Textbox(label="Top-5 reviewers & explanation"),
    ],
    title=title,
    description=description,
    allow_flagging="never",
    examples=None,
)

print("Launching Gradio demo. If using Colab, click the public link in output (it can take a few seconds to spin up).")
# share=True requests a public link; with debug=True the cell keeps running
# so errors and logs stay visible (set debug=False to return immediately).
iface.launch(share=True, debug=True)



Launching Gradio demo. If using Colab, click the public link in output (it can take a few seconds to spin up).
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://74cf3193dbdeb05e20.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
