In [1]:
# Smoke test: confirm the sentence-transformers checkpoint can be
# instantiated in this environment before the app cell depends on it.
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Model loaded successfully!")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded successfully!


In [3]:
import json
import re
from collections import Counter
from itertools import chain

import numpy as np
import PyPDF2
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------
# Load Pre-Extracted Data
# -----------------------
# Paths to the pre-extracted JSON datasets, relative to the working directory.
# Forward slashes are used for every path: they resolve on Windows, Linux and
# macOS alike, whereas a backslash separator only resolves on Windows.
TEXT_JSON = r"extracted_data/author_texts_pdfminer.json"
KEYWORDS_JSON = r"extracted_data/authors_keywords.json"
REFERENCES_JSON = r"extracted_data/references_dataset.json"


def _load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Downstream code iterates this with ``.items()`` and concatenates each value
# with a list, i.e. it is used as {author: [paper_text, ...]}.
authors_texts = _load_json(TEXT_JSON)

# Used downstream as {author: iterable of keyword iterables} (flattened with
# ``chain.from_iterable``) -- TODO confirm against the extraction script.
authors_keywords = _load_json(KEYWORDS_JSON)

# Used downstream as {author: {key: iterable of references}} (``.values()`` is
# flattened) -- TODO confirm against the extraction script.
authors_references = _load_json(REFERENCES_JSON)

# -----------------------
# Initialize BERT Model
# -----------------------
# Streamlit re-executes the whole script on every widget interaction. Caching
# the model as a resource builds it once per server process instead of on each
# rerun, and the spinner replaces a permanent "Loading..." info box that would
# otherwise stay on the page after loading has finished.
@st.cache_resource(show_spinner="Loading sentence transformer model (BERT)...")
def _load_bert_model():
    """Create the sentence-embedding model used for the semantic similarity score."""
    return SentenceTransformer('all-MiniLM-L6-v2')


bert_model = _load_bert_model()

# -----------------------
# Streamlit Interface
# -----------------------
def _jaccard(left, right):
    """Return the Jaccard index of two iterables of hashable items.

    The result is ``|left & right| / |left | right|``; it is 0 when both
    inputs are empty so callers never divide by zero.
    """
    left_set, right_set = set(left), set(right)
    union = left_set | right_set
    return len(left_set & right_set) / len(union) if union else 0


st.title("Reviewer Recommendation System")
uploaded_file = st.file_uploader("Upload your research paper (PDF)", type="pdf")

if uploaded_file:
    # -----------------------
    # 1. Extract text from PDF
    # Each page is extracted exactly once; the per-page texts are reused below
    # for the reference section instead of running extraction a second time.
    # -----------------------
    reader = PyPDF2.PdfReader(uploaded_file)
    page_texts = [page.extract_text() or "" for page in reader.pages]
    input_text = "".join(page_texts)

    if not input_text.strip():
        # Scanned / image-only PDFs yield no text; every vectorizer below
        # would fail on an empty document, so stop with a readable message.
        st.error("No extractable text was found in the uploaded PDF.")
        st.stop()

    st.success("PDF uploaded successfully!")

    # Shared bag-of-words vectorizer. It is fitted once on the uploaded paper
    # (step 4) and afterwards only used with ``transform`` so that every count
    # matrix has the vocabulary the LDA model was trained on.
    count_vectorizer = CountVectorizer(stop_words='english')

    # -----------------------
    # 2. Keyword Extraction (Simple approach: top-n frequent words)
    # You can replace with RAKE / KeyBERT for better extraction
    # Stop words are dropped first; otherwise the most frequent tokens are
    # almost exclusively function words ("the", "of", ...) and the keyword
    # overlap with any author is meaningless.
    # -----------------------
    stop_words = count_vectorizer.get_stop_words()
    words = [
        w for w in re.findall(r'\b\w+\b', input_text.lower())
        if w not in stop_words
    ]
    input_keywords = [w for w, _ in Counter(words).most_common(20)]  # top 20 words

    # -----------------------
    # 3. Reference Extraction (Assuming references in last 10% of PDF)
    # -----------------------
    total_pages = len(page_texts)
    ref_text = "".join(page_texts[int(0.9 * total_pages):])
    # Crude extraction: four-digit numbers (years) and "Surname et al." mentions.
    # NOTE(review): these tokens are compared verbatim with the stored author
    # references; confirm the reference dataset uses the same granularity.
    input_references = re.findall(r'\b\d{4}\b|[A-Z][a-z]+ et al\.', ref_text)

    # -----------------------
    # 4. Topic Modeling (LDA)
    # NOTE(review): the topic model is fitted on the single uploaded document;
    # fitting it on the author corpus would give more informative topics.
    # -----------------------
    try:
        input_count = count_vectorizer.fit_transform([input_text])
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty, e.g. the
        # document contains only stop words.
        st.error("The uploaded PDF does not contain enough text to analyse.")
        st.stop()
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    input_topic_dist = lda.fit_transform(input_count)[0]

    # -----------------------
    # 5. BERT Embedding
    # -----------------------
    input_emb = bert_model.encode([input_text])[0]

    # -----------------------
    # 6. Compute Similarities per Author
    # -----------------------
    final_scores = {}

    for author, papers in authors_texts.items():
        # Text-based scores default to 0 for an author without any paper text;
        # similarity against an empty set of documents is undefined and the
        # scikit-learn helpers raise on zero samples.
        text_sim_score = 0
        topic_score = 0
        bert_score = 0

        if papers:
            # ---- Text Similarity (TF-IDF) ----
            # Last row is the uploaded paper; all preceding rows are the author's papers.
            tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(
                papers + [input_text]
            )
            text_sim_score = np.mean(
                cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
            )

            # ---- Topic Similarity ----
            # ``transform`` (not ``fit_transform``) keeps the vocabulary that
            # ``lda`` was fitted on; re-fitting would change the feature space.
            author_topic_dists = lda.transform(count_vectorizer.transform(papers))
            avg_author_topic = author_topic_dists.mean(axis=0)
            topic_score = cosine_similarity([input_topic_dist], [avg_author_topic])[0][0]

            # ---- BERT Embedding Similarity ----
            # One batched encode call instead of one model call per paper.
            author_embs = bert_model.encode(papers)
            bert_score = np.mean(cosine_similarity([input_emb], author_embs))

        # ---- Keyword Similarity (Jaccard) ----
        # Input keywords are lower-cased above, so author keywords are
        # normalised the same way before comparing.
        author_keywords_flat = {
            str(k).lower()
            for k in chain.from_iterable(authors_keywords.get(author, []))
        }
        keyword_score = _jaccard(input_keywords, author_keywords_flat)

        # ---- Reference Similarity ----
        author_refs_flat = chain.from_iterable(
            authors_references.get(author, {}).values()
        )
        ref_score = _jaccard(input_references, author_refs_flat)

        # ---- Aggregate Final Score ----
        # Unweighted mean of the five component scores.
        final_scores[author] = np.mean(
            [text_sim_score, keyword_score, ref_score, topic_score, bert_score]
        )

    # -----------------------
    # 7. Display Top-k Authors
    # -----------------------
    top_k = 5
    sorted_authors = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

    st.subheader(f"Top {top_k} Recommended Reviewers")
    for i, (author, score) in enumerate(sorted_authors, 1):
        st.write(f"{i}. {author} - Score: {score:.4f}")


2025-10-22 22:55:51.362 
  command:

    streamlit run c:\Users\BHUVANA VIJAYA\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
