In [14]:
!pip install pdfminer.six


Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [2]:
import zipfile
import os

# Path to the ZIP file in Colab (you can upload it via the left sidebar "Files" or use Google Drive)
zip_path = "/content/Dataset.zip"
extract_path = "/content/dataset_extracted"  # shorter path for Colab

# Make sure the extraction folder exists
os.makedirs(extract_path, exist_ok=True)

# Extract the ZIP
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"✅ ZIP extraction complete! Files are in: {extract_path}")


✅ ZIP extraction complete! Files are in: /content/dataset_extracted


In [4]:
import os
import json
import re
from pathlib import Path
from tqdm import tqdm
from pdfminer.high_level import extract_text

# ===== Path to your dataset in Colab =====
DATASET_PATH = Path("/content/dataset_extracted/Dataset")  # update if needed

if not DATASET_PATH.exists():
    raise FileNotFoundError(f"Dataset path not found: {DATASET_PATH}")

author_texts = {}

def clean_text(text):
    """Clean PDF text: remove unwanted characters, extra whitespace, lowercase."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.,:;!?()\-\'"\s]', '', text)
    return text.strip().lower()

# ===== Recursive scan for PDFs =====
all_pdf_files = list(DATASET_PATH.rglob("*.pdf"))
print(f"Found {len(all_pdf_files)} PDF files in dataset.")

for pdf_file in tqdm(all_pdf_files, desc="Processing PDFs"):
    author_name = pdf_file.parent.name
    try:
        # Extract text using PDFMiner
        text = extract_text(str(pdf_file))
        clean = clean_text(text)
        if len(clean) > 200:  # skip very short PDFs
            author_texts.setdefault(author_name, []).append(clean)
    except Exception as e:
        print(f"⚠️ Error reading {pdf_file}: {e}")

# ===== Save JSON in Colab workspace =====
output_path = "/content/author_texts_pdfminer.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(author_texts, f, ensure_ascii=False, indent=2)

print(f"✅ Extraction complete! Total authors processed: {len(author_texts)}")
print(f"JSON saved at: {output_path}")


Found 637 PDF files in dataset.


Processing PDFs:  24%|██▍       | 152/637 [08:07<17:18,  2.14s/it]

⚠️ Error reading /content/dataset_extracted/Dataset/Dr. Shikha Mehta/Nature-Inspired Algorithms.pdf: ('Unhandled', 14)


Processing PDFs: 100%|██████████| 637/637 [29:15<00:00,  2.76s/it]


✅ Extraction complete! Total authors processed: 71
JSON saved at: /content/author_texts_pdfminer.json


In [5]:
import json
from pathlib import Path
from pdfminer.high_level import extract_text
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# ===== Parameters =====
TOP_K = 5  # number of reviewers to suggest
AUTHOR_TEXTS_JSON = "/content/author_texts_pdfminer.json"  # output from previous step
INPUT_PAPER_PATH = "/content/A Review of Clustering Techniques.pdf"  # path to the paper to review

# ===== Helper function =====
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.,:;!?()\-\'"\s]', '', text)
    return text.strip().lower()

# ===== 1. Load author texts =====
with open(AUTHOR_TEXTS_JSON, "r", encoding="utf-8") as f:
    author_texts = json.load(f)

# Combine all texts per author into a single string
author_corpus = {a: " ".join(texts) for a, texts in author_texts.items()}

# ===== 2. Extract input paper text =====
input_text_raw = extract_text(INPUT_PAPER_PATH)
input_text = clean_text(input_text_raw)

# ===== 3. Prepare corpus for vectorization =====
author_names = list(author_corpus.keys())
corpus = [author_corpus[a] for a in author_names]

# ===== 4. TF-IDF Vectorization =====
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
author_vectors = vectorizer.fit_transform(corpus)
input_vector = vectorizer.transform([input_text])

# ===== 5. Cosine similarity =====
similarity_scores = cosine_similarity(input_vector, author_vectors).flatten()

# ===== 6. Top-k authors =====
top_indices = similarity_scores.argsort()[::-1][:TOP_K]
print("🏆 Top-k reviewers for the paper:")
for rank, idx in enumerate(top_indices, 1):
    print(f"{rank}. {author_names[idx]} (Score: {similarity_scores[idx]:.4f})")


🏆 Top-k reviewers for the paper:
1. Om Prakash Patel (Score: 0.5780)
2. Himanshu Mittal (Score: 0.5193)
3. Aruna Tiwari (Score: 0.5076)
4. Ramalinga Swamy Cheruku (Score: 0.4893)
5. Dr. Shikha Mehta (Score: 0.4761)


In [6]:
# ===========================
# Reviewer Recommendation using BERT Embeddings
# ===========================

!pip install -q sentence-transformers tqdm

import os
import json
import re
from pathlib import Path
from tqdm import tqdm
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer, util

# ---------------------------
# 1. Load Author Corpus
# ---------------------------
DATASET_PATH = Path("/content/dataset_extracted/Dataset")

author_texts = {}

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.,:;!?()\-\'"\s]', '', text)
    return text.strip().lower()

# Recursively scan PDFs
all_pdf_files = list(DATASET_PATH.rglob("*.pdf"))
print(f"Found {len(all_pdf_files)} PDF files in dataset.")

for pdf_file in tqdm(all_pdf_files, desc="Processing PDFs"):
    author_name = pdf_file.parent.name
    try:
        text = extract_text(str(pdf_file))
        clean = clean_text(text)
        if len(clean) > 200:  # skip very short PDFs
            author_texts.setdefault(author_name, []).append(clean)
    except Exception as e:
        print(f"⚠️ Error reading {pdf_file}: {e}")

# ---------------------------
# 2. Load Input Paper
# ---------------------------
INPUT_PAPER_PATH = "/content/A Review of Clustering Techniques.pdf"  # replace with uploaded file path
input_text = clean_text(extract_text(INPUT_PAPER_PATH))

# ---------------------------
# 3. Encode texts using BERT
# ---------------------------
print("Encoding texts using BERT embeddings...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # fast & accurate

# Encode each author's combined texts
author_embeddings = {}
for author, texts in tqdm(author_texts.items(), desc="Encoding authors"):
    combined_text = " ".join(texts)
    author_embeddings[author] = model.encode(combined_text, convert_to_tensor=True)

# Encode input paper
input_embedding = model.encode(input_text, convert_to_tensor=True)

# ---------------------------
# 4. Compute Similarity
# ---------------------------
similarities = {}
for author, emb in author_embeddings.items():
    sim = util.cos_sim(input_embedding, emb).item()
    similarities[author] = sim

# ---------------------------
# 5. Get Top-K Authors
# ---------------------------
k = 5
top_k_authors = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:k]

print(f"✅ Top-{k} recommended reviewers:")
for rank, (author, score) in enumerate(top_k_authors, start=1):
    print(f"{rank}. {author} (Score: {score:.4f})")

# ---------------------------
# 6. Save Recommendations
# ---------------------------
output_file = "/content/top_k_authors.json"
with open(output_file, "w") as f:
    json.dump(top_k_authors, f, indent=2)

print(f"Recommendations saved to {output_file}")


Found 637 PDF files in dataset.


Processing PDFs:  24%|██▍       | 152/637 [08:08<17:36,  2.18s/it]

⚠️ Error reading /content/dataset_extracted/Dataset/Dr. Shikha Mehta/Nature-Inspired Algorithms.pdf: ('Unhandled', 14)


Processing PDFs: 100%|██████████| 637/637 [29:38<00:00,  2.79s/it]


Encoding texts using BERT embeddings...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding authors: 100%|██████████| 71/71 [00:48<00:00,  1.46it/s]


✅ Top-5 recommended reviewers:
1. Himanshu Mittal (Score: 0.4966)
2. V. Ravi (Score: 0.4935)
3. Tandra Pal (Score: 0.4796)
4. Shikha Gupta (Score: 0.4602)
5. Ramalinga Swamy Cheruku (Score: 0.4313)
Recommendations saved to /content/top_k_authors.json
