In [None]:
#Lexicon / keyword matching (TF-IDF vectors + cosine similarity)
#Method: Extract keywords from each author’s papers and the input paper, then compute overlap.
#Pros: Fast, interpretable, easy to implement.
#Cons: Sensitive to vocabulary mismatch, ignores context or semantic meaning.
#Use case: Works if papers have a very specific jargon or limited domain.
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the per-author corpus from disk. Each value is joined with spaces
# below, so it is presumably a list of paper texts keyed by author name.
with open("author_texts_pdfminer.json", "r", encoding="utf-8") as f:
    authors_json = json.loads(f.read())

# Text of the paper/query that authors are scored against
query_text = "your search query here"

# Build one document per author by concatenating all of that author's papers
all_texts = []
for author_papers in authors_json.values():
    all_texts.append(" ".join(author_papers))

# Fit the TF-IDF vocabulary on the author documents only, then project the
# query into that same vocabulary (unseen query terms are simply ignored).
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(all_texts)
query_vec = vectorizer.transform([query_text])

# Similarity of the single query row against every author row -> shape (1, n_authors)
cosine_sim = cosine_similarity(query_vec, X)

# Report one score per author, in the JSON file's key order
for name, sim in zip(authors_json, cosine_sim[0]):
    print(f"{name}: {sim:.4f}")


Amit Saxena: 0.0022
Amita Jain: 0.0000
Animesh Chaturvedi: 0.0051
Ankita Jain: 0.0098
Arun Chauhan: 0.0045
Aruna Malapati: 0.0078
Aruna Tiwari: 0.0058
Barsha Mitra: 0.0008
Bhanukiran Perabathini: 0.0053
Bharghava Rajaram: 0.0010
Deepak K T: 0.0224
Devendra K Tayal: 0.0484
Dilip Singh Sisodia: 0.0219
dipanjan roy: 0.0016
Dipti Mishra: 0.0009
Dr. Ashish Jain: 0.0323
Dr. Shikha Mehta: 0.0263
Dr.Manpreet Kaur: 0.0004
Dr.Rohit Beniwal: 0.0074
Dr.Ruchi Mittal: 0.0116
esha baidya kayal: 0.0000
Geeta Rani: 0.0008
Himanee Bansal: 0.0460
Himanshu Mittal: 0.0459
J. Balasubramaniam: 0.0013
Jagdish Bansal: 0.0692
Jayasri D: 0.0000
Jian Wang: 0.0010
K.V. Sambasivarao: 0.0029
Kastuv Nag: 0.0203
Khaldoon Dhou: 0.0152
Krishna Asawa: 0.1024
Mala Saraswat: 0.0256
Manju_JaypeeTech: 0.0029
Manoranjan Mohanty: 0.0244
Minni Jain: 0.0041
Mukesh Prasad: 0.0142
Navneet Pratap Singh: 0.0047
Nikhil Tripathi: 0.0062
Nishchal K. Verma: 0.0000
Om Prakash Patel: 0.0125
OmPrakash Kaiwartya: 0.0042
Pabitra Mitra: 0.006

In [None]:
#Topic Modeling (LDA / NMF)
#Method: Represent each paper in terms of topics instead of individual words.
#Pros: Captures underlying research themes, good for interdisciplinarity.
#Cons: Needs careful tuning (number of topics), may lose fine-grained differences.
#Use case: Useful when papers cover multiple areas or if you want topic-based clustering of reviewers.

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One document per author: all of the author's papers joined with spaces
all_texts = list(map(" ".join, authors_json.values()))

# Term weights for every author document
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(all_texts)

# Factorize the TF-IDF matrix into 20 topics; each row of W is an author's
# topic-weight vector. random_state is fixed so reruns give the same topics.
nmf = NMF(n_components=20, random_state=42)
W = nmf.fit_transform(X)

# Project the query through the same TF-IDF vocabulary, then into topic space
query_tfidf = tfidf.transform([query_text])
query_W = nmf.transform(query_tfidf)

# Cosine similarity between the query's topic vector and every author's;
# flatten the (1, n_authors) result into a 1D array of scores.
sim_matrix = cosine_similarity(query_W, W)
cosine_sim = sim_matrix.flatten()

# Report one score per author, in the JSON file's key order
for name, sim in zip(authors_json, cosine_sim):
    print(f"{name}: {sim:.4f}")


Amit Saxena: 0.0181
Amita Jain: 0.0000
Animesh Chaturvedi: 0.0000
Ankita Jain: 0.3823
Arun Chauhan: 0.0000
Aruna Malapati: 0.2067
Aruna Tiwari: 0.6036
Barsha Mitra: 0.0031
Bhanukiran Perabathini: 0.0182
Bharghava Rajaram: 0.0000
Deepak K T: 0.2714
Devendra K Tayal: 0.2875
Dilip Singh Sisodia: 0.3650
dipanjan roy: 0.0094
Dipti Mishra: 0.0162
Dr. Ashish Jain: 0.7426
Dr. Shikha Mehta: 0.6932
Dr.Manpreet Kaur: 0.2117
Dr.Rohit Beniwal: 0.0000
Dr.Ruchi Mittal: 0.0891
esha baidya kayal: 0.0000
Geeta Rani: 0.0065
Himanee Bansal: 0.3044
Himanshu Mittal: 0.5837
J. Balasubramaniam: 0.0000
Jagdish Bansal: 0.7953
Jayasri D: 0.0000
Jian Wang: 0.0823
K.V. Sambasivarao: 0.4543
Kastuv Nag: 0.6941
Khaldoon Dhou: 0.0075
Krishna Asawa: 0.3509
Mala Saraswat: 0.2092
Manju_JaypeeTech: 0.0000
Manoranjan Mohanty: 0.3758
Minni Jain: 0.0030
Mukesh Prasad: 0.4196
Navneet Pratap Singh: 0.1345
Nikhil Tripathi: 0.0369
Nishchal K. Verma: 0.3153
Om Prakash Patel: 0.6893
OmPrakash Kaiwartya: 0.0369
Pabitra Mitra: 0.000



In [None]:
from sentence_transformers import SentenceTransformer, util

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')  # lightweight and fast

# all-MiniLM-L6-v2 silently truncates input past its maximum sequence length
# (256 word pieces). Encoding an author's whole joined corpus in one call would
# therefore only "see" the first few hundred tokens of the first paper.
# Instead, long text is split into word windows that fit the model, every
# window is encoded, and the window embeddings are mean-pooled.
# 150 words is deliberately conservative: one word often maps to more than one
# word piece.
CHUNK_WORDS = 150


def _split_into_chunks(text, chunk_words=CHUNK_WORDS):
    """Split `text` into consecutive windows of at most `chunk_words` words.

    Always returns at least one chunk (an empty string for empty input) so the
    caller can encode the result without special-casing authors with no text.
    """
    words = text.split()
    chunks = [" ".join(words[start:start + chunk_words])
              for start in range(0, len(words), chunk_words)]
    return chunks or [""]


def _embed_long_text(text):
    """Return a single embedding for `text`: the mean of its chunk embeddings."""
    # encode() on a list returns one (n_chunks, dim) tensor and batches internally.
    chunk_embeddings = model.encode(_split_into_chunks(text), convert_to_tensor=True)
    return chunk_embeddings.mean(dim=0)


# Encode all authors' papers (one pooled embedding per author).
# NOTE: with full-text corpora this encodes many chunks per author and can be
# slow on CPU; consider caching `author_embeddings` if this cell is rerun often.
author_names = list(authors_json.keys())
author_embeddings = [_embed_long_text(" ".join(authors_json[author]))
                     for author in author_names]

# Encode query paper the same way, so a long query is not truncated either
query_embedding = _embed_long_text(query_text)

# Compute cosine similarity scores
cos_scores = [util.cos_sim(query_embedding, emb).item() for emb in author_embeddings]

# Print authors with their similarity scores
for author, score in zip(author_names, cos_scores):
    print(f"{author}: {score:.4f}")

# Optional: Top-k authors (slicing copes with fewer than k authors)
k = 5
top_indices = sorted(range(len(cos_scores)), key=lambda i: cos_scores[i], reverse=True)[:k]
print("\nTop-k recommended reviewers:")
for i in top_indices:
    print(f"{author_names[i]}: {cos_scores[i]:.4f}")
