In [None]:
#text extract
import json
import re
from pathlib import Path
from pdfminer.high_level import extract_text

# === Custom Input ===
pdf_path = input("Enter the path to your PDF file: ").strip()

pdf_file = Path(pdf_path)
if not pdf_file.exists():
    raise FileNotFoundError(f"❌ PDF not found at: {pdf_file}")

# === Clean text function ===
def clean_text(text):
    """Cleans text by removing unwanted characters and normalizing whitespace."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.,:;!?()\-\'"\s]', '', text)
    return text.strip().lower()

# === Extract and clean text ===
print(f"📘 Extracting text from: {pdf_file.name}")
try:
    raw_text = extract_text(str(pdf_file))
    cleaned_text = clean_text(raw_text)
    if len(cleaned_text) < 50:
        print("⚠️ Warning: extracted text is too short, may be scanned or image-based PDF.")
except Exception as e:
    print(f"❌ Error extracting text: {e}")
    cleaned_text = ""

# === Save to JSON ===
output_json = {
    "filename": pdf_file.name,
    "text": cleaned_text
}

output_path = pdf_file.parent / "input_text.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output_json, f, ensure_ascii=False, indent=2)

print(f"✅ Extraction complete! Saved to: {output_path}")


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


📘 Extracting text from: 1706.03762v7.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


✅ Extraction complete! Saved to: C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\input_text.json


In [4]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path

# === Paths ===
input_json_path = Path(r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\input_text.json")
authors_json_path = Path(r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\extracted_data\author_texts_pdfminer.json")

# === Load input text ===
with open(input_json_path, "r", encoding="utf-8") as f:
    input_data = json.load(f)
input_text = input_data.get("text", "").strip()

if not input_text:
    raise ValueError("❌ No text found in input_text.json")

# === Load authors dataset ===
with open(authors_json_path, "r", encoding="utf-8") as f:
    authors_json = json.load(f)

if not authors_json:
    raise ValueError("❌ author_texts_pdfminer.json is empty or invalid.")

# === Combine all papers per author ===
author_names = list(authors_json.keys())
all_texts = [" ".join(author_papers) for author_papers in authors_json.values()]

# === TF-IDF Vectorization ===
print("🔍 Creating TF-IDF vectors...")
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)
X = vectorizer.fit_transform(all_texts)
query_vec = vectorizer.transform([input_text])

# === Compute cosine similarity ===
cosine_sim = cosine_similarity(query_vec, X)[0]

# === Find highest similarity ===
max_score = cosine_sim.max()
best_indices = [i for i, s in enumerate(cosine_sim) if s == max_score]

print("\n📊 Cosine Similarity Scores:")
for author, score in zip(author_names, cosine_sim):
    print(f"{author}: {score:.4f}")

print("\n🏆 Author(s) with Highest Similarity:")
for idx in best_indices:
    print(f"➡️ {author_names[idx]} (Score: {cosine_sim[idx]:.4f})")

print("\n✅ Similarity comparison complete!")


🔍 Creating TF-IDF vectors...

📊 Cosine Similarity Scores:
Amit Saxena: 0.0709
Amita Jain: 0.0729
Animesh Chaturvedi: 0.1495
Ankita Jain: 0.1002
Arun Chauhan: 0.2568
Aruna Malapati: 0.1819
Aruna Tiwari: 0.1594
Barsha Mitra: 0.0832
Bhanukiran Perabathini: 0.0825
Bharghava Rajaram: 0.0632
Deepak K T: 0.1452
Devendra K Tayal: 0.1618
Dilip Singh Sisodia: 0.1679
dipanjan roy: 0.0699
Dipti Mishra: 0.1860
Dr. Ashish Jain: 0.1624
Dr. Shikha Mehta: 0.1336
Dr.Manpreet Kaur: 0.1284
Dr.Rohit Beniwal: 0.1068
Dr.Ruchi Mittal: 0.1270
esha baidya kayal: 0.0775
Geeta Rani: 0.2512
Himanee Bansal: 0.1106
Himanshu Mittal: 0.1361
J. Balasubramaniam: 0.0489
Jagdish Bansal: 0.0996
Jayasri D: 0.0437
Jian Wang: 0.1429
K.V. Sambasivarao: 0.1149
Kastuv Nag: 0.1228
Khaldoon Dhou: 0.0652
Krishna Asawa: 0.1659
Mala Saraswat: 0.1570
Manju_JaypeeTech: 0.0612
Manoranjan Mohanty: 0.1327
Minni Jain: 0.2511
Mukesh Prasad: 0.1595
Navneet Pratap Singh: 0.1170
Nikhil Tripathi: 0.0854
Nishchal K. Verma: 0.1199
Om Prakash Pate

In [9]:
import fitz  # PyMuPDF
import re
import os
import json

def extract_references(pdf_path, save_to_json=True):
    """Extract and clean the References section from a PDF."""
    if not os.path.exists(pdf_path):
        print(f"❌ File not found: {pdf_path}")
        return None

    doc = fitz.open(pdf_path)
    full_text = ""

    # --- 1️⃣ Extract text from all pages ---
    for page in doc:
        text = page.get_text("text")
        full_text += text + "\n"

    # Normalize whitespace
    full_text = re.sub(r'\s+', ' ', full_text).strip()

    # --- 2️⃣ Locate 'References' section ---
    section_pattern = re.compile(
        r'(references|bibliography|reference list|references and notes)[:\s\-]*',
        re.IGNORECASE
    )
    start_match = section_pattern.search(full_text)

    if not start_match:
        print("⚠️ No 'References' section found in the PDF.")
        return None

    start_idx = start_match.end()
    references_text = full_text[start_idx:].strip()

    # --- 3️⃣ Stop before next section (Appendix, Acknowledgements, etc.) ---
    end_match = re.search(
        r'(appendix|acknowledg(e)?ments?|supplementary materials?)',
        references_text,
        re.IGNORECASE
    )
    if end_match:
        references_text = references_text[:end_match.start()].strip()

    # --- 4️⃣ Split into individual reference entries ---
    entries = re.split(r'\s*(?:\[\d+\]|\d+\.\s+|•\s+)\s*', references_text)
    entries = [e.strip() for e in entries if len(e.strip()) > 20]

    # --- 5️⃣ Save clean JSON (no ids, no text objects) ---
    if save_to_json:
        output_file = "input_references.json"
        data = {
            "pdf_file": os.path.basename(pdf_path),
            "reference_count": len(entries),
            "references": entries
        }

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

        print(f"✅ References extracted and saved to '{output_file}'")

    return entries


# --- 🔹 MAIN EXECUTION ---
if __name__ == "__main__":
    print("📘 PDF Reference Extractor → JSON (Clean Format)")
    pdf_path = input("Enter the path to your PDF file: ").strip().strip('"')

    references = extract_references(pdf_path)

    if references:
        print(f"\n✅ Found {len(references)} reference entries.")
        print("📄 First few references:\n")
        for i, ref in enumerate(references[:5], 1):
            print(f"{i}. {ref[:250]}...")


📘 PDF Reference Extractor → JSON (Clean Format)
✅ References extracted and saved to 'input_references.json'

✅ Found 46 reference entries.
📄 First few references:

1. Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint arXiv:1607.06450,...
2. Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly learning to align and translate. CoRR, abs/1409.0473,...
3. Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural machine translation architectures. CoRR, abs/1703.03906,...
4. Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine reading. arXiv preprint arXiv:1601.06733,...
5. Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. Learning phrase representations using rnn encoder-decoder for statistical machine translation. CoRR, abs/1406.1078,...


In [None]:
#find similarity between references is pending......

In [10]:
import json
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
from pathlib import Path

# Download stopwords (only needed once)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# === Paths ===
input_json_path = Path(r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\input_text.json")
output_json_path = input_json_path.parent / "input_keywords.json"

# === Load input text ===
with open(input_json_path, 'r', encoding='utf-8') as f:
    input_data = json.load(f)
input_text = input_data.get("text", "").strip()

if not input_text:
    raise ValueError("❌ No text found in input_text.json")

# === Preprocessing function ===
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)

# === Keyword extraction function ===
def extract_top_keywords(doc_text, top_n=50):
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform([doc_text])
    feature_names = vectorizer.get_feature_names_out()
    sorted_nzs = np.argsort(tfidf_matrix.toarray()[0])[::-1][:top_n]
    keywords = [feature_names[i] for i in sorted_nzs]
    return keywords

# === Process the input text ===
clean_text = preprocess_text(input_text)
keywords = extract_top_keywords(clean_text, top_n=50)

# === Save to JSON ===
output_data = {
    "filename": input_data.get("filename", "unknown.pdf"),
    "top_keywords": keywords
}

with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=4)

print(f"✅ Keywords extraction complete for {input_data.get('filename', 'input file')}")
print(f"📁 Saved to: {output_json_path}")
print(f"🔑 Top 10 Keywords: {keywords[:10]}")


✅ Keywords extraction complete for 1706.03762v7.pdf
📁 Saved to: C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\input_keywords.json
🔑 Top 10 Keywords: ['attention', 'model', 'models', 'sequence', 'arxiv', 'output', 'layer', 'transformer', 'neural', 'selfattention']


[nltk_data] Downloading package stopwords to C:\Users\BHUVANA
[nltk_data]     VIJAYA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
import json
import os

# === Load input paper keywords ===
input_keywords_path = r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\input_keywords.json"
with open(input_keywords_path, 'r', encoding='utf-8') as f:
    input_keywords = set(json.load(f)["top_keywords"])

# === Load the author keywords dataset ===
author_keywords_path = r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\extracted_data\authors_keywords.json"
with open(author_keywords_path, 'r', encoding='utf-8') as f:
    author_data = json.load(f)

# === Define Jaccard similarity ===
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union else 0

# === Compute average Jaccard similarity per author ===
author_scores = {}

for author, papers_keywords in author_data.items():
    if not papers_keywords:
        continue

    paper_scores = []
    for keywords in papers_keywords:
        if not keywords:
            continue
        score = jaccard_similarity(input_keywords, set(keywords))
        paper_scores.append(score)

    if paper_scores:
        author_scores[author] = sum(paper_scores) / len(paper_scores)

# === Sort authors by similarity score ===
sorted_authors = sorted(author_scores.items(), key=lambda x: x[1], reverse=True)

# === Display top authors ===
print("🏆 Top Matching Authors by Keyword (Jaccard) Similarity:")
for author, score in sorted_authors[:10]:
    print(f"{author}: {score:.4f}")

# === (Optional) Save results ===
output_path = r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\keyword_similarity_results.json"
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(sorted_authors, f, indent=4)

print(f"\n📁 Similarity results saved to: {output_path}")


🏆 Top Matching Authors by Keyword (Jaccard) Similarity:
Arun Chauhan: 0.1640
Ramalinga Swamy Cheruku: 0.1254
Minni Jain: 0.1110
Jian Wang: 0.1031
Payal Khurana Batra: 0.1010
Shikha Gupta: 0.1003
Deepak K T: 0.0994
Om Prakash Patel: 0.0956
Aruna Tiwari: 0.0904
Dipti Mishra: 0.0901

📁 Similarity results saved to: C:\Users\BHUVANA VIJAYA\OneDrive\Documents\GitHub\assignment_2_AI\keyword_similarity_results.json
