In [7]:
import json
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np

# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set so membership tests during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# 1️⃣ Load the JSON file
json_path = r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\assignment_2\extracted_data\author_texts_pdfminer.json"
with open(json_path, mode='r', encoding='utf-8') as source_file:
    # Read the whole file as text and parse it in one step.
    authors_texts_pdfminer = json.loads(source_file.read())

# 2️⃣ Preprocessing function
def preprocess_text(text):
    """Normalise raw text into a space-separated string of content words.

    Steps, in order: lowercase the text, collapse every run of whitespace
    into a single space, delete every character that is neither a lowercase
    ASCII letter nor whitespace, then drop every word found in the
    module-level ``stop_words`` set.

    Note that unwanted characters are deleted rather than replaced, so words
    joined only by punctuation are fused ("state-of-the-art" becomes
    "stateoftheart").

    Args:
        text: The raw text to clean.

    Returns:
        The remaining words joined by single spaces ('' if none remain).
    """
    lowered = text.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    letters_only = re.sub(r'[^a-z\s]', '', single_spaced)

    kept_tokens = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# 3️⃣ Extract top keywords from a single document
def extract_top_keywords(doc_text, top_n=50):
    """Return up to ``top_n`` terms of ``doc_text`` ranked by TF-IDF weight.

    The vectorizer is fitted on this single document only, so with the
    default settings the IDF factor is the same for every term and the
    ranking is effectively by term frequency within the document.

    Args:
        doc_text: The text to analyse (in this file, the output of
            ``preprocess_text``).
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` terms, highest weight first. An empty
        list is returned when the text is empty/blank or yields no usable
        vocabulary, instead of raising.
    """
    # Blank input has no vocabulary; fit_transform would raise ValueError and
    # abort the whole batch, so report "no keywords" for this document.
    if not doc_text or not doc_text.strip():
        return []

    vectorizer = TfidfVectorizer(max_features=5000)
    try:
        tfidf_matrix = vectorizer.fit_transform([doc_text])
    except ValueError:
        # Empty vocabulary: e.g. the text holds only one-letter tokens, which
        # the vectorizer's default token pattern ignores.
        return []

    feature_names = vectorizer.get_feature_names_out()
    # Row 0 is the only document; argsort is ascending, so reverse it to get
    # the highest-weighted terms first and keep the first top_n indices.
    scores = tfidf_matrix.toarray()[0]
    top_indices = np.argsort(scores)[::-1][:top_n]
    return [feature_names[i] for i in top_indices]

# 4️⃣ Process all authors and papers
# Maps each author to a list holding one keyword list per paper, in the
# same order as that author's papers in the loaded JSON.
authors_keywords = {}
for author, papers in authors_texts_pdfminer.items():
    # Clean each paper's text, then keep its 50 highest-ranked terms.
    authors_keywords[author] = [
        extract_top_keywords(preprocess_text(paper), top_n=50)
        for paper in papers
    ]

# 5️⃣ Save to JSON
output_path = r"C:\Users\BHUVANA VIJAYA\OneDrive\Documents\assignment_2\extracted_data\authors_keywords.json"
# Write the author -> per-paper keyword lists as indented JSON.
with open(output_path, mode='w', encoding='utf-8') as sink:
    json.dump(authors_keywords, fp=sink, indent=4)

# Report where the results were written.
print("✅ Keywords extraction complete. Saved to:", output_path)


[nltk_data] Downloading package stopwords to C:\Users\BHUVANA
[nltk_data]     VIJAYA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Keywords extraction complete. Saved to: C:\Users\BHUVANA VIJAYA\OneDrive\Documents\assignment_2\extracted_data\authors_keywords.json
