In [1]:
import json
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download NLTK data files (if not already downloaded).
# 'punkt_tab' is the tokenizer data that word_tokenize looks up in recent NLTK
# releases (it superseded the pickled 'punkt' models); 'punkt' is still fetched
# so older releases keep working. 'stopwords' backs stopwords.words('english').
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


# Shared, lazily-built resources for preprocess(). Building the stop-word set
# reads the NLTK corpus from disk, so it is done once rather than per document.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (stop-word set, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess(text):
    """
    Turn raw text into a list of normalized, stemmed tokens.

    Steps, in order: tokenize with ``word_tokenize``, keep only tokens that are
    entirely alphanumeric (punctuation and tokens such as "n't" are dropped),
    lowercase them, remove English stopwords, and apply the Porter stemmer.

    Args:
        text: The string to preprocess. Empty or ``None`` input is accepted.

    Returns:
        A list of stemmed tokens; an empty list when ``text`` is empty or None.
    """
    # Nothing to tokenize (e.g. a JSON null abstract): avoid failing in the tokenizer.
    if not text:
        return []

    stop_words, stemmer = _preprocess_resources()

    # isalnum() already guarantees there are no non-word characters left in a
    # token, so no extra regex stripping is needed before lowercasing.
    lowered = (token.lower() for token in word_tokenize(text) if token.isalnum())

    # Stopword removal happens before stemming, on the lowercased surface form.
    return [stemmer.stem(token) for token in lowered if token not in stop_words]

# Read and preprocess documents.
# Builds `corpus`, a dict mapping each record's 'doc_id' to the token list
# produced by preprocess() for its 'abstract'.
corpus = {}
try:
    # Be explicit about UTF-8 so the platform default codec (for example
    # cp1252 on Windows) is not used to decode non-ASCII abstracts.
    with open("corpus.jsonl", "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            # Blank lines carry no record; skip them instead of failing to parse.
            if not line.strip():
                continue

            # Parse the JSON line. A malformed line is reported and skipped so
            # that it does not abort loading of the remaining documents.
            try:
                doc = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {e}")
                continue

            # Extract document ID and abstract
            if 'doc_id' in doc and 'abstract' in doc:
                doc_id = doc['doc_id']
                abstract = doc['abstract']

                # Preprocess and store the result
                corpus[doc_id] = preprocess(abstract)
            else:
                print(f"Skipping document due to missing keys: {doc}")
except FileNotFoundError:
    print("Error: The file 'corpus.jsonl' was not found.")

# Debugging: Print a sample of the preprocessed corpus
for doc_id, tokens in list(corpus.items())[:5]:  # Display the first 5 entries
    print(f"Document ID: {doc_id}\nTokens: {tokens}\n")

Error: The file 'corpus.jsonl' was not found.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prono\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prono\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
