In [3]:
# Install into the running kernel's environment (%pip targets the kernel's interpreter
# and does not depend on automagic). nltk is included because the next cell imports it.
%pip install pandas openpyxl nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
import json
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import pandas as pd

# Download the NLTK data files used below (skipped by NLTK when already up to date).
# 'punkt_tab' is the tokenizer data that word_tokenize loads in recent NLTK releases;
# 'punkt' is kept so the notebook keeps working on older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


# Stopword set and stemmer shared by every preprocess() call. They are filled in
# lazily so that defining the function never touches the NLTK data files.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return (stop_words, stemmer), building them on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess(text):
    """
    Turn raw text into a list of stemmed, lowercase, non-stopword tokens.

    Steps, in order: tokenize with NLTK's word_tokenize, keep only tokens that
    are purely alphanumeric, lowercase them, drop English stopwords, and apply
    the Porter stemmer.

    Args:
        text: The document text. Empty or missing text (e.g. "" or None) is
            accepted and yields an empty list.

    Returns:
        list[str]: The processed tokens in their original order. Tokens that
        contain any non-alphanumeric character (punctuation, hyphenated words,
        etc.) are dropped entirely rather than cleaned.
    """
    # Nothing to tokenize; also avoids handing None to word_tokenize.
    if not text:
        return []

    # Reuse the cached resources instead of re-reading the stopword list and
    # rebuilding the stemmer for every document.
    stop_words, stemmer = _preprocess_resources()

    tokens = []
    for token in word_tokenize(text):
        # isalnum() already guarantees there is nothing non-alphanumeric left
        # to strip, so no regex clean-up is needed on the surviving tokens.
        if not token.isalnum():
            continue

        # Stopwords are compared in lowercase.
        token = token.lower()
        if token in stop_words:
            continue

        tokens.append(stemmer.stem(token))

    return tokens

# Read the JSONL corpus and preprocess every record that has both '_id' and 'text'.
corpus = {}
try:
    # Read as UTF-8 explicitly: without an encoding, open() uses the platform
    # default (e.g. cp1252 on Windows), which can mis-decode or reject UTF-8 text.
    with open("corpus.jsonl", "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            # Blank lines (such as a trailing newline at the end of the file)
            # are not records and would otherwise fail to parse.
            if not line.strip():
                continue

            try:
                doc = json.loads(line)
            except json.JSONDecodeError as e:
                # Report the bad line and keep going, so that one malformed
                # record does not discard every document that follows it.
                print(f"Error decoding JSON on line {line_number}: {e}")
                continue

            # Only JSON objects carrying both keys can be preprocessed;
            # '_id' identifies the document and 'text' is its content.
            if isinstance(doc, dict) and '_id' in doc and 'text' in doc:
                corpus[doc['_id']] = preprocess(doc['text'])
            else:
                print(f"Skipping document due to missing keys: {doc}")
except FileNotFoundError:
    print("Error: The file 'corpus.jsonl' was not found.")

# Flatten the processed corpus into one record per document, with the tokens
# joined into a single space-separated string for the spreadsheet cell.
data = [
    {"Document ID": doc_id, "Tokens": " ".join(tokens)}
    for doc_id, tokens in corpus.items()
]

# Name the columns explicitly so the sheet still has headers when no document was loaded.
df = pd.DataFrame(data, columns=["Document ID", "Tokens"])

# Save to Excel
excel_filename = "preprocessed_corpus.xlsx"
df.to_excel(excel_filename, index=False)

print(f"Preprocessed data has been saved to {excel_filename}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prono\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prono\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed data has been saved to preprocessed_corpus.xlsx
